From 2e6528beb77f2d64e85fdf9248ce5c640ee7d1ae Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Tue, 30 Jun 2026 02:17:06 +0000
Subject: [PATCH 01/27] fix(zisk): drop --max-witness-stored flag, use Zisk's
 default cap

Lowering the witness cap below Zisk's built-in default (10) was measured to
have a negligible effect on host RAM and prove time for the kernel typecheck
workload, so the CLI override (which defaulted to 5) is removed and the prover
uses EmbeddedOpts::default(). Drop the now-stale flag mentions and per-RAM
tuning guidance from the README, the cost-model doc, and the shard.rs model
comments.
---
 README.md                     | 33 ++++++++-------------------------
 crates/kernel/src/shard.rs    | 11 +++++------
 docs/zisk-cycle-cost-model.md | 12 +++++-------
 zisk/host/src/main.rs         | 21 +++------------------
 4 files changed, 21 insertions(+), 56 deletions(-)

diff --git a/README.md b/README.md
index d17190b0..892b7162 100644
--- a/README.md
+++ b/README.md
@@ -487,31 +487,14 @@ Non-Nix users: install Zisk manually per the
    *Sharding large environments* below): each shard ships only its own closure
    sub-env, so the pieces fit the cap even when the whole env does not.
 
-   **Host RAM cap (`--max-witness-stored`).** Distinct from the in-guest
-   heap cap above, the prover side (Zisk's `proofman`) holds in-flight
-   witness traces in host RAM during `CALCULATING_CONTRIBUTIONS`. Peak
-   host RAM per shard ≈ `fixed-overhead + N × avg-witness-size`, where
-   `N` is the `max_witness_stored` setting. With the Blake3f precompile the
-   Ix kernel typecheck workload measures roughly `40 GB + N × 16 GB` on
-   typical 200–300 kB anon-byte shards — e.g. `N = 10` peaks near 200 GB
-   (a `--shard-bytes 250000 --max-witness-stored 10` mergesort run completes
-   under a 200 GiB guard without tripping it). An earlier pre-Blake3f figure
-   of ~25 GB per witness is stale; the precompile shrank the witness.
-
-   The `zisk-host` CLI defaults to `--max-witness-stored 5` (Zisk's
-   built-in default is 10, tuned for larger-memory boxes). Override per
-   machine:
-
-   | Host RAM | `--max-witness-stored` | Notes                                                  |
-   | -------- | ---------------------- | ------------------------------------------------------ |
-   | ≤ 128 GB | `3`                    | Override down; consider smaller shards too             |
-   | 256 GB   | `5` (project default)  | Comfortable margin on the typical setup                |
-   | 512 GB   | `10` (Zisk default)    | Override up for maximum prover parallelism             |
-   | ≥ 1 TB   | `10` (Zisk default)    | Override up; default is conservative for this workload |
-
-   Lowering the cap roughly linearly bounds peak RAM but throttles
-   prover parallelism (~10–30 % slower in practice). Raise it if your
-   machine has more RAM headroom; lower it if you OOM during
+   **Host RAM during proving.** Distinct from the in-guest heap cap above,
+   the prover side (Zisk's `proofman`) holds in-flight witness traces in host
+   RAM during `CALCULATING_CONTRIBUTIONS`. The number of resident witnesses
+   (Zisk's `max_witness_stored`) is left at Zisk's built-in default of 10:
+   we measured that lowering it does not materially reduce peak host RAM or
+   prove time for the Ix kernel typecheck workload, so it is not exposed as a
+   knob. Peak host RAM per shard is instead governed by shard size — prove
+   smaller shards (`--shard-bytes`) if you OOM during
    `CALCULATING_CONTRIBUTIONS`. Not relevant for `--execute` or
    `--verify-constraints` modes.
 
diff --git a/crates/kernel/src/shard.rs b/crates/kernel/src/shard.rs
index 8e312f6c..e594ff09 100644
--- a/crates/kernel/src/shard.rs
+++ b/crates/kernel/src/shard.rs
@@ -1653,8 +1653,8 @@ pub fn block_step_cost(b: &BlockEntry) -> u64 {
 /// size shards straight from `MemTotal` without ever picking a budget. Inverts
 /// the measured single-leaf prover model on this setup
 /// (`peak_RAM_GiB ≈ 50 + 33 × steps_billions`, measured by a guarded 7-shard GPU
-/// prove sweep over 0.27–3.79e9-step Init shards, R²=0.99, `--max-witness-stored
-/// 5`) at [`RAM_USABLE_FRAC`] of RAM (reserving the rest for the OS, cross-shard
+/// prove sweep over 0.27–3.79e9-step Init shards, R²=0.99) at
+/// [`RAM_USABLE_FRAC`] of RAM (reserving the rest for the OS, cross-shard
 /// re-ingress, and run-to-run variance). Returns 0 when the box can't even hold the ~50 GiB
 /// prover base (nothing will prove). Approximate by design — pair with
 /// [`partition_for_cycle_cap`] to get N. The earlier `45 + 32` model was
@@ -1662,8 +1662,7 @@ pub fn block_step_cost(b: &BlockEntry) -> u64 {
 /// target actually used ~225 GB).
 /// Measured prover-RAM model (the single source of truth, used by both
 /// [`cycle_cap_for_ram`] and [`ram_gib_for_steps`]): peak host RAM ≈
-/// `RAM_BASE_GIB + RAM_GIB_PER_BCYCLE × steps_billions` (at
-/// `--max-witness-stored 5`).
+/// `RAM_BASE_GIB + RAM_GIB_PER_BCYCLE × steps_billions`.
 pub const RAM_BASE_GIB: f64 = 50.0;
 pub const RAM_GIB_PER_BCYCLE: f64 = 33.0;
 /// Usable fraction of a host-RAM budget (headroom for OS + variance) — applied
@@ -1685,8 +1684,8 @@ pub fn cycle_cap_for_ram(ram_gb: f64) -> u64 {
 }
 
 /// Measured single-GPU **leaf prove time**: `≈ PROVE_SETUP_SECS +
-/// PROVE_SECS_PER_BCYCLE × steps_billions` per shard (RTX PRO 6000,
-/// `--max-witness-stored 5`). Aggregation adds a smaller per-fold term this model
+/// PROVE_SECS_PER_BCYCLE × steps_billions` per shard (RTX PRO 6000).
+/// Aggregation adds a smaller per-fold term this model
 /// omits — minutes next to hours of leaf proving at large shard counts.
 pub const PROVE_SETUP_SECS: f64 = 54.0;
 pub const PROVE_SECS_PER_BCYCLE: f64 = 158.0;
diff --git a/docs/zisk-cycle-cost-model.md b/docs/zisk-cycle-cost-model.md
index 04e7a192..86e18822 100644
--- a/docs/zisk-cycle-cost-model.md
+++ b/docs/zisk-cycle-cost-model.md
@@ -109,13 +109,12 @@ flags as the Aiur `bench-typecheck`.
 
 ## Prover RAM and prove time
 
-Measured proving 7 Init shards on GPU at `--max-witness-stored 5`, STEPS
-0.27–3.79e9:
+Measured proving 7 Init shards on GPU, STEPS 0.27–3.79e9:
 ```
 peak host RAM (GiB) ≈ 50 + 33·STEPS_billions     R² 0.99
 GPU prove time (s)  ≈ 54 + 158·STEPS_billions     R² 0.98   (~6.3M steps/s)
 ```
-RAM scales with `--max-witness-stored N` (this is N=5). Inverting the RAM model
+Inverting the RAM model
 at `RAM_USABLE_FRAC` = 0.85 gives a safe per-shard cap of **~3.6e9 steps** for a
 200 GiB target (`shard.rs:cycle_cap_for_ram`).
 
@@ -171,7 +170,7 @@ side.
 
 Applying the cost model to every Init constant's native profile (51,003 blocks /
 51,678 constants) vs a single-leaf cap: **~99.98% of Init typechecks on Zisk.**
-Exactly **12 constants** exceed a single 250 GiB leaf (`--max-witness-stored 5`)
+Exactly **12 constants** exceed a single 250 GiB leaf
 — all single atomic constants (un-shardable), `hb`-dominated. Listed by name
 (all are `_private` proof terms; private prefix elided):
 
@@ -190,7 +189,7 @@ Exactly **12 constants** exceed a single 250 GiB leaf (`--max-witness-stored 5`)
 | `Array.extract_append_extract._proof_1_1` | 13,226 | 4.7e9 |
 | `Char.ofOrdinal_ordinal` | 14,136 | 4.5e9 |
 
-Each has a workaround — lower `--max-witness-stored`, a bigger box, `--skip-deps`
+Each has a workaround — a bigger box, `--skip-deps`
 (subject-only), or upstream proof restructuring — so 12 is an upper bound on
 truly-stuck constants. At a 200 GiB cap the list grows to 16. Estimate uses the
 planner's `block_step_cost` model (`162,339·hb + 4,276·subst + 652·bytes`, the
@@ -311,8 +310,7 @@ native profiling run and compile out on the zkvm target.
   5–8%); the shard model is still n=12 and Init-derived, so the ~190M shard
   intercept is the term most likely to
   shift on Std/Mathlib — worth re-checking there.
-- RAM/prove-time models are for one GPU at `--max-witness-stored 5`; both scale
-  with that setting.
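+- RAM/prove-time models are for one GPU; they were fit at a witness cap of 5, which was measured to differ negligibly from Zisk's default of 10.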
 - Cycle counts are deterministic for a fixed guest ELF; the profiler counters
   compile out on the zkvm target, so the proven ELF is unaffected.
 - **Regimes have distinct coefficients and don't transfer.**
diff --git a/zisk/host/src/main.rs b/zisk/host/src/main.rs
index 1eb35094..6bc6b11f 100644
--- a/zisk/host/src/main.rs
+++ b/zisk/host/src/main.rs
@@ -208,18 +208,6 @@ struct Args {
   /// `bench-typecheck --skip-deps`.
   #[arg(long, requires = "constant")]
   skip_deps: bool,
-
-  /// Cap on resident witness traces during the prove phase, bounding
-  /// peak host RAM per shard. Zisk's prover queues witnesses up to this
-  /// count before committing them; peak RAM ≈ N × avg-witness-size +
-  /// fixed overheads. Zisk's built-in default is 10 (tuned for
-  /// large-memory boxes); we default to 5 here as a safer fit for
-  /// ~256 GB machines. Override up to 10 on bigger boxes for maximum
-  /// parallelism, or down to 3 on smaller ones. See the Zisk section
-  /// of the top-level README for a per-RAM recommendation table. Has
-  /// no effect on `--execute` / `--verify-constraints` modes.
-  #[arg(long, default_value_t = 5)]
-  max_witness_stored: usize,
 }
 
 /// 112-byte public output of one shard-guest proof.
@@ -764,7 +752,6 @@ fn check_input_coherence(
 fn build_client(
   gpu: bool,
   asm: bool,
-  max_witness_stored: Option<usize>,
 ) -> Result<EmbeddedClient> {
   // Executor choice. The default is the Assembly executor (`asm = true`,
   // i.e. no `--emulator`): it is markedly faster at trace generation and is
@@ -785,10 +772,8 @@ fn build_client(
   // docs ("Reduce memory footprint during proving at the cost of
   // speed"). We have ~94 GB of free GPU memory, so the speed
   // trade-off is the wrong direction for this hardware.
-  let mut opts = EmbeddedOpts::default();
-  if let Some(n) = max_witness_stored {
-    opts = opts.max_witness_stored(n);
-  }
+  // Zisk's default embedded opts (witness cap 10).
+  let opts = EmbeddedOpts::default();
   let mut builder: EmbeddedClientBuilder =
     ProverClient::embedded().with_embedded_opts(opts);
   if asm {
@@ -1700,7 +1685,7 @@ async fn main() -> Result<()> {
   let total_leaves: usize = plans.iter().map(|p| p.shards.len()).sum();
 
   let client =
-    build_client(args.gpu, !args.emulator, Some(args.max_witness_stored))?;
+    build_client(args.gpu, !args.emulator)?;
   client.setup(&SHARD_PROGRAM).run()?.await?;
   // Skip agg-guest setup unless we'll produce more than one leaf proof.
   // The shard-plan path sets up the agg program itself, after its leaves.

From 8c3909a830736fbb88a191c4dbfe18df3cacf42b Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Tue, 30 Jun 2026 18:55:04 +0000
Subject: [PATCH 02/27] feat(ci): CSV-driven !benchmark + Zisk/SP1/native
 benchmarks

Rebased onto main (post #411 + #459) and integrated with the renamed
bench-main.yml and reworked bencher-track interface.

- Benchmarks/Vectors.csv: single shared source of truth (71 library constants
  from Init/Std/Mathlib/Lean). Consumed by Aiur (bench-typecheck --manifest),
  the zkVM hosts, and shell.
- bench-pr.yml: `!benchmark [aiur] [zisk] [sp1|all] [execute|prove]` over the
  curated set, posting a main-vs-PR table; main results cached by base SHA.
  Hardened: comment body from env (no injection), allowlisted env parsing.
- .github/scripts/{bench.py,run.sh}: parse/manifest/compare/comment, and the
  compile-.ixe + backend driver (cycles/execute-time/throughput/peak-rss, plus
  shards/max-shard-cycles for sharded runs).
- .github/actions/install-{sp1,zisk}: shared zkVM toolchain + deps install,
  used by bench-pr.yml, bench-main.yml, and riscv-bench.yml.
- bench-main.yml: add zkvm-execute (Zisk/SP1 cycle counts + time/RAM) and
  native-check (native parallel `ix check --anon` throughput) jobs, using the
  new bencher-track workload/thresholds interface.
- bench-typecheck: add --constant / --skip-deps (align with the zkVM hosts;
  --skip-deps replaces --subject-only) and --execute-only (fast execute path).
---
 .github/actions/install-sp1/action.yml  |  31 +++
 .github/actions/install-zisk/action.yml |  66 +++++
 .github/scripts/bench.py                | 270 +++++++++++++++++++++
 .github/scripts/run.sh                  | 135 +++++++++++
 .github/workflows/bench-main.yml        | 183 +++++++++++++-
 .github/workflows/bench-pr.yml          | 304 +++++++++++++++---------
 .github/workflows/riscv-bench.yml       |  65 +----
 Benchmarks/Typecheck.lean               |  57 +++--
 Benchmarks/Vectors.csv                  |  93 ++++++++
 9 files changed, 1009 insertions(+), 195 deletions(-)
 create mode 100644 .github/actions/install-sp1/action.yml
 create mode 100644 .github/actions/install-zisk/action.yml
 create mode 100644 .github/scripts/bench.py
 create mode 100644 .github/scripts/run.sh
 create mode 100644 Benchmarks/Vectors.csv

diff --git a/.github/actions/install-sp1/action.yml b/.github/actions/install-sp1/action.yml
new file mode 100644
index 00000000..71aa2037
--- /dev/null
+++ b/.github/actions/install-sp1/action.yml
@@ -0,0 +1,31 @@
+name: Install SP1
+description: >-
+  Install the system build deps and the SP1 zkVM toolchain (sp1up) needed to
+  build and run the SP1 host. Assumes a Rust toolchain is already set up.
+
+runs:
+  using: composite
+  steps:
+    # The shared zkVM apt superset (the ZisK book's full Ubuntu list — the
+    # prebuilt cargo tooling and proofman's C++ link OpenMPI/OpenMP/GMP/
+    # nlohmann-json/nasm/secp256k1/… — plus pkg-config + libssl-dev for SP1's
+    # host crates). The Nix shells provided this; a bare runner doesn't.
+    - name: Install system build deps
+      shell: bash
+      run: |
+        # Some warpbuild images ship an unreachable azure mirror that hangs
+        # `apt-get update`; drop it first (no-op elsewhere).
+        sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true
+        sudo apt-get update
+        sudo apt-get install -y \
+          xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \
+          nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \
+          libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \
+          openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \
+          pkg-config libssl-dev time
+    - name: Install SP1 toolchain (sp1up, latest)
+      shell: bash
+      run: |
+        curl -fSL https://sp1up.succinct.xyz | bash  # -f: an HTTP error fails curl instead of feeding an error page to bash
+        ~/.sp1/bin/sp1up
+        echo "$HOME/.sp1/bin" >> "$GITHUB_PATH"
diff --git a/.github/actions/install-zisk/action.yml b/.github/actions/install-zisk/action.yml
new file mode 100644
index 00000000..ff883957
--- /dev/null
+++ b/.github/actions/install-zisk/action.yml
@@ -0,0 +1,66 @@
+name: Install Zisk
+description: >-
+  Install the system build deps and the ZisK zkVM toolchain (ziskup, CPU build,
+  no proving keys) needed to build and run the Zisk host. Assumes a Rust
+  toolchain is already set up.
+
+runs:
+  using: composite
+  steps:
+    # The shared zkVM apt superset — the ZisK book's full Ubuntu list (prebuilt
+    # cargo-zisk + proofman's C++ link OpenMPI/OpenMP/GMP/nlohmann-json/nasm/
+    # secp256k1/…), kept identical to install-sp1 so a host that links both is
+    # covered. The Nix shells provided this; a bare runner doesn't.
+    - name: Install system build deps
+      shell: bash
+      run: |
+        # Some warpbuild images ship an unreachable azure mirror that hangs
+        # `apt-get update`; drop it first (no-op elsewhere).
+        sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true
+        sudo apt-get update
+        sudo apt-get install -y \
+          xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \
+          nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \
+          libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \
+          openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \
+          pkg-config libssl-dev time
+    # `--version 0.18.0` pins the toolchain to match our deps. Our host links the
+    # argumentcomputer/zisk `blake3-precompile` fork, which is based on v0.18.0
+    # (its cargo-zisk has `check-setup`, used below to regenerate the key's
+    # const-trees). Without the pin, ziskup installs `releases/latest`, which
+    # resolves to upstream `v1.0.0-alpha` — a different circuit whose cargo-zisk
+    # dropped the `check-setup` subcommand, breaking the key step. `--cpu` picks
+    # the CPU build (no GPU on the runner) and `--nokey` skips ziskup's key
+    # install — both avoid its interactive /dev/tty prompts. We keep `--nokey`
+    # because the upstream `zisk-setup` bucket only carries the upstream circuit's
+    # key; our fork has a different circuit (extra Blake3f AIR), so we restore the
+    # fork-matching key from our own S3 in the next step. `--prefix $HOME/.zisk`
+    # pins the install where cargo-zisk's ZiskPaths fallback looks (the runner
+    # sets XDG_CONFIG_HOME, which would otherwise relocate it).
+    - name: Install Zisk toolchain (ziskup, pinned v0.18.0)
+      shell: bash  # curl -f below: an HTTP error fails curl instead of feeding an error page to bash
+      run: |
+        curl -fSL https://raw.githubusercontent.com/0xPolygonHermez/zisk/main/ziskup/install.sh \
+          | bash -s -- --cpu --nokey -y --version 0.18.0 --prefix "$HOME/.zisk"
+        echo "$HOME/.zisk/bin" >> "$GITHUB_PATH"
+    # Execute still needs a proving key present: zisk-host calls `client.setup()`
+    # (which the SDK runs before the execute branch), and that loads the circuit's
+    # const-tree files. We host the fork-matching key in a public S3 bucket
+    # WITHOUT the const-trees — exactly like Zisk's released
+    # `zisk-provingkey-*.tar.gz` on `storage.googleapis.com/zisk-setup` — and
+    # regenerate them here with `cargo-zisk check-setup -a`, which is how `ziskup`
+    # itself populates them. That keeps the artifact ~3 GB (gzip) instead of
+    # ~48 GB. The object name carries the fork rev so a circuit change can't
+    # silently reuse a stale key. Public bucket → plain curl, no AWS creds.
+    - name: Restore Zisk proving key (fork circuit) from S3
+      shell: bash
+      run: |
+        mkdir -p "$HOME/.zisk"
+        curl -fSL --retry 3 \
+          https://argument-zisk-setup.s3.amazonaws.com/zisk-provingkey-blake3-8f9e24d5-cpu.tar.gz \
+          -o /tmp/zisk-provingkey.tar.gz
+        tar -C "$HOME/.zisk" -xzf /tmp/zisk-provingkey.tar.gz
+        rm -f /tmp/zisk-provingkey.tar.gz
+        # Regenerate the const-tree files omitted from the artifact (CPU build, so
+        # no --gpu). This is the "may take a while" step ziskup prints.
+        cargo-zisk check-setup --proving-key "$HOME/.zisk/provingKey" -a
diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
new file mode 100644
index 00000000..c27582c9
--- /dev/null
+++ b/.github/scripts/bench.py
@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+"""All data-wrangling for the `!benchmark` PR workflow, as subcommands:
+
+  parse     COMMENT_BODY env → matrix + config (writes $GITHUB_OUTPUT)
+  manifest  Benchmarks/Vectors.csv → the constant names for one cell
+  compare   main.json + pr.json   → a Markdown main-vs-PR table
+  comment   per-cell table files  → the final PR comment body
+
+The neutral results JSON every backend normalises to (see run.sh) is
+`{ "<name>": { "<metric>": <number>, ... }, ... }`. All metrics are
+lower-is-better, so a positive Δ% is a regression.
+"""
+import argparse
+import glob
+import hashlib
+import json
+import os
+
+
+# ───────────────────────── parse ─────────────────────────
+BACKENDS = ("aiur", "zisk", "sp1")
+MODES = ("execute", "prove")
+ENVS = ("initStd", "lean", "mathlib")
+CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_GPU"}
+PASSTHROUGH_KEYS = {"RUST_LOG", "WITHOUT_VK_VERIFICATION", "RUSTFLAGS"}
+
+
+def runner_for(backend, mode, gpu):
+    """Return `(runs-on label, skip)` for a cell; `skip` is True only for non-aiur prove without `gpu`."""
+    if backend == "aiur":
+        return "warp-ubuntu-latest-x64-32x", False
+    if mode == "execute":
+        return "warp-ubuntu-latest-x64-16x", False
+    if gpu:                       # zkVM prove needs a GPU
+        return "self-hosted-gpu", False
+    return "ubuntu-latest", True
+
+
+def cmd_parse(_a):
+    """Parse the `!benchmark` comment in $COMMENT_BODY; append matrix + config to $GITHUB_OUTPUT."""
+    body = os.environ.get("COMMENT_BODY", "")
+    lines = [ln.replace("\r", "") for ln in body.splitlines()]
+    cmd = next((ln for ln in lines if "!benchmark" in ln), "")
+    toks = cmd.split("!benchmark", 1)[1].split() if "!benchmark" in cmd else []
+
+    backends, mode = [], "execute"
+    for t in (t.lower() for t in toks):
+        if t == "all":
+            backends = list(BACKENDS)
+        elif t in BACKENDS and t not in backends:
+            backends.append(t)
+        elif t in MODES:
+            mode = t
+    if not backends:
+        backends = ["aiur"]
+
+    cfg, passthrough = {}, []
+    for ln in lines[(lines.index(cmd) + 1) if cmd else 0:]:  # no command line ⇒ scan every line
+        s = ln.strip()
+        if not s or "=" not in s:
+            continue
+        key, val = (x.strip() for x in s.split("=", 1))
+        if key in CONFIG_KEYS:
+            cfg[key] = val
+        elif key in PASSTHROUGH_KEYS:
+            passthrough.append(f"{key}={val}")
+
+    envs = [e.strip() for e in cfg.get("BENCH_ENVS", "initStd").split(",") if e.strip()]
+    envs = [e for e in envs if e in ENVS] or ["initStd"]
+    tier = cfg.get("BENCH_TIER", "")
+    if tier not in ("cheap", "heavy", "all"):
+        tier = ""             # empty ⇒ derived from mode at manifest time
+    shard = "1" if cfg.get("BENCH_SHARD") == "1" else "0"
+    gpu = cfg.get("BENCH_GPU") == "1"
+
+    cells = []
+    for b in backends:
+        for e in envs:
+            runner, skip = runner_for(b, mode, gpu)
+            cells.append({"backend": b, "env": e, "mode": mode, "runner": runner,
+                          "skip": "true" if skip else "false", "label": f"{b}-{e}-{mode}"})
+
+    summary = (f"backends: `{' '.join(backends)}` · mode: `{mode}` · "
+               f"envs: `{','.join(envs)}` · tier: `{tier or 'auto'}` · "
+               f"shard: `{shard}` · gpu: `{int(gpu)}`")
+    if passthrough:
+        summary += " · env: `" + " ".join(passthrough) + "`"
+
+    with open(os.environ.get("GITHUB_OUTPUT", "/dev/stdout"), "a") as f:
+        f.write(f"matrix={json.dumps(cells)}\n")
+        f.write(f"mode={mode}\ntier={tier}\nshard={shard}\n")
+        f.write(f"config-summary={summary}\n")
+        f.write("passthrough-env<<PTENV\n" + "".join(p + "\n" for p in passthrough) + "PTENV\n")
+    print(summary)
+    print(json.dumps(cells, indent=2))
+
+
+# ──────────────────────── manifest ────────────────────────
+def cmd_manifest(a):  # write the CSV names selected by --env/--tier/--shard to --out, one per line
+    tier = a.tier or ("cheap" if a.mode == "prove" else "all")  # no explicit tier ⇒ derive it from the mode
+    names = []
+    with open(a.csv) as f:
+        for line in f:
+            row = line.rstrip("\n")
+            if not row or row.startswith("#"):
+                continue
+            cols = row.split(",")
+            if cols[0] == "name" or len(cols) < 4:  # presumably the header row, or a row too short to use
+                continue
+            name, env, ctier, shard = cols[:4]
+            if env != a.env:
+                continue
+            if tier in ("cheap", "heavy") and ctier != tier:  # any other tier value applies no tier filter
+                continue
+            if a.shard == "1" and shard != "1":
+                continue
+            names.append(name)
+    with open(a.out, "w") as f:
+        f.write("\n".join(names) + ("\n" if names else ""))
+    vhash = hashlib.sha256(open(a.csv, "rb").read()).hexdigest()[:16]  # short digest of the whole CSV file
+    print(f"count={len(names)}\nvhash={vhash}\ntier={tier}")
+
+
+# ───────────────────────── compare ─────────────────────────
+METRICS = {
+    ("aiur", "execute"): ["fft-cost", "execute-time"],
+    ("aiur", "prove"): ["prove-time", "peak-rss"],
+    ("zisk", "execute"): ["cycles", "execute-time", "throughput", "peak-rss"],
+    ("sp1", "execute"): ["cycles", "execute-time", "throughput", "peak-rss"],
+    ("zisk", "prove"): ["prove-time", "steps"], ("sp1", "prove"): ["prove-time", "steps"],
+}
+
+
+def _num(d, name, metric):  # d[name][metric] if it is an int/float, else None
+    v = d.get(name, {}).get(metric)
+    return v if isinstance(v, (int, float)) else None
+
+
+def _human(v):  # table text: "n/a" for None, comma-grouped integer, else comma-grouped 3-decimal float
+    if v is None:
+        return "n/a"
+    if isinstance(v, int) or (isinstance(v, float) and v.is_integer()):
+        return f"{int(v):,}"
+    return f"{v:,.3f}"
+
+
+def _delta(main, pr):  # percent change from main to pr; None if either is None or main is 0
+    if main is None or pr is None or main == 0:
+        return None
+    return (pr - main) / main * 100.0
+
+
+def _load(path):  # read a results JSON object; missing file, invalid JSON, or non-object ⇒ {}
+    try:
+        with open(path) as f:
+            d = json.load(f)
+        return d if isinstance(d, dict) else {}
+    except (FileNotFoundError, json.JSONDecodeError):
+        return {}
+
+
+def cmd_compare(a):  # render the main-vs-PR Markdown table to --out, or to stdout when --out is unset
+    metrics = a.metric or METRICS.get((a.backend, a.mode))
+    if not metrics:
+        raise SystemExit("compare: pass --metric or a known --backend/--mode")
+    title = a.title
+    if title is None and a.backend:
+        hit = "hit (cached)" if a.cache_hit == "true" else "miss (ran main)"
+        cnt = f"{a.count} constants · " if a.count else ""
+        title = f"### `{a.backend}` · `{a.env}` · `{a.mode}` — {cnt}main cache: {hit}"
+
+    def emit(text):
+        if a.out:
+            open(a.out, "w").write(text + "\n")
+        else:
+            print(text)
+
+    main_d, pr_d = _load(a.main), _load(a.pr)
+    names = sorted(set(main_d) | set(pr_d))
+    if not names:
+        emit((title or "") + "\n\n_No results were produced (every constant failed, "
+             "timed out, or was dropped). See the workflow logs._")
+        return
+
+    primary = metrics[0]  # rows sort by descending primary value (PR's, else main's); rows with neither go last
+    names.sort(key=lambda n: (0, -v) if (v := (_num(pr_d, n, primary)
+               if _num(pr_d, n, primary) is not None else _num(main_d, n, primary))) is not None
+               else (1, 0))
+
+    head = ["constant"]
+    for m in metrics:
+        head += [f"{m} (main)", f"{m} (PR)", "Δ%"]
+    rows = ["| " + " | ".join(head) + " |", "|" + "|".join(["---"] * len(head)) + "|"]
+
+    n_reg = n_imp = 0
+    worst = None
+    for n in names:
+        cells = [f"`{n}`"]
+        for i, m in enumerate(metrics):
+            mv, pv = _num(main_d, n, m), _num(pr_d, n, m)
+            dp = _delta(mv, pv)
+            cell = "n/a" if dp is None else f"{dp:+.1f}%"
+            if i == 0 and dp is not None:  # only the primary metric is flagged and counted
+                if dp > a.threshold:
+                    cell += " ⚠️"; n_reg += 1
+                elif dp < -a.threshold:
+                    cell += " 🟢"; n_imp += 1
+                if worst is None or dp > worst[0]:
+                    worst = (dp, n)
+            cells += [_human(mv), _human(pv), cell]
+        rows.append("| " + " | ".join(cells) + " |")
+
+    out = ([title, ""] if title else []) + rows + [""]
+    s = (f"_{len(names)} constants · {n_reg} regressed · {n_imp} improved "
+         f"(|Δ| > {a.threshold:g}% on `{primary}`)._")
+    if worst and worst[0] is not None and worst[0] > a.threshold:
+        s += f" Worst: `{worst[1]}` {worst[0]:+.1f}%."
+    out.append(s)
+    emit("\n".join(out))
+
+
+# ───────────────────────── comment ─────────────────────────
+def cmd_comment(a):  # join the table-*.md files under --tables into the PR comment body, write --out, echo it
+    commit = f"[`{a.head[:7]}`]({a.repo_url}/commit/{a.head})"
+    parts = [f"## `!benchmark` — main vs {commit}", "", a.summary, ""]
+    tables = sorted(glob.glob(os.path.join(a.tables, "table-*.md")))
+    if tables:
+        for t in tables:
+            parts += [open(t).read().rstrip(), ""]
+    else:
+        parts += ["_No result tables were produced — see the workflow logs._", ""]
+    parts.append(f"[Workflow logs]({a.repo_url}/actions/runs/{a.run_id})")
+    open(a.out, "w").write("\n".join(parts) + "\n")
+    print(open(a.out).read())
+
+
+# ────────────────────────── cli ──────────────────────────
+def main():  # CLI entry point: one subparser per subcommand, dispatched through its `fn` default
+    ap = argparse.ArgumentParser(description=__doc__)
+    sub = ap.add_subparsers(dest="cmd", required=True)
+
+    sub.add_parser("parse").set_defaults(fn=cmd_parse)
+
+    m = sub.add_parser("manifest")
+    m.add_argument("--csv", required=True); m.add_argument("--env", required=True)
+    m.add_argument("--mode", required=True); m.add_argument("--tier", default="")
+    m.add_argument("--shard", default="0"); m.add_argument("--out", required=True)
+    m.set_defaults(fn=cmd_manifest)
+
+    c = sub.add_parser("compare")
+    c.add_argument("--main", required=True); c.add_argument("--pr", required=True)
+    c.add_argument("--metric", action="append", default=[])
+    c.add_argument("--threshold", type=float, default=3.0)
+    c.add_argument("--title"); c.add_argument("--backend"); c.add_argument("--env")
+    c.add_argument("--mode"); c.add_argument("--count"); c.add_argument("--cache-hit", default="")
+    c.add_argument("--out")
+    c.set_defaults(fn=cmd_compare)
+
+    cm = sub.add_parser("comment")
+    cm.add_argument("--tables", required=True); cm.add_argument("--summary", default="")
+    cm.add_argument("--head", required=True); cm.add_argument("--repo-url", required=True)
+    cm.add_argument("--run-id", required=True); cm.add_argument("--out", required=True)
+    cm.set_defaults(fn=cmd_comment)
+
+    a = ap.parse_args()
+    a.fn(a)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
new file mode 100644
index 00000000..045a1f4f
--- /dev/null
+++ b/.github/scripts/run.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+# Compile one library env to a `.ixe` from a checked-out repo, then benchmark the
+# given backend over a manifest, emitting the neutral results JSON
+#   { "<name>": { "<metric>": <number>, ... }, ... }
+# that bench.py compare consumes.
+#
+#   run.sh <repo_dir> <env> <backend> <mode> <names_file> <out_json>
+#     repo_dir : checked-out worktree (has .lake/build/bin/{ix,bench-typecheck})
+#     env      : initStd | lean | mathlib
+#     backend  : aiur | zisk | sp1
+#     mode     : execute | prove
+#
+# `ix` / `bench-typecheck` are taken from <repo_dir> (so base measures base's
+# code, PR the PR's — the caller puts <repo_dir>/.lake/build/bin on PATH). The
+# zkVM hosts run from <repo_dir>/<backend>. Per-constant subprocesses for the
+# zkVM hosts so one OOM/timeout drops only that row.
+set -uo pipefail
+
+repo=${1:?repo_dir}; benv=${2:?env}; backend=${3:?backend}; mode=${4:?mode}
+names=${5:?names}; out=${6:?out}
+# Absolute repo path: the zkVM branch cd's into the host workspace, so the .ixe
+# path passed to the host must not be relative to the original cwd.
+repo=$(cd "$repo" && pwd)
+: > "$out"
+emit_empty() { [ -s "$out" ] || echo '{}' > "$out"; }
+
+case "$benv" in   # capitalised spellings are what bench-main.yml's matrix passes
+  initStd|InitStd) module=CompileInitStd ;;
+  lean|Lean)       module=CompileLean ;;
+  mathlib|Mathlib) module=CompileMathlib ;;
+  *) echo "unknown env: $benv" >&2; exit 2 ;;
+esac
+
+ixe="$repo/$benv.ixe"
+if [ "${REUSE_IXE:-0}" = 1 ] && [ -f "$ixe" ]; then
+  echo "reusing existing $ixe (REUSE_IXE)"
+else
+  echo "::group::ix compile $module → $benv.ixe ($backend/$mode)"
+  "$repo/.lake/build/bin/ix" compile "$repo/Benchmarks/Compile/$module.lean" --out "$ixe"
+  echo "::endgroup::"
+fi
+
+case "$backend" in
+  aiur)
+    # bench-typecheck runs Phase 1 (execute) always; Phase 2 (prove) unless
+    # --execute-only. It writes the neutral JSON itself via --json.
+    args=(--ixe "$ixe" --manifest "$names" --json "$out")
+    if [ "$mode" = execute ]; then
+      bench-typecheck "${args[@]}" --execute-only || true
+    else
+      bench-typecheck "${args[@]}" --texray 2> tx.log || true
+      # Fold texray's proving RSS high-water mark into every entry (max over
+      # spans; $2+0 stops at the first non-digit) — same parse as aiur-bench.yml.
+      rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>m {m=$2+0} END {if (m>0) print m}' tx.log)
+      if [ -n "${rss:-}" ] && [ -s "$out" ]; then
+        jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' "$out" > "$out.t" \
+          && mv "$out.t" "$out" || true
+      fi
+    fi
+    emit_empty
+    ;;
+
+  zisk|sp1)
+    host="${backend}-host"; work="$repo/$backend"
+    # Build the host once so per-constant timing excludes compilation, and run the
+    # binary directly under `/usr/bin/time` — a `timeout`/`cargo run` wrapper would
+    # report the wrapper's RSS, not the host's. (No per-constant timeout in execute
+    # mode; the job-level timeout bounds a hang.)
+    echo "::group::cargo build $host"
+    ( cd "$work" && cargo build --quiet --release --bin "$host" )
+    echo "::endgroup::"
+    bin="$work/target/release/$host"
+    # ZisK's ASM microservices mmap the ROM with MAP_LOCKED, needing unlimited
+    # locked memory (the Zisk book's DefaultLimitMEMLOCK=infinity). The warp
+    # runner caps the memlock hard limit and can't be rebooted, so raise it
+    # in-session as root; the host children inherit it. Without this the ASM
+    # services die with `mmap(rom) errno=11`. SP1 needs no such raise.
+    [ "$backend" = zisk ] && sudo prlimit --pid $$ --memlock=unlimited:unlimited
+    tmp=$(mktemp -d)
+    while IFS= read -r c; do  # this loop's stdout is jq's input: diagnostics must go to stderr
+      [ -z "$c" ] && continue
+      slug=$(printf '%s' "$c" | tr '/ .:' '____')
+      log="$tmp/$slug.out"; tmf="$tmp/$slug.time"
+      if [ "$mode" = execute ]; then
+        # `time -f '%e %M' -o` → elapsed s + max RSS (kB); 2>&1 shares one fd so the log isn't clobbered.
+        ( cd "$work" && /usr/bin/time -f '%e %M' -o "$tmf" \
+            "$bin" --execute --ixe "$ixe" --constant "$c" --skip-deps ) > "$log" 2>&1 \
+          || { echo "::warning::$backend execute '$c' failed; dropping" >&2; continue; }
+        fail=$(grep -oE 'failures[:=] ?[0-9]+' "$log" | head -1 | grep -oE '[0-9]+')
+        if [ "${fail:-1}" != 0 ]; then
+          echo "::warning::$backend '$c': nonzero/missing failures; dropping" >&2; continue
+        fi
+        # Total cycles: sharded prints "total cycles: N", single prints "cycles: N".
+        cyc=$(grep -oE 'total cycles: [0-9]+' "$log" | tail -1 | grep -oE '[0-9]+')
+        [ -z "$cyc" ] && cyc=$(grep -oE 'cycles: [0-9]+' "$log" | tail -1 | grep -oE '[0-9]+')
+        [ -z "$cyc" ] && { echo "::warning::$backend '$c': no cycle count; dropping" >&2; continue; }
+        secs=$(awk 'NR==1{print $1}' "$tmf"); rssk=$(awk 'NR==1{print $2}' "$tmf")
+        rss=$(( ${rssk:-0} * 1024 ))
+        tput=$(awk -v c="$cyc" -v s="${secs:-0}" 'BEGIN{ if (s>0) printf "%.0f", c/s; else print 0 }')
+        # Per-shard "cycles=<n>" lines appear only for sharded runs.
+        mapfile -t sh < <(grep -oE 'cycles=[0-9]+' "$log" | grep -oE '[0-9]+')
+        base="cycles:\$cyc, \"execute-time\":\$secs, throughput:\$tput, \"peak-rss\":\$rss"
+        if [ "${#sh[@]}" -gt 0 ]; then
+          maxsh=$(printf '%s\n' "${sh[@]}" | sort -n | tail -1)
+          jq -n --arg n "$c" --argjson cyc "$cyc" --argjson secs "${secs:-0}" \
+                --argjson tput "$tput" --argjson rss "$rss" \
+                --argjson nsh "${#sh[@]}" --argjson maxsh "$maxsh" \
+            "{(\$n): {$base, shards:\$nsh, \"max-shard-cycles\":\$maxsh}}"
+        else
+          jq -n --arg n "$c" --argjson cyc "$cyc" --argjson secs "${secs:-0}" \
+                --argjson tput "$tput" --argjson rss "$rss" \
+            "{(\$n): {$base}}"
+        fi
+      else
+        # prove (single-leaf, GPU runner only — the workflow gates this cell).
+        ( cd "$work" && timeout 60m cargo run --quiet --release --bin "$host" -- \
+            --gpu --ixe "$ixe" --constant "$c" --skip-deps ) > "$log" 2>&1 \
+          || { echo "::warning::$backend prove '$c' failed/timed out; dropping" >&2; continue; }
+        secs=$(grep -oE 'prove [0-9.]+s'   "$log" | head -1 | grep -oE '[0-9.]+')
+        steps=$(grep -oE '\([0-9]+ steps\)' "$log" | head -1 | grep -oE '[0-9]+')
+        fail=$(grep -oE 'failures=[0-9]+'  "$log" | head -1 | grep -oE '[0-9]+')
+        if [ -n "${secs:-}" ] && [ "${fail:-1}" = 0 ]; then
+          jq -n --arg n "$c" --argjson t "$secs" --argjson s "${steps:-0}" \
+            '{($n): {"prove-time": $t, steps: $s}}'
+        else
+          echo "::warning::$backend prove '$c': no clean prove line; dropping" >&2
+        fi
+      fi
+    done < "$names" | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null
+    emit_empty
+    ;;
+
+  *) echo "unknown backend: $backend" >&2; exit 2 ;;
+esac
+echo "rows in $out: $(jq 'length' "$out" 2>/dev/null || echo '?')"
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index 957b51b3..734bf787 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -1,14 +1,19 @@
-name: Aiur benchmarks
+name: Benchmarks
 
-# One workflow, two benchmarks per library env, on every push to main:
-#   1. compile job — `ix compile` the Lean env to a `.ixe` (compile-throughput
-#                    metrics) and cache the `.ixe`.
-#   2. prove job   — restore that `.ixe` from the cache (no recompile) and
-#                    STARK-check selected constants over it via bench-typecheck
-#                    (Aiur execute + prove metrics).
-# The prove job reuses the exact `.ixe` the compile job built, so the compiler
-# runs once. Compile and prove report to separate bencher testbeds so each one's
-# `--thresholds-reset` only touches its own measures.
+# Benchmarks tracked on Bencher on every push to main, all reusing the one
+# compiled `.ixe` so the compiler runs once:
+#   1. compile      — `ix compile` the Lean env to a `.ixe` (compile-throughput
+#                     metrics) and cache the `.ixe`.
+#   2. prove        — restore that `.ixe` (no recompile) and STARK-check selected
+#                     constants over it via bench-typecheck (Aiur execute + prove).
+#   3. zkvm-execute — restore that `.ixe` and execute the same constants through
+#                     the Zisk and SP1 zkVM hosts (deterministic cycle counts +
+#                     time/throughput/RAM; proving needs a GPU, so execute-only).
+#   4. native-check — restore that `.ixe` and run the native Rust kernel (the same
+#                     kernel, out-of-circuit and parallel — far faster) over the
+#                     whole env via `ix check --anon`, tracking throughput.
+# Each job reports to its own bencher testbed/workload so a threshold reset only
+# touches its own measures.
 
 on:
   push:
@@ -272,3 +277,161 @@ jobs:
             --threshold-measure throughput --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
             --threshold-lower-boundary 0.10
+
+  # Execute the same constants through the Zisk and SP1 zkVM hosts and track
+  # cycles / execute-time / throughput / peak-rss (and shards / max-shard-cycles
+  # for any sharded run). Lean-free: reuses the compile job's cached `.ixe` and
+  # only builds the Rust host. zkVM proving needs a GPU (absent here), so this is
+  # execute-only. Toolchain + deps come from the shared install-{zisk,sp1} actions.
+  zkvm-execute:
+    needs: compile
+    runs-on: warp-ubuntu-latest-x64-16x
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        backend: [zisk, sp1]
+        include:
+          - bench: InitStd
+            consts: Nat.add_comm Nat.sub_le_of_le_add String.append Array.append_assoc
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          fetch-tags: true   # bencher-track reads the baseline-reset tag
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          cache-workspaces: ${{ matrix.backend }}
+      - name: Install Zisk
+        if: matrix.backend == 'zisk'
+        uses: ./.github/actions/install-zisk
+      - name: Install SP1
+        if: matrix.backend == 'sp1'
+        uses: ./.github/actions/install-sp1
+      # Pull the `.ixe` the compile job built — no recompile (REUSE_IXE).
+      - uses: actions/cache/restore@v5
+        with:
+          path: ${{ matrix.bench }}.ixe
+          key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
+          fail-on-cache-miss: true
+      - name: Run ${{ matrix.backend }} execute benchmark
+        env:
+          REUSE_IXE: "1"
+        run: |
+          printf '%s\n' ${{ matrix.consts }} > names.txt
+          bash .github/scripts/run.sh . ${{ matrix.bench }} ${{ matrix.backend }} execute \
+            names.txt neutral.json
+          # Wrap neutral { name: { metric: v } } → Bencher Metric Format.
+          jq 'map_values(map_values({value: .}))' neutral.json > bench.json
+          cat bench.json
+      # cycles / shards / max-shard-cycles are deterministic per guest ELF →
+      # pinned (0/0). execute-time / peak-rss / throughput are noisy wall-clock →
+      # percentage bounds (throughput's regression is a drop).
+      - uses: ./.github/actions/bencher-track
+        with:
+          testbed: ${{ matrix.backend }}-execute-x64-16x
+          workload: ${{ matrix.backend }}
+          file: bench.json
+          key: ${{ secrets.BENCHER_API_KEY }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          thresholds: |
+            --threshold-measure cycles --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
+            --threshold-lower-boundary 0
+            --threshold-measure shards --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
+            --threshold-lower-boundary 0
+            --threshold-measure max-shard-cycles --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
+            --threshold-lower-boundary 0
+            --threshold-measure execute-time --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
+            --threshold-measure peak-rss --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
+            --threshold-measure throughput --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
+            --threshold-lower-boundary 0.10
+
+  # Native Rust kernel typecheck — the same kernel as the zkVM guest, but run
+  # out-of-circuit and in parallel (`--workers` defaults to the core count), so
+  # far faster than proving. Checks the whole env via `ix check --anon`, tracking
+  # throughput (constants/sec), wall time, and peak RAM. Reuses the compile job's
+  # cached `.ixe` and the staged `ix` binary — no recompile.
+  native-check:
+    needs: compile
+    runs-on: warp-ubuntu-latest-x64-32x
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        bench: [InitStd, Mathlib]
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          fetch-tags: true   # bencher-track reads the baseline-reset tag
+      - uses: actions/cache/restore@v5
+        with:
+          path: ~/.local/bin
+          key: aiur-bench-bins-${{ github.sha }}
+      - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
+      # Provision the toolchain so `ix` finds libleanshared (no package build).
+      - uses: leanprover/lean-action@v1
+        with:
+          auto-config: false
+          build: false
+          use-github-cache: false
+      - name: Install GNU time
+        run: |
+          sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true
+          sudo apt-get update && sudo apt-get install -y time
+      - uses: actions/cache/restore@v5
+        with:
+          path: ${{ matrix.bench }}.ixe
+          key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
+          fail-on-cache-miss: true
+      # `ix check --anon` checks every kernel-checkable address across
+      # available_parallelism workers and prints a machine-readable
+      # `##check## <elapsed_ms> <passed> <failures> <total>` line. ix check is a
+      # single multi-threaded process, so `/usr/bin/time -f '%M'` is the true peak RSS.
+      - name: Run native kernel check
+        run: |
+          /usr/bin/time -f '%e %M' -o time.txt \
+            ix check ${{ matrix.bench }}.ixe --anon 2>&1 | tee out.txt
+          line=$(grep '^##check##' out.txt | tail -1)
+          elapsed_ms=$(echo "$line" | awk '{print $2}')
+          failures=$(echo "$line" | awk '{print $4}')
+          total=$(echo "$line" | awk '{print $5}')
+          [ "${failures:-1}" = 0 ] || { echo "kernel check reported ${failures:-?} failure(s)"; exit 1; }
+          rssk=$(awk 'NR==1{print $2}' time.txt); rss=$(( ${rssk:-0} * 1024 ))
+          check_s=$(awk -v e="$elapsed_ms" 'BEGIN{printf "%.3f", e/1000}')
+          tput=$(awk -v t="$total" -v e="$elapsed_ms" 'BEGIN{ if (e>0) printf "%.2f", t*1000/e; else print 0 }')
+          jq -n --arg n "${{ matrix.bench }}" --argjson c "$total" --argjson s "$check_s" \
+                --argjson tp "$tput" --argjson rss "$rss" \
+            '{($n): {constants:{value:$c}, "check-time":{value:$s}, throughput:{value:$tp}, "peak-rss":{value:$rss}}}' \
+            > bench.json
+          cat bench.json
+      # constants is deterministic → pinned (0/0); check-time / throughput /
+      # peak-rss are noisy parallel wall-clock → percentage bounds.
+      - uses: ./.github/actions/bencher-track
+        with:
+          testbed: native-check-x64-32x
+          workload: native-check
+          file: bench.json
+          key: ${{ secrets.BENCHER_API_KEY }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          thresholds: |
+            --threshold-measure constants --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
+            --threshold-lower-boundary 0
+            --threshold-measure check-time --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
+            --threshold-measure throughput --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
+            --threshold-lower-boundary 0.10
+            --threshold-measure peak-rss --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index a2a1823c..c623d432 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -1,4 +1,17 @@
-# Creates a PR benchmark comment with a comparison to main
+# `!benchmark` PR command: run the curated constant set (Benchmarks/Vectors.csv)
+# through chosen prover backend(s) and post a main-vs-PR comparison table.
+#
+#   !benchmark [aiur] [zisk] [sp1 | all]  [execute|prove]
+#   BENCH_ENVS=initStd,mathlib     # which compiled envs (default initStd)
+#   BENCH_TIER=cheap|heavy|all     # override the mode default (execute=all, prove=cheap)
+#   BENCH_SHARD=1                  # restrict to the multi-shard target constants
+#   BENCH_GPU=1                    # allow zkVM prove on a self-hosted GPU runner
+#   RUST_LOG=info                  # passthrough env (allowlisted)
+#
+# Per-PR scope: Aiur runs execute (fast, --execute-only) or prove on CPU; the
+# Zisk/SP1 zkVM hosts run execute (deterministic cycle counts). zkVM prove needs
+# a GPU and is skipped with a note unless BENCH_GPU=1 selects a GPU runner. main's
+# numbers are cached by base SHA so they are not recomputed on every comment.
 name: Benchmark pull requests
 
 on:
@@ -11,145 +24,220 @@ permissions:
   pull-requests: read
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  group: ${{ github.workflow }}-${{ github.event.issue.number }}
   cancel-in-progress: true
 
 jobs:
   setup:
-    name: Comparative PR benchmark comment
-    if:
-      github.event.issue.pull_request
-      && github.event.issue.state == 'open'
-      && (contains(github.event.comment.body, '!benchmark'))
-      && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER')
+    name: Parse !benchmark comment
+    if: >-
+      github.event.issue.pull_request &&
+      github.event.issue.state == 'open' &&
+      contains(github.event.comment.body, '!benchmark') &&
+      (github.event.comment.author_association == 'MEMBER' ||
+       github.event.comment.author_association == 'OWNER')
     runs-on: ubuntu-latest
     outputs:
-      benches: ${{ steps.bench-params.outputs.benches }}
-      env-vars: ${{ steps.bench-params.outputs.env-vars }}
+      matrix: ${{ steps.parse.outputs.matrix }}
+      mode: ${{ steps.parse.outputs.mode }}
+      tier: ${{ steps.parse.outputs.tier }}
+      shard: ${{ steps.parse.outputs.shard }}
+      passthrough-env: ${{ steps.parse.outputs.passthrough-env }}
+      config-summary: ${{ steps.parse.outputs.config-summary }}
+      base-sha: ${{ steps.comment-branch.outputs.base_sha }}
+      head-sha: ${{ steps.comment-branch.outputs.head_sha }}
     steps:
       - uses: actions/checkout@v6
-      - name: Parse PR comment body
-        id: bench-params
-        run: |
-          # Parse `issue_comment` body
-          printf '${{ github.event.comment.body }}' > comment.txt
-          BENCH_COMMAND=$(head -n 1 comment.txt | tr -d '\r')
-          echo "$BENCH_COMMAND"
-
-          BENCHES=$(echo $BENCH_COMMAND | awk -F'!benchmark ' '{ print $2 }')
-          # Set default benches to run if none specified
-          BENCHES=${BENCHES:-"bench-aiur"}
-          echo "BENCHES:"
-          echo "$BENCHES"
-          JSON=$(echo $BENCHES | jq -R -c 'split(" ")')
-
-          echo "JSON:"
-          echo "$JSON"
-
-          echo "benches=$JSON" | tee -a $GITHUB_OUTPUT
-
-          # Can't persist env vars between jobs, so we pass them as an output and set them in the next job
-          echo "env-vars=$(tail -n +2 comment.txt | tr -d '\r' | tr '\n' ' ')" | tee -a $GITHUB_OUTPUT
+      # Resolve the PR's base/head SHAs from the comment's issue.
+      - uses: xt0rted/pull-request-comment-branch@v3
+        id: comment-branch
+      # Parse the comment from an env var (never inline-interpolated) → matrix +
+      # config. The allowlist drops anything that isn't a known flag/env key.
+      - name: Parse comment
+        id: parse
+        env:
+          COMMENT_BODY: ${{ github.event.comment.body }}
+        run: python3 .github/scripts/bench.py parse
 
   benchmark:
-    needs: [ setup ]
-    runs-on: warp-ubuntu-latest-x64-16x
+    needs: setup
+    runs-on: ${{ matrix.cell.runner }}
+    timeout-minutes: 120
     strategy:
+      fail-fast: false
       matrix:
-        # Runs a job for each benchmark specified in the `issue_comment` input
-        bench: ${{ fromJSON(needs.setup.outputs.benches) }}
+        cell: ${{ fromJSON(needs.setup.outputs.matrix) }}
+    env:
+      BACKEND: ${{ matrix.cell.backend }}
+      BENV: ${{ matrix.cell.env }}
+      MODE: ${{ matrix.cell.mode }}
+      LABEL: ${{ matrix.cell.label }}
+      BASE_SHA: ${{ needs.setup.outputs.base-sha }}
+      HEAD_SHA: ${{ needs.setup.outputs.head-sha }}
+      TIER: ${{ needs.setup.outputs.tier }}
+      SHARD: ${{ needs.setup.outputs.shard }}
     steps:
-      - name: Set env vars
+      # ---------- skipped cell (zkVM prove without a GPU runner) ----------
+      - name: Skip note
+        if: matrix.cell.skip == 'true'
         run: |
-          # Overrides default env vars with those specified in the `issue_comment` input if identically named
-          for var in ${{ needs.setup.outputs.env-vars }}
-          do
-            echo "$var" | tee -a $GITHUB_ENV
-          done
-      - uses: actions/checkout@v6
-        # Get base branch of the PR
-      - uses: xt0rted/pull-request-comment-branch@v3
-        id: comment-branch
-      - name: Checkout base branch
+          mkdir -p out
+          {
+            echo "### \`$BACKEND\` · \`$BENV\` · \`$MODE\`"
+            echo
+            echo "_Skipped: zkVM proving needs a GPU runner. Re-run with \`BENCH_GPU=1\` on a GPU-enabled runner._"
+          } > "out/table-$LABEL.md"
+
+      # ---------- real cell ----------
+      # PR checked out at the workspace root so the local install actions and the
+      # helper scripts resolve; base (cache-miss only) goes under base/.
+      - name: Checkout PR
+        if: matrix.cell.skip != 'true'
         uses: actions/checkout@v6
         with:
-          ref: ${{ steps.comment-branch.outputs.base_sha }}
-          path : ${{ github.workspace }}/base
-      - name: Run `lake build` on base branch
+          ref: ${{ env.HEAD_SHA }}
+      - name: Apply passthrough env
+        if: matrix.cell.skip != 'true'
+        env:   # handed over as data, never interpolated into the script text
+          PT_ENV: ${{ needs.setup.outputs.passthrough-env }}
+        run: |
+          while IFS= read -r line; do   # "-z ||" keeps exit status 0 on a blank line
+            [ -z "$line" ] || printf '%s\n' "$line" >> "$GITHUB_ENV"
+          done <<< "$PT_ENV"
+      # Select the constants for this cell → names.txt; emit count/vhash/tier
+      # (vhash is part of the main cache key, so editing Vectors.csv invalidates
+      # stale main results).
+      - name: Select constants from Benchmarks/Vectors.csv
+        if: matrix.cell.skip != 'true'
+        id: man
+        run: |
+          python3 .github/scripts/bench.py manifest \
+            --csv Benchmarks/Vectors.csv --env "$BENV" --mode "$MODE" \
+            --tier "$TIER" --shard "$SHARD" --out "$GITHUB_WORKSPACE/names.txt" \
+            | tee -a "$GITHUB_OUTPUT"
+
+      # Lean toolchain + build `ix` and `bench-typecheck` for the PR side (every
+      # backend needs `ix` to compile the env to a `.ixe`). Mathlib cache only
+      # pulled for the mathlib env.
+      - name: Build PR (ix, bench-typecheck)
+        if: matrix.cell.skip != 'true'
         uses: leanprover/lean-action@v1
         with:
-          lake-package-directory: ${{ github.workspace }}/base
-          test: false
-      - name: Run bench on base branch
-        run: |
-          if $(lake run get-exe-targets | grep -q ${{ matrix.bench }}); then
-            lake exe ${{ matrix.bench }}
-          else
-            echo "No matching bench target found on base branch"
-          fi
-        working-directory: ${{ github.workspace }}/base
-      - name: Checkout PR branch
+          lake-package-directory: .
+          auto-config: false
+          build: true
+          build-args: "ix bench-typecheck"
+          use-github-cache: false
+          use-mathlib-cache: ${{ matrix.cell.env == 'mathlib' && 'true' || 'false' }}
+      # zkVM cells additionally need the Rust toolchain + the backend's toolchain
+      # and system deps (the shared composite install actions).
+      - name: Set up zkVM Rust toolchain
+        if: matrix.cell.skip != 'true' && matrix.cell.backend != 'aiur'
+        uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          cache-workspaces: ${{ matrix.cell.backend }}
+      - name: Install SP1
+        if: matrix.cell.skip != 'true' && matrix.cell.backend == 'sp1'
+        uses: ./.github/actions/install-sp1
+      - name: Install Zisk
+        if: matrix.cell.skip != 'true' && matrix.cell.backend == 'zisk'
+        uses: ./.github/actions/install-zisk
+
+      # ---------- main side (cached by base SHA) ----------
+      - name: Restore cached main results
+        if: matrix.cell.skip != 'true'
+        id: main-cache
+        uses: actions/cache/restore@v4
+        with:
+          path: main.json
+          key: bench-${{ matrix.cell.label }}-${{ env.BASE_SHA }}-${{ steps.man.outputs.vhash }}
+      - name: Checkout base
+        if: matrix.cell.skip != 'true' && steps.main-cache.outputs.cache-hit != 'true'
         uses: actions/checkout@v6
         with:
-          path: ${{ github.workspace }}/pr
-          ref: ${{ steps.comment-branch.outputs.head_sha }}
-      - name: Run `lake build` on PR branch
+          ref: ${{ env.BASE_SHA }}
+          path: base
+      - name: Build base (ix, bench-typecheck)
+        if: matrix.cell.skip != 'true' && steps.main-cache.outputs.cache-hit != 'true'
         uses: leanprover/lean-action@v1
         with:
-          lake-package-directory: ${{ github.workspace }}/pr
-          test: false
-      - name: Copy base benchmarks into PR dir for comparison
+          lake-package-directory: base
+          auto-config: false
+          build: true
+          build-args: "ix bench-typecheck"
+          use-github-cache: false
+          use-mathlib-cache: ${{ matrix.cell.env == 'mathlib' && 'true' || 'false' }}
+      - name: Run backend on base → main.json
+        if: matrix.cell.skip != 'true' && steps.main-cache.outputs.cache-hit != 'true'
+        run: |
+          export PATH="$PWD/base/.lake/build/bin:$PATH"
+          bash .github/scripts/run.sh base "$BENV" "$BACKEND" "$MODE" \
+            "$GITHUB_WORKSPACE/names.txt" "$GITHUB_WORKSPACE/main.json"
+      - name: Save cached main results
+        if: matrix.cell.skip != 'true' && steps.main-cache.outputs.cache-hit != 'true'
+        uses: actions/cache/save@v4
+        with:
+          path: main.json
+          key: bench-${{ matrix.cell.label }}-${{ env.BASE_SHA }}-${{ steps.man.outputs.vhash }}
+
+      # ---------- PR side ----------
+      - name: Run backend on PR → pr.json
+        if: matrix.cell.skip != 'true'
         run: |
-          BENCH_DIR_PR=${{ github.workspace }}/pr/.lake/benches
-          BENCH_DIR_BASE=${{ github.workspace }}/base/.lake/benches
-          mkdir -p $BENCH_DIR_PR
-          [ -d "$BENCH_DIR_BASE" ] && cp -r $BENCH_DIR_BASE/. $BENCH_DIR_PR/
-          ls $BENCH_DIR_PR
-      - name: Run bench on PR branch and generate comparison report
+          export PATH="$PWD/.lake/build/bin:$PATH"
+          bash .github/scripts/run.sh . "$BENV" "$BACKEND" "$MODE" \
+            "$GITHUB_WORKSPACE/names.txt" "$GITHUB_WORKSPACE/pr.json"
+
+      # ---------- compare ----------
+      - name: Build comparison table
+        if: matrix.cell.skip != 'true'
         run: |
-          BENCH_REPORT=1 lake exe ${{ matrix.bench }}
-        working-directory: ${{ github.workspace }}/pr
-      - name: Get env for PR body
+          mkdir -p out
+          python3 .github/scripts/bench.py compare \
+            --main main.json --pr pr.json --out "out/table-$LABEL.md" \
+            --backend "$BACKEND" --env "$BENV" --mode "$MODE" \
+            --count "${{ steps.man.outputs.count }}" \
+            --cache-hit "${{ steps.main-cache.outputs.cache-hit }}"
+          cat "out/table-$LABEL.md"
+
+      - name: Upload table
         if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: table-${{ env.LABEL }}
+          path: out/table-${{ env.LABEL }}.md
+          if-no-files-found: warn
+
+  comment:
+    needs: [setup, benchmark]
+    if: always() && needs.setup.result == 'success'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - name: Download tables
+        uses: actions/download-artifact@v4
+        with:
+          path: tables
+          pattern: table-*
+          merge-multiple: true
+      - name: Build comment body
+        env:
+          SUMMARY: ${{ needs.setup.outputs.config-summary }}
         run: |
-          SHORT_SHA_PR=$(git rev-parse --short HEAD)
-          REPO_URL=${{ github.server_url }}/${{ github.repository }}
-          echo "COMMIT_LINK=[\`$SHORT_SHA_PR\`]($REPO_URL/commit/${{ steps.comment-branch.outputs.head_sha }})" | tee -a $GITHUB_ENV
-          echo "WORKFLOW_LINK=[Workflow logs]($REPO_URL/actions/runs/${{ github.run_id }})" | tee -a $GITHUB_ENV
-        working-directory: ${{ github.workspace }}/pr
+          python3 .github/scripts/bench.py comment \
+            --tables tables --summary "$SUMMARY" \
+            --head "${{ needs.setup.outputs.head-sha }}" \
+            --repo-url "${{ github.server_url }}/${{ github.repository }}" \
+            --run-id "${{ github.run_id }}" --out comment-body.md
       - name: Generate token to write PR comment
-        uses: actions/create-github-app-token@v3
-        if: always()
         id: app-token
+        uses: actions/create-github-app-token@v3
         with:
           app-id: ${{ secrets.TOKEN_APP_ID }}
           private-key: ${{ secrets.TOKEN_APP_PRIVATE_KEY }}
-      - name: Build benchmark comment body
-        if: success()
-        run: |
-          {
-            echo '## Benchmark for `${{ matrix.bench }}` at ${{ env.COMMIT_LINK }}';
-            echo "";
-            for file in .lake/benches/*/report.md; do
-              [ -f "$file" ] && cat "$file" && echo ""
-            done
-            echo "${{ env.WORKFLOW_LINK }}";
-          } > ${{ github.workspace }}/comment-body.md
-        working-directory: ${{ github.workspace }}/pr
-      - name: Comment on successful run
-        if: success()
+      - name: Post / update comment
         uses: peter-evans/create-or-update-comment@v5
         with:
           token: ${{ steps.app-token.outputs.token }}
           issue-number: ${{ github.event.issue.number }}
-          body-path: 'comment-body.md'
-      - name: Comment on failing run
-        if: failure()
-        uses: peter-evans/create-or-update-comment@v5
-        with:
-          token: ${{ steps.app-token.outputs.token }}
-          issue-number: ${{ github.event.issue.number }}
-          body: |
-            ## Benchmark for `${{ matrix.bench }}` at ${{ env.COMMIT_LINK }} failed :x:
-
-            [Workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
+          body-path: comment-body.md
diff --git a/.github/workflows/riscv-bench.yml b/.github/workflows/riscv-bench.yml
index d9ec22b7..79ab4950 100644
--- a/.github/workflows/riscv-bench.yml
+++ b/.github/workflows/riscv-bench.yml
@@ -60,23 +60,10 @@ jobs:
       - uses: actions-rust-lang/setup-rust-toolchain@v1
         with:
           cache-workspaces: sp1
-      - name: Install system build deps
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y \
-            xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \
-            nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \
-            libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \
-            openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \
-            pkg-config libssl-dev
+      - uses: ./.github/actions/install-sp1
       - uses: actions/download-artifact@v4
         with:
           name: minimal-ixe
-      - name: Install SP1 toolchain (sp1up, latest)
-        run: |
-          curl -L https://sp1up.succinct.xyz | bash
-          ~/.sp1/bin/sp1up
-          echo "$HOME/.sp1/bin" >> "$GITHUB_PATH"
       # The precompile-aware SP1 runner-binary is auto-built from the fork git
       # dep by `sp1-core-executor-runner`'s build script — no manual override.
       - name: SP1 — execute minimal.ixe (assert failures == 0)
@@ -94,58 +81,10 @@ jobs:
       - uses: actions-rust-lang/setup-rust-toolchain@v1
         with:
           cache-workspaces: zisk
-      - name: Install system build deps
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y \
-            xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \
-            nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \
-            libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \
-            openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \
-            pkg-config libssl-dev
+      - uses: ./.github/actions/install-zisk
       - uses: actions/download-artifact@v4
         with:
           name: minimal-ixe
-      - name: Install Zisk toolchain (ziskup, pinned v0.18.0)
-        # `--version 0.18.0` pins the toolchain to match our deps. Our host links
-        # the argumentcomputer/zisk `blake3-precompile` fork, which is based on
-        # v0.18.0 (its cargo-zisk has `check-setup`, used below to regenerate the
-        # key's const-trees). Without the pin, ziskup installs `releases/latest`,
-        # which resolves to upstream `v1.0.0-alpha` — a different circuit whose
-        # cargo-zisk dropped the `check-setup` subcommand, breaking the key step.
-        # `--cpu` picks the CPU build (no GPU on the runner) and `--nokey` skips
-        # ziskup's key install — both avoid its interactive /dev/tty prompts. We
-        # keep `--nokey` because the upstream `zisk-setup` bucket only carries the
-        # upstream circuit's key; our fork has a different circuit (extra Blake3f
-        # AIR), so we restore the fork-matching key from our own S3 in the next
-        # step. `--prefix $HOME/.zisk` pins the install where cargo-zisk's
-        # ZiskPaths fallback looks (the runner sets XDG_CONFIG_HOME, which would
-        # otherwise relocate it).
-        run: |
-          curl -L https://raw.githubusercontent.com/0xPolygonHermez/zisk/main/ziskup/install.sh \
-            | bash -s -- --cpu --nokey -y --version 0.18.0 --prefix "$HOME/.zisk"
-          echo "$HOME/.zisk/bin" >> "$GITHUB_PATH"
-      # Execute still needs a proving key present: zisk-host calls
-      # `client.setup()` (which the SDK runs before the execute branch), and that
-      # loads the circuit's const-tree files. We host the fork-matching key in a
-      # public S3 bucket WITHOUT the const-trees — exactly like Zisk's released
-      # `zisk-provingkey-*.tar.gz` on `storage.googleapis.com/zisk-setup` — and
-      # regenerate them here with `cargo-zisk check-setup -a`, which is how
-      # `ziskup` itself populates them. That keeps the artifact ~3 GB (gzip)
-      # instead of ~48 GB. The object name carries the fork rev so a circuit
-      # change can't silently reuse a stale key. Public bucket → plain curl, no
-      # AWS creds.
-      - name: Restore Zisk proving key (fork circuit) from S3
-        run: |
-          mkdir -p "$HOME/.zisk"
-          curl -fSL --retry 3 \
-            https://argument-zisk-setup.s3.amazonaws.com/zisk-provingkey-blake3-8f9e24d5-cpu.tar.gz \
-            -o /tmp/zisk-provingkey.tar.gz
-          tar -C "$HOME/.zisk" -xzf /tmp/zisk-provingkey.tar.gz
-          rm -f /tmp/zisk-provingkey.tar.gz
-          # Regenerate the const-tree files omitted from the artifact (CPU build,
-          # so no --gpu). This is the "may take a while" step ziskup prints.
-          cargo-zisk check-setup --proving-key "$HOME/.zisk/provingKey" -a
       - name: Zisk — execute minimal.ixe (assert failures == 0)
         run: |
           cd zisk
diff --git a/Benchmarks/Typecheck.lean b/Benchmarks/Typecheck.lean
index 047cbec7..1196c055 100644
--- a/Benchmarks/Typecheck.lean
+++ b/Benchmarks/Typecheck.lean
@@ -18,25 +18,34 @@ runtime. Useful standalone (per-constant timeline + RAM breakdown via
 tracing-texray) and as a machine source (neutral results JSON).
 
 ```
-lake exe bench-typecheck --ixe <path> [names…] [flags]
+lake exe bench-typecheck --ixe <path> [--constant <name>] [names…] [flags]
 
   --ixe <path>       serialized `Ixon.Env`, e.g. from `ix compile Foo.lean`
                      (writes `foo.ixe`). Required.
-  [names…]           zero or more fully-qualified constant names to benchmark,
+  --constant <name>  constant to benchmark, by fully-qualified Lean name. The
+                     canonical single-target flag, shared with the Zisk
+                     `zisk-host --constant`. Unions with names / manifest.
+  [names…]           zero or more additional constant names to benchmark,
                      e.g. `Nat.add_comm String.append`.
   --manifest <path>  additionally read names from a file: one per line, blank
                      lines and `#` comments ignored. Unions with [names…].
 
-  Names (from either source) resolve against the env's named map via
+  Names (from any source) resolve against the env's named map via
   `String.toName` plus a `toString` fallback (mirrors `ix check --ixe`), so
   numeric / private components round-trip (`Foo.0.Bar`, `_private.M.0.foo`).
-  Pass at least one name or a `--manifest`.
+  Pass at least one name via --constant, positional args, or --manifest.
 
+  --skip-deps    check only each target itself (verify_const, trusting its
+                 deps) instead of its whole transitive closure (verify_claim,
+                 the default). Same flag as `zisk-host --skip-deps`; reserved
+                 for targets too expensive to full-closure-check.
   --json <path>  write per-constant results JSON to <path>. Off by default:
                  normal CLI usage prints only the human-readable summary.
   --texray       force the tracing-texray timeline + RAM breakdown on.
   --no-texray    force it off. Default: on iff `--json` was NOT given, so a
                  plain local run gets the breakdown while a JSON run stays quiet.
+  --execute-only run only Phase 1 (constants / fft-cost / execute-time) and skip
+                 proving — the fast `execute`-mode signal.
 ```
 
 For each constant the harness STARK-checks `Ix.Claim.check addr none` (the full
@@ -130,7 +139,13 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
   let some ixeArg := p.flag? "ixe"
     | IO.eprintln "error: --ixe <path> is required"; return 1
   let ixePath := ixeArg.as! String
-  -- Names come from the variadic positional args and/or a `--manifest` file.
+  -- Names come from `--constant`, the variadic positional args, and/or a
+  -- `--manifest` file. `--constant` is the canonical single-target flag shared
+  -- with the Zisk `zisk-host --constant`; positional names and `--manifest`
+  -- remain for benchmarking many constants at once.
+  let constName : Array String := match p.flag? "constant" with
+    | some f => #[f.as! String]
+    | none => #[]
   let cliNames := p.variableArgsAs! String
   let fileNames ← match p.flag? "manifest" with
     | some f => pure (parseManifest (← IO.FS.readFile (f.as! String)))
@@ -139,16 +154,19 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
   let nameArgs := Id.run do
     let mut seen : Std.HashSet String := {}
     let mut acc : Array String := #[]
-    for n in cliNames ++ fileNames do
+    for n in constName ++ cliNames ++ fileNames do
       if !seen.contains n then seen := seen.insert n; acc := acc.push n
     return acc
   if nameArgs.isEmpty then
-    IO.eprintln "error: provide one or more constant names and/or --manifest <path>"
+    IO.eprintln "error: provide a constant via --constant, positional name(s), and/or --manifest <path>"
     return 1
   let jsonOut : Option String := (p.flag? "json").map (·.as! String)
-  -- subject-only: check just the target (`verify_const`, trusting its deps)
+  -- skip-deps: check just the target (`verify_const`, trusting its deps)
   -- instead of re-checking the whole transitive closure (`verify_claim`).
-  let subjectOnly := p.hasFlag "subject-only"
+  let skipDeps := p.hasFlag "skip-deps"
+  -- Execute-only: run just Phase 1 (constants / fft-cost / execute-time) and
+  -- skip the Phase 2 prove loop.
+  let executeOnly := p.hasFlag "execute-only"
   -- Default: trace iff we're not in JSON/bencher mode.
   let useTexray :=
     if p.hasFlag "texray" then true
@@ -160,7 +178,7 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
     | throw (IO.userError "Merging IxVM kernel failed")
   let .ok compiled := toplevel.compile
     | throw (IO.userError "Compilation of IxVM kernel failed")
-  let entrypoint := if subjectOnly then `verify_const else `verify_claim
+  let entrypoint := if skipDeps then `verify_const else `verify_claim
   let some funIdx := compiled.getFuncIdx entrypoint
     | throw (IO.userError s!"{entrypoint} entrypoint missing")
   let aiurSystem := Aiur.AiurSystem.build compiled.bytecode commitmentParameters
@@ -190,7 +208,7 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
 
   -- Phase 1: execute every constant (cheap, deterministic structural metrics).
   -- For full-closure check claims, use `checkAddrWithEnv` against the
-  -- shared `envHandle`. For `--subject-only` (`buildVerifyConst`), the
+  -- shared `envHandle`. For `--skip-deps` (`buildVerifyConst`), the
   -- witness is a small subject-only blob — keep Lean witness +
   -- `executeIxVM`.
   IO.println "── Phase 1: execute (witness generation) ──"
@@ -198,7 +216,7 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
   for (label, addr) in targets do
     try
       let (res, execSec) ← timed fun _ =>
-        if subjectOnly then
+        if skipDeps then
           let witness := IxVM.ClaimHarness.buildVerifyConst ixonEnv addr
           compiled.bytecode.executeIxVM funIdx witness.input witness.inputIOBuffer
         else
@@ -223,6 +241,15 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
       IO.FS.writeFile path (Json.mkObj (results.map Result.toJsonEntry).toList).pretty
     | none => pure ()
 
+  -- `--execute-only`: stop after Phase 1; the results JSON (if requested) is
+  -- already complete with the execute metrics.
+  if executeOnly then
+    writeJson (execed.map (·.1))
+    match jsonOut with
+    | some path => IO.println s!"wrote {execed.size} execute-only benchmarks to {path}"
+    | none => IO.println s!"executed {execed.size} constants (--execute-only); pass --json <path> to emit results"
+    return 0
+
   -- Phase 2: prove cheap→expensive. Refine each entry with its prove-time as it
   -- lands. Install texray first so the prove spans (timeline + RAM Δ/peak) render.
   if useTexray then TracingTexray.init {}
@@ -234,7 +261,7 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
     let (r, addr) := ordered[i]!
     try
       let (proveRes, proveSec) ← timed fun _ =>
-        if subjectOnly then
+        if skipDeps then
           let witness := IxVM.ClaimHarness.buildVerifyConst ixonEnv addr
           let (claim, proof, ioBuf) :=
             aiurSystem.proveIxVM friParameters funIdx witness.input witness.inputIOBuffer
@@ -270,9 +297,11 @@ def typecheckCmd : Cli.Cmd := `[Cli|
 
   FLAGS:
     "ixe"       : String; "Path to a serialized `Ixon.Env` (e.g. produced by `ix compile`). Required."
+    "constant"  : String; "Constant to benchmark, by fully-qualified Lean name. The canonical single-target flag (shared with `zisk-host --constant`). Unions with positional names / --manifest."
     "manifest"  : String; "Additionally read constant names from a file (one per line; `#` comments and blank lines ignored). Unions with the positional names."
     "json"      : String; "Write per-constant results JSON to this path. Off by default; normal CLI usage prints only the human-readable summary."
-    "subject-only";       "Check only each target itself (verify_const, trusting its deps) instead of re-checking its whole transitive closure (verify_claim)."
+    "skip-deps";          "Check only each target itself (verify_const, trusting its deps) instead of re-checking its whole transitive closure (verify_claim). Same flag as `zisk-host --skip-deps`."
+    "execute-only";       "Execute only (Phase 1: constants / fft-cost / execute-time) and skip proving. The fast per-PR `execute`-mode signal."
     texray;               "Force the tracing-texray timeline + RAM breakdown on (per-prove spans on stderr)."
     "no-texray";          "Force the breakdown off. Default: on iff --json was not given."
 
diff --git a/Benchmarks/Vectors.csv b/Benchmarks/Vectors.csv
new file mode 100644
index 00000000..43adb520
--- /dev/null
+++ b/Benchmarks/Vectors.csv
@@ -0,0 +1,93 @@
+# Benchmark constant vectors -- single shared source of truth.
+# Consumed identically by Aiur (bench-typecheck --manifest), the Zisk/SP1 hosts
+# (--constant loop), and CI shell (awk). One row per curated constant.
+#
+# Provenance: measured across this box's benchmarking dirs, de-duplicated --
+#   aiur_fft / env : ~/ix-aiur/Benchmarks/Statistics/data/aiur/cost.csv
+#   zisk_cycles    : ~/ix-aiur/Benchmarks/Statistics/data/zisk/single_shard.csv
+# 27 IxVM kernel-primitive constants from cost.csv are intentionally excluded
+# (not Init/Std/Mathlib library constants).
+#
+# Columns:
+#   name         fully-qualified Lean name (resolves via NameResolve.resolveIxeAddr).
+#   env          compile target / .ixe it resolves in: initStd | lean | mathlib.
+#   tier         cheap = prove-feasible per-PR; heavy = execute-only / sharded.
+#                Boundary: Aiur fft >= 1e9 => heavy.
+#   shard_target 1 = heavy constant designated as a multi-shard prove target.
+#   aiur_fft     measured Aiur fft-cost (proving-cost proxy). Informational.
+#   zisk_cycles  measured Zisk ziskemu step count; empty if unmeasured. Informational.
+#
+# CI filters on the env column (not line number), so these '#' lines and the
+# header are skipped by: awk -F, '$1!~/^#/ && $1!="name" && $2==env ...'
+name,env,tier,shard_target,aiur_fft,zisk_cycles
+HEq,initStd,cheap,0,1716582,
+Nat,initStd,cheap,0,1857523,975244
+Eq.rec,initStd,cheap,0,2575400,2348520
+HEq.rec,initStd,cheap,0,2692988,2727278
+Trans.mk,initStd,cheap,0,2911629,7229214
+Array.toList,initStd,cheap,0,3332563,2580844
+Acc.rec,initStd,cheap,0,3505064,5105888
+Std.Time.Month.Offset.ofNat,initStd,cheap,0,3607673,1493508
+Sum.elim,initStd,cheap,0,5589905,6618130
+Prod.map,initStd,cheap,0,6904183,8177456
+Option.bind,initStd,cheap,0,7329183,7440608
+Except.bind,initStd,cheap,0,7667869,9427477
+WellFounded.fix,initStd,cheap,0,10125144,13415585
+Nat.add,initStd,cheap,0,13343000,10606339
+List.foldr,initStd,cheap,0,18579757,16707100
+List.dropLast,initStd,cheap,0,19509718,17522863
+List.range,initStd,cheap,0,20251801,13666491
+List.zipWith,initStd,cheap,0,20439088,20229977
+List.filterMap,initStd,cheap,0,25335779,21435279
+List.foldlM,initStd,cheap,0,39202740,
+Int.add,initStd,cheap,0,44714703,27635032
+BitVec.toFin,initStd,cheap,0,50437466,28681028
+Nat.add_comm,initStd,cheap,0,56084908,53239676
+UInt32.toNat,initStd,cheap,0,59331806,29980254
+USize.toNat,initStd,cheap,0,71607481,35811906
+Nat.decEq,initStd,cheap,0,71921625,57411966
+ByteSlice.ofByteArray,initStd,cheap,0,107574377,53555107
+Nat.decLe,initStd,cheap,0,209641496,143391161
+Nat.strongRecOn,initStd,cheap,0,273068854,190849758
+Int.emod,initStd,cheap,0,422940733,269380418
+Int.ediv,initStd,cheap,0,430476738,270987292
+Array.foldlM,initStd,cheap,0,434577494,
+Array.foldl,initStd,cheap,0,449323126,278537034
+Array.filter,initStd,cheap,0,464818232,285847515
+Nat.sub_le_of_le_add,initStd,cheap,0,567575653,373184538
+BitVec.add,initStd,cheap,0,617113462,373772656
+Int.gcd,initStd,cheap,0,657502637,409112011
+Nat.toDigits,initStd,cheap,0,663606297,357145741
+Array.map,initStd,cheap,0,734574964,443199245
+Array.zipWith,initStd,cheap,0,736658636,445195121
+String.Internal.append,initStd,cheap,0,793580333,
+Lean.Name.hash,initStd,cheap,0,861742653,447441591
+BitVec.umod,initStd,cheap,0,926177790,526467117
+Nat.repr,initStd,cheap,0,966452765,498729913
+Int.repr,initStd,cheap,0,993792541,504234535
+String.intercalate,initStd,heavy,0,1089240518,599428829
+_private.Init.Prelude.0.Lean.extractMainModule._unsafe_rec,initStd,heavy,0,1197925029,
+Char.toLower,initStd,heavy,0,1198467414,665920824
+Nat.gcd_comm,initStd,heavy,0,1954958779,1144352360
+Int.emod_emod_of_dvd,initStd,heavy,0,3856852693,2201588182
+Array.append_assoc,initStd,heavy,0,3938574533,1570256148
+Vector.append,initStd,heavy,0,4023268168,1614275115
+Fin.foldl,initStd,heavy,0,10853255199,5110854190
+List.mergeSort,initStd,heavy,1,13825318985,6706906294
+Array.binSearch,initStd,heavy,1,14397133548,6785827470
+Array.qsort,initStd,heavy,0,15781689533,7199288749
+Array.qsortOrd,initStd,heavy,0,15841062472,7206704674
+String.split,initStd,heavy,0,19578088286,8657387499
+Std.Time.Week.Offset.ofMilliseconds,initStd,heavy,0,24577209792,6653972854
+Vector.extract_append,initStd,heavy,1,61830646478,
+Lean.Expr.replace,lean,cheap,0,859625514,
+List.Sorted,mathlib,cheap,0,9578666,
+Nat.choose,mathlib,cheap,0,29018862,
+Nat.factorial,mathlib,cheap,0,33562426,
+Nat.fib,mathlib,cheap,0,34171209,
+GCDMonoid.gcd,mathlib,heavy,0,1005736276,
+Nat.Prime.two_le,mathlib,heavy,0,1504045298,
+Finset.prod,mathlib,heavy,0,3045165822,
+Finset.sum,mathlib,heavy,0,3045189408,
+Polynomial.eval,mathlib,heavy,0,5342731754,
+Multiset.sort,mathlib,heavy,1,18670960624,

From 457ff2e6824de5610006a8694e4a91909f029073 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Tue, 30 Jun 2026 18:55:19 +0000
Subject: [PATCH 03/27] test(ci): trigger bench-pr.yml on pull_request
 (TEMPORARY)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

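issue_comment runs always use the workflow file from the default branch, so the
`!benchmark` path can't be exercised from a PR branch before it is merged. Add a
temporary pull_request trigger to test pre-merge: base/head SHAs come straight
from the PR payload, and the empty comment body makes the parser fall back to
its defaults (aiur / initStd / execute).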

Before merge, delete the `pull_request:` trigger. A full revert is not needed:
the dual base/head resolution and the `|| pull_request.number` fallbacks are
harmless to keep.
---
 .github/workflows/bench-pr.yml | 48 ++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index c623d432..f162e847 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -17,6 +17,12 @@ name: Benchmark pull requests
 on:
   issue_comment:
     types: [created]
+  # TEST: temporary trigger so this workflow can be exercised from a PR branch.
+  # issue_comment only fires once the workflow is on the default branch, so a
+  # `!benchmark` comment can't test it pre-merge. On pull_request there is no
+  # comment, so the run uses the parser defaults (aiur / initStd / execute).
+  # Remove this trigger before merge.
+  pull_request:
 
 permissions:
   contents: read
@@ -24,18 +30,19 @@ permissions:
   pull-requests: read
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.issue.number }}
+  group: ${{ github.workflow }}-${{ github.event.issue.number || github.event.pull_request.number }}
   cancel-in-progress: true
 
 jobs:
   setup:
     name: Parse !benchmark comment
     if: >-
-      github.event.issue.pull_request &&
-      github.event.issue.state == 'open' &&
-      contains(github.event.comment.body, '!benchmark') &&
-      (github.event.comment.author_association == 'MEMBER' ||
-       github.event.comment.author_association == 'OWNER')
+      github.event_name == 'pull_request' ||
+      (github.event.issue.pull_request &&
+       github.event.issue.state == 'open' &&
+       contains(github.event.comment.body, '!benchmark') &&
+       (github.event.comment.author_association == 'MEMBER' ||
+        github.event.comment.author_association == 'OWNER'))
     runs-on: ubuntu-latest
     outputs:
       matrix: ${{ steps.parse.outputs.matrix }}
@@ -44,16 +51,29 @@ jobs:
       shard: ${{ steps.parse.outputs.shard }}
       passthrough-env: ${{ steps.parse.outputs.passthrough-env }}
       config-summary: ${{ steps.parse.outputs.config-summary }}
-      base-sha: ${{ steps.comment-branch.outputs.base_sha }}
-      head-sha: ${{ steps.comment-branch.outputs.head_sha }}
+      base-sha: ${{ steps.shas.outputs.base }}
+      head-sha: ${{ steps.shas.outputs.head }}
     steps:
       - uses: actions/checkout@v6
-      # Resolve the PR's base/head SHAs from the comment's issue.
-      - uses: xt0rted/pull-request-comment-branch@v3
+      # issue_comment exposes the PR's base/head only via this action; the
+      # pull_request event carries them on the payload directly.
+      - if: github.event_name == 'issue_comment'
+        uses: xt0rted/pull-request-comment-branch@v3
         id: comment-branch
-      # Parse the comment from an env var (never inline-interpolated) → matrix +
-      # config. The allowlist drops anything that isn't a known flag/env key.
-      - name: Parse comment
+      - name: Resolve base/head SHAs
+        id: shas
+        run: |
+          if [ "${{ github.event_name }}" = pull_request ]; then
+            echo "base=${{ github.event.pull_request.base.sha }}" >> "$GITHUB_OUTPUT"
+            echo "head=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT"
+          else
+            echo "base=${{ steps.comment-branch.outputs.base_sha }}" >> "$GITHUB_OUTPUT"
+            echo "head=${{ steps.comment-branch.outputs.head_sha }}" >> "$GITHUB_OUTPUT"
+          fi
+      # Parse the !benchmark command from an env var (never inline-interpolated).
+      # The allowlist drops anything that isn't a known flag/env key; an empty
+      # body (pull_request) yields the parser defaults.
+      - name: Parse command
         id: parse
         env:
           COMMENT_BODY: ${{ github.event.comment.body }}
@@ -239,5 +259,5 @@ jobs:
         uses: peter-evans/create-or-update-comment@v5
         with:
           token: ${{ steps.app-token.outputs.token }}
-          issue-number: ${{ github.event.issue.number }}
+          issue-number: ${{ github.event.issue.number || github.event.pull_request.number }}
           body-path: comment-body.md

From 2937a9d900abf8d86f1e45c90d7ec268cd167ee4 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Tue, 30 Jun 2026 19:52:12 +0000
Subject: [PATCH 04/27] feat(ci): default to ~11 primary constants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The full curated set (~60 InitStd) is too slow to run on every !benchmark and on
the bencher prove/zkVM jobs. Add a `primary` column to Vectors.csv marking 11
constants spanning shape (nat/list/array/int/string/vector/multiset, defs +
proofs) and the cheap→heavy cost range (incl. 3 shard targets), and make it the
default:

- bench.py: `manifest` gains `--primary`; `parse` accepts BENCH_FULL=1 (run the
  full set) and emits a `full` output.
- bench-pr.yml: !benchmark defaults to the primary subset; BENCH_FULL=1 runs the
  whole curated set.
- bench-main.yml: prove + zkvm-execute derive constants from `manifest --primary`
  (replacing the hardcoded lists), so all backends bench the same set from the one
  source of truth. The tier filter keeps prove on the cheap primaries (heavy ones
  would OOM a single-shard prove); execute/native get the heavy ones for scale.
---
 .github/scripts/bench.py         |  14 ++-
 .github/workflows/bench-main.yml |  24 +++--
 .github/workflows/bench-pr.yml   |   9 +-
 Benchmarks/Vectors.csv           | 149 ++++++++++++++++---------------
 4 files changed, 110 insertions(+), 86 deletions(-)

diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index c27582c9..4622b9f4 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -21,7 +21,7 @@
 BACKENDS = ("aiur", "zisk", "sp1")
 MODES = ("execute", "prove")
 ENVS = ("initStd", "lean", "mathlib")
-CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_GPU"}
+CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_GPU", "BENCH_FULL"}
 PASSTHROUGH_KEYS = {"RUST_LOG", "WITHOUT_VK_VERIFICATION", "RUSTFLAGS"}
 
 
@@ -70,6 +70,7 @@ def cmd_parse(_a):
     if tier not in ("cheap", "heavy", "all"):
         tier = ""             # empty ⇒ derived from mode at manifest time
     shard = "1" if cfg.get("BENCH_SHARD") == "1" else "0"
+    full = "1" if cfg.get("BENCH_FULL") == "1" else "0"  # full set vs primary subset
     gpu = cfg.get("BENCH_GPU") == "1"
 
     cells = []
@@ -80,14 +81,14 @@ def cmd_parse(_a):
                           "skip": "true" if skip else "false", "label": f"{b}-{e}-{mode}"})
 
     summary = (f"backends: `{' '.join(backends)}` · mode: `{mode}` · "
-               f"envs: `{','.join(envs)}` · tier: `{tier or 'auto'}` · "
-               f"shard: `{shard}` · gpu: `{int(gpu)}`")
+               f"envs: `{','.join(envs)}` · set: `{'full' if full == '1' else 'primary'}` · "
+               f"tier: `{tier or 'auto'}` · shard: `{shard}` · gpu: `{int(gpu)}`")
     if passthrough:
         summary += " · env: `" + " ".join(passthrough) + "`"
 
     with open(os.environ.get("GITHUB_OUTPUT", "/dev/stdout"), "a") as f:
         f.write(f"matrix={json.dumps(cells)}\n")
-        f.write(f"mode={mode}\ntier={tier}\nshard={shard}\n")
+        f.write(f"mode={mode}\ntier={tier}\nshard={shard}\nfull={full}\n")
         f.write(f"config-summary={summary}\n")
         f.write("passthrough-env<<PTENV\n" + "\n".join(passthrough)
                 + ("\n" if passthrough else "") + "PTENV\n")
@@ -108,8 +109,11 @@ def cmd_manifest(a):
             if cols[0] == "name" or len(cols) < 4:
                 continue
             name, env, ctier, shard = cols[:4]
+            rep = cols[4] if len(cols) >= 5 else "0"
             if env != a.env:
                 continue
+            if a.primary and rep != "1":
+                continue
             if tier in ("cheap", "heavy") and ctier != tier:
                 continue
             if a.shard == "1" and shard != "1":
@@ -245,6 +249,8 @@ def main():
     m.add_argument("--csv", required=True); m.add_argument("--env", required=True)
     m.add_argument("--mode", required=True); m.add_argument("--tier", default="")
     m.add_argument("--shard", default="0"); m.add_argument("--out", required=True)
+    m.add_argument("--primary", action="store_true",
+                   help="Restrict to the primary subset (the primary=1 column).")
     m.set_defaults(fn=cmd_manifest)
 
     c = sub.add_parser("compare")
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index 734bf787..5ddda359 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -178,11 +178,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        include:
-          - bench: InitStd
-            consts: Nat.add_comm Nat.sub_le_of_le_add String.append Array.append_assoc
-          - bench: Mathlib
-            consts: Nat.factorial Nat.Coprime Nat.Prime.two_le
+        bench: [InitStd, Mathlib]
     steps:
       - uses: actions/checkout@v6
         with:
@@ -230,7 +226,14 @@ jobs:
                 "res-$c.json" > "res-$c.json.tmp" && mv "res-$c.json.tmp" "res-$c.json" || true
             fi
           }
-          for c in ${{ matrix.consts }}; do measure "$c"; done
+          # The primary subset's prove tier (cheap primaries) — heavy primaries are
+          # execute-only (a single-shard prove would OOM the runner).
+          benv="${{ matrix.bench }}"; benv="${benv,}"   # InitStd→initStd, Mathlib→mathlib
+          python3 .github/scripts/bench.py manifest \
+            --csv Benchmarks/Vectors.csv --env "$benv" --mode prove --primary \
+            --out names.txt
+          echo "proving $(wc -l < names.txt) primary constants:"; cat names.txt
+          while IFS= read -r c; do [ -n "$c" ] && measure "$c"; done < names.txt
           # Merge the per-constant results; if none produced anything, emit `{}`.
           jq -s 'reduce .[] as $o ({}; . + $o)' res-*.json > results.json 2>/dev/null \
             || echo '{}' > results.json
@@ -293,7 +296,6 @@ jobs:
         backend: [zisk, sp1]
         include:
           - bench: InitStd
-            consts: Nat.add_comm Nat.sub_le_of_le_add String.append Array.append_assoc
     steps:
       - uses: actions/checkout@v6
         with:
@@ -318,7 +320,13 @@ jobs:
         env:
           REUSE_IXE: "1"
         run: |
-          printf '%s\n' ${{ matrix.consts }} > names.txt
+          # The primary subset in execute mode = all primaries for the env
+          # (cheap + heavy); execute handles the heavy ones, unlike prove.
+          benv="${{ matrix.bench }}"; benv="${benv,}"   # InitStd→initStd
+          python3 .github/scripts/bench.py manifest \
+            --csv Benchmarks/Vectors.csv --env "$benv" --mode execute --primary \
+            --out names.txt
+          echo "executing $(wc -l < names.txt) primary constants:"; cat names.txt
           bash .github/scripts/run.sh . ${{ matrix.bench }} ${{ matrix.backend }} execute \
             names.txt neutral.json
           # Wrap neutral { name: { metric: v } } → Bencher Metric Format.
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index f162e847..9f75a308 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -3,6 +3,7 @@
 #
 #   !benchmark [aiur] [zisk] [sp1 | all]  [execute|prove]
 #   BENCH_ENVS=initStd,mathlib     # which compiled envs (default initStd)
+#   BENCH_FULL=1                   # run the full curated set, not the ~11 primary
 #   BENCH_TIER=cheap|heavy|all     # override the mode default (execute=all, prove=cheap)
 #   BENCH_SHARD=1                  # restrict to the multi-shard target constants
 #   BENCH_GPU=1                    # allow zkVM prove on a self-hosted GPU runner
@@ -49,6 +50,7 @@ jobs:
       mode: ${{ steps.parse.outputs.mode }}
       tier: ${{ steps.parse.outputs.tier }}
       shard: ${{ steps.parse.outputs.shard }}
+      full: ${{ steps.parse.outputs.full }}
       passthrough-env: ${{ steps.parse.outputs.passthrough-env }}
       config-summary: ${{ steps.parse.outputs.config-summary }}
       base-sha: ${{ steps.shas.outputs.base }}
@@ -96,6 +98,7 @@ jobs:
       HEAD_SHA: ${{ needs.setup.outputs.head-sha }}
       TIER: ${{ needs.setup.outputs.tier }}
       SHARD: ${{ needs.setup.outputs.shard }}
+      FULL: ${{ needs.setup.outputs.full }}
     steps:
       # ---------- skipped cell (zkVM prove without a GPU runner) ----------
       - name: Skip note
@@ -126,14 +129,16 @@ jobs:
           PTENV
       # Select the constants for this cell → names.txt; emit count/vhash/tier
       # (vhash is part of the main cache key, so editing Vectors.csv invalidates
-      # stale main results).
+      # stale main results). Defaults to the primary subset; BENCH_FULL=1
+      # (→ full=1) runs the whole curated set.
       - name: Select constants from Benchmarks/Vectors.csv
         if: matrix.cell.skip != 'true'
         id: man
         run: |
+          PRIMARY=--primary; [ "$FULL" = 1 ] && PRIMARY=
           python3 .github/scripts/bench.py manifest \
             --csv Benchmarks/Vectors.csv --env "$BENV" --mode "$MODE" \
-            --tier "$TIER" --shard "$SHARD" --out "$GITHUB_WORKSPACE/names.txt" \
+            --tier "$TIER" --shard "$SHARD" $PRIMARY --out "$GITHUB_WORKSPACE/names.txt" \
             | tee -a "$GITHUB_OUTPUT"
 
       # Lean toolchain + build `ix` and `bench-typecheck` for the PR side (every
diff --git a/Benchmarks/Vectors.csv b/Benchmarks/Vectors.csv
index 43adb520..65fe4dcf 100644
--- a/Benchmarks/Vectors.csv
+++ b/Benchmarks/Vectors.csv
@@ -14,80 +14,85 @@
 #   tier         cheap = prove-feasible per-PR; heavy = execute-only / sharded.
 #                Boundary: Aiur fft >= 1e9 => heavy.
 #   shard_target 1 = heavy constant designated as a multi-shard prove target.
+#   primary      1 = part of the ~11-constant primary subset spanning
+#                shape + the cheap->heavy cost range. Default for the !benchmark
+#                PR comment and the bencher prove / zkVM jobs (full set via
+#                BENCH_FULL=1). Heavy reps run only in execute/native; prove
+#                mode keeps the cheap primaries (tier filter).
 #   aiur_fft     measured Aiur fft-cost (proving-cost proxy). Informational.
 #   zisk_cycles  measured Zisk ziskemu step count; empty if unmeasured. Informational.
 #
 # CI filters on the env column (not line number), so these '#' lines and the
 # header are skipped by: awk -F, '$1!~/^#/ && $1!="name" && $2==env ...'
-name,env,tier,shard_target,aiur_fft,zisk_cycles
-HEq,initStd,cheap,0,1716582,
-Nat,initStd,cheap,0,1857523,975244
-Eq.rec,initStd,cheap,0,2575400,2348520
-HEq.rec,initStd,cheap,0,2692988,2727278
-Trans.mk,initStd,cheap,0,2911629,7229214
-Array.toList,initStd,cheap,0,3332563,2580844
-Acc.rec,initStd,cheap,0,3505064,5105888
-Std.Time.Month.Offset.ofNat,initStd,cheap,0,3607673,1493508
-Sum.elim,initStd,cheap,0,5589905,6618130
-Prod.map,initStd,cheap,0,6904183,8177456
-Option.bind,initStd,cheap,0,7329183,7440608
-Except.bind,initStd,cheap,0,7667869,9427477
-WellFounded.fix,initStd,cheap,0,10125144,13415585
-Nat.add,initStd,cheap,0,13343000,10606339
-List.foldr,initStd,cheap,0,18579757,16707100
-List.dropLast,initStd,cheap,0,19509718,17522863
-List.range,initStd,cheap,0,20251801,13666491
-List.zipWith,initStd,cheap,0,20439088,20229977
-List.filterMap,initStd,cheap,0,25335779,21435279
-List.foldlM,initStd,cheap,0,39202740,
-Int.add,initStd,cheap,0,44714703,27635032
-BitVec.toFin,initStd,cheap,0,50437466,28681028
-Nat.add_comm,initStd,cheap,0,56084908,53239676
-UInt32.toNat,initStd,cheap,0,59331806,29980254
-USize.toNat,initStd,cheap,0,71607481,35811906
-Nat.decEq,initStd,cheap,0,71921625,57411966
-ByteSlice.ofByteArray,initStd,cheap,0,107574377,53555107
-Nat.decLe,initStd,cheap,0,209641496,143391161
-Nat.strongRecOn,initStd,cheap,0,273068854,190849758
-Int.emod,initStd,cheap,0,422940733,269380418
-Int.ediv,initStd,cheap,0,430476738,270987292
-Array.foldlM,initStd,cheap,0,434577494,
-Array.foldl,initStd,cheap,0,449323126,278537034
-Array.filter,initStd,cheap,0,464818232,285847515
-Nat.sub_le_of_le_add,initStd,cheap,0,567575653,373184538
-BitVec.add,initStd,cheap,0,617113462,373772656
-Int.gcd,initStd,cheap,0,657502637,409112011
-Nat.toDigits,initStd,cheap,0,663606297,357145741
-Array.map,initStd,cheap,0,734574964,443199245
-Array.zipWith,initStd,cheap,0,736658636,445195121
-String.Internal.append,initStd,cheap,0,793580333,
-Lean.Name.hash,initStd,cheap,0,861742653,447441591
-BitVec.umod,initStd,cheap,0,926177790,526467117
-Nat.repr,initStd,cheap,0,966452765,498729913
-Int.repr,initStd,cheap,0,993792541,504234535
-String.intercalate,initStd,heavy,0,1089240518,599428829
-_private.Init.Prelude.0.Lean.extractMainModule._unsafe_rec,initStd,heavy,0,1197925029,
-Char.toLower,initStd,heavy,0,1198467414,665920824
-Nat.gcd_comm,initStd,heavy,0,1954958779,1144352360
-Int.emod_emod_of_dvd,initStd,heavy,0,3856852693,2201588182
-Array.append_assoc,initStd,heavy,0,3938574533,1570256148
-Vector.append,initStd,heavy,0,4023268168,1614275115
-Fin.foldl,initStd,heavy,0,10853255199,5110854190
-List.mergeSort,initStd,heavy,1,13825318985,6706906294
-Array.binSearch,initStd,heavy,1,14397133548,6785827470
-Array.qsort,initStd,heavy,0,15781689533,7199288749
-Array.qsortOrd,initStd,heavy,0,15841062472,7206704674
-String.split,initStd,heavy,0,19578088286,8657387499
-Std.Time.Week.Offset.ofMilliseconds,initStd,heavy,0,24577209792,6653972854
-Vector.extract_append,initStd,heavy,1,61830646478,
-Lean.Expr.replace,lean,cheap,0,859625514,
-List.Sorted,mathlib,cheap,0,9578666,
-Nat.choose,mathlib,cheap,0,29018862,
-Nat.factorial,mathlib,cheap,0,33562426,
-Nat.fib,mathlib,cheap,0,34171209,
-GCDMonoid.gcd,mathlib,heavy,0,1005736276,
-Nat.Prime.two_le,mathlib,heavy,0,1504045298,
-Finset.prod,mathlib,heavy,0,3045165822,
-Finset.sum,mathlib,heavy,0,3045189408,
-Polynomial.eval,mathlib,heavy,0,5342731754,
-Multiset.sort,mathlib,heavy,1,18670960624,
+name,env,tier,shard_target,primary,aiur_fft,zisk_cycles
+HEq,initStd,cheap,0,0,1716582,
+Nat,initStd,cheap,0,0,1857523,975244
+Eq.rec,initStd,cheap,0,0,2575400,2348520
+HEq.rec,initStd,cheap,0,0,2692988,2727278
+Trans.mk,initStd,cheap,0,0,2911629,7229214
+Array.toList,initStd,cheap,0,0,3332563,2580844
+Acc.rec,initStd,cheap,0,0,3505064,5105888
+Std.Time.Month.Offset.ofNat,initStd,cheap,0,0,3607673,1493508
+Sum.elim,initStd,cheap,0,0,5589905,6618130
+Prod.map,initStd,cheap,0,0,6904183,8177456
+Option.bind,initStd,cheap,0,0,7329183,7440608
+Except.bind,initStd,cheap,0,0,7667869,9427477
+WellFounded.fix,initStd,cheap,0,0,10125144,13415585
+Nat.add,initStd,cheap,0,0,13343000,10606339
+List.foldr,initStd,cheap,0,1,18579757,16707100
+List.dropLast,initStd,cheap,0,0,19509718,17522863
+List.range,initStd,cheap,0,0,20251801,13666491
+List.zipWith,initStd,cheap,0,0,20439088,20229977
+List.filterMap,initStd,cheap,0,0,25335779,21435279
+List.foldlM,initStd,cheap,0,0,39202740,
+Int.add,initStd,cheap,0,0,44714703,27635032
+BitVec.toFin,initStd,cheap,0,0,50437466,28681028
+Nat.add_comm,initStd,cheap,0,1,56084908,53239676
+UInt32.toNat,initStd,cheap,0,0,59331806,29980254
+USize.toNat,initStd,cheap,0,0,71607481,35811906
+Nat.decEq,initStd,cheap,0,0,71921625,57411966
+ByteSlice.ofByteArray,initStd,cheap,0,0,107574377,53555107
+Nat.decLe,initStd,cheap,0,0,209641496,143391161
+Nat.strongRecOn,initStd,cheap,0,0,273068854,190849758
+Int.emod,initStd,cheap,0,0,422940733,269380418
+Int.ediv,initStd,cheap,0,0,430476738,270987292
+Array.foldlM,initStd,cheap,0,0,434577494,
+Array.foldl,initStd,cheap,0,1,449323126,278537034
+Array.filter,initStd,cheap,0,0,464818232,285847515
+Nat.sub_le_of_le_add,initStd,cheap,0,0,567575653,373184538
+BitVec.add,initStd,cheap,0,0,617113462,373772656
+Int.gcd,initStd,cheap,0,1,657502637,409112011
+Nat.toDigits,initStd,cheap,0,0,663606297,357145741
+Array.map,initStd,cheap,0,0,734574964,443199245
+Array.zipWith,initStd,cheap,0,0,736658636,445195121
+String.Internal.append,initStd,cheap,0,1,793580333,
+Lean.Name.hash,initStd,cheap,0,0,861742653,447441591
+BitVec.umod,initStd,cheap,0,0,926177790,526467117
+Nat.repr,initStd,cheap,0,0,966452765,498729913
+Int.repr,initStd,cheap,0,0,993792541,504234535
+String.intercalate,initStd,heavy,0,0,1089240518,599428829
+_private.Init.Prelude.0.Lean.extractMainModule._unsafe_rec,initStd,heavy,0,0,1197925029,
+Char.toLower,initStd,heavy,0,0,1198467414,665920824
+Nat.gcd_comm,initStd,heavy,0,1,1954958779,1144352360
+Int.emod_emod_of_dvd,initStd,heavy,0,0,3856852693,2201588182
+Array.append_assoc,initStd,heavy,0,0,3938574533,1570256148
+Vector.append,initStd,heavy,0,0,4023268168,1614275115
+Fin.foldl,initStd,heavy,0,0,10853255199,5110854190
+List.mergeSort,initStd,heavy,1,1,13825318985,6706906294
+Array.binSearch,initStd,heavy,1,0,14397133548,6785827470
+Array.qsort,initStd,heavy,0,0,15781689533,7199288749
+Array.qsortOrd,initStd,heavy,0,0,15841062472,7206704674
+String.split,initStd,heavy,0,1,19578088286,8657387499
+Std.Time.Week.Offset.ofMilliseconds,initStd,heavy,0,0,24577209792,6653972854
+Vector.extract_append,initStd,heavy,1,1,61830646478,
+Lean.Expr.replace,lean,cheap,0,0,859625514,
+List.Sorted,mathlib,cheap,0,0,9578666,
+Nat.choose,mathlib,cheap,0,0,29018862,
+Nat.factorial,mathlib,cheap,0,1,33562426,
+Nat.fib,mathlib,cheap,0,0,34171209,
+GCDMonoid.gcd,mathlib,heavy,0,0,1005736276,
+Nat.Prime.two_le,mathlib,heavy,0,0,1504045298,
+Finset.prod,mathlib,heavy,0,0,3045165822,
+Finset.sum,mathlib,heavy,0,0,3045189408,
+Polynomial.eval,mathlib,heavy,0,0,5342731754,
+Multiset.sort,mathlib,heavy,1,1,18670960624,

From efd91aa1dbf50f39a48770ed92c48574a139b6b0 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Tue, 30 Jun 2026 20:53:00 +0000
Subject: [PATCH 05/27] feat(ci): native backend on !benchmark + bencher
 parity/cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bring both surfaces (!benchmark PR comment and bencher.dev) to parity across all
four backends and remove the test scaffolding.

- Native kernel on !benchmark: add a `native` backend (whole-env `ix check
  --anon`, the parallel out-of-circuit kernel) — bench.py backend/runner/metrics,
  a run.sh `native` branch, and a GNU-time install for native PR cells. `all`
  now fans out aiur/zisk/sp1/native.
- run.sh: single-source the Aiur path through a per-constant bench-typecheck loop
  (per-constant peak-rss), add a per-constant `timeout` to the zkVM execute path
  (heavy primaries can't hang a job), and accept the env arg case-insensitively
  (bencher reuses the cached InitStd.ixe; bench-pr compiles initStd.ixe).
- bench-main.yml: the prove and native-check jobs now drive run.sh too (dedup
  with the PR path); zkvm-execute gains the Mathlib env to match prove/native.
- bencher-thresholds-reset.yml: register the zisk / sp1 / native-check workloads.
- bench-pr.yml: drop the temporary pull_request trigger (keep the harmless
  dual-SHA / number fallbacks).
- docs/benchmarking.md: document the two surfaces, backends, Vectors.csv, the
  !benchmark grammar, and the bencher workloads / threshold resets.
---
 .github/scripts/bench.py                      |  7 +-
 .github/scripts/run.sh                        | 99 +++++++++++++------
 .github/workflows/bench-main.yml              | 68 ++++---------
 .github/workflows/bench-pr.yml                | 27 +++--
 .../workflows/bencher-thresholds-reset.yml    | 13 +--
 docs/benchmarking.md                          | 70 +++++++++++++
 6 files changed, 183 insertions(+), 101 deletions(-)
 create mode 100644 docs/benchmarking.md

diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index 4622b9f4..018a1b89 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -18,7 +18,7 @@
 
 
 # ───────────────────────── parse ─────────────────────────
-BACKENDS = ("aiur", "zisk", "sp1")
+BACKENDS = ("aiur", "zisk", "sp1", "native")
 MODES = ("execute", "prove")
 ENVS = ("initStd", "lean", "mathlib")
 CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_GPU", "BENCH_FULL"}
@@ -29,6 +29,8 @@ def runner_for(backend, mode, gpu):
     """(runs-on label, skip?) for a cell."""
     if backend == "aiur":
         return "warp-ubuntu-latest-x64-32x", False
+    if backend == "native":   # whole-env parallel check; no proving, never skips
+        return "warp-ubuntu-latest-x64-32x", False
     if mode == "execute":
         return "warp-ubuntu-latest-x64-16x", False
     if gpu:                       # zkVM prove needs a GPU
@@ -132,6 +134,9 @@ def cmd_manifest(a):
     ("zisk", "execute"): ["cycles", "execute-time", "throughput", "peak-rss"],
     ("sp1", "execute"): ["cycles", "execute-time", "throughput", "peak-rss"],
     ("zisk", "prove"): ["prove-time", "steps"], ("sp1", "prove"): ["prove-time", "steps"],
+    # native is whole-env (one row per env); mode is ignored (it never proves).
+    ("native", "execute"): ["throughput", "check-time", "peak-rss"],
+    ("native", "prove"): ["throughput", "check-time", "peak-rss"],
 }
 
 
diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
index 045a1f4f..165cbc69 100644
--- a/.github/scripts/run.sh
+++ b/.github/scripts/run.sh
@@ -1,19 +1,20 @@
 #!/usr/bin/env bash
-# Compile one library env to a `.ixe` from a checked-out repo, then benchmark the
-# given backend over a manifest, emitting the neutral results JSON
+# Compile one library env to a `.ixe` from a checked-out repo (unless REUSE_IXE),
+# then benchmark the given backend, emitting the neutral results JSON
 #   { "<name>": { "<metric>": <number>, ... }, ... }
-# that bench.py compare consumes.
+# that bench.py compare / the bencher jobs consume.
 #
 #   run.sh <repo_dir> <env> <backend> <mode> <names_file> <out_json>
 #     repo_dir : checked-out worktree (has .lake/build/bin/{ix,bench-typecheck})
 #     env      : initStd | lean | mathlib
-#     backend  : aiur | zisk | sp1
+#     backend  : aiur | zisk | sp1 | native
 #     mode     : execute | prove
 #
-# `ix` / `bench-typecheck` are taken from <repo_dir> (so base measures base's
-# code, PR the PR's — the caller puts <repo_dir>/.lake/build/bin on PATH). The
-# zkVM hosts run from <repo_dir>/<backend>. Per-constant subprocesses for the
-# zkVM hosts so one OOM/timeout drops only that row.
+# `ix` / `bench-typecheck` come from <repo_dir> (so base measures base's code, PR
+# the PR's — the caller puts <repo_dir>/.lake/build/bin on PATH). aiur / zisk /
+# sp1 run one subprocess per constant so a failure/timeout drops only that row;
+# native is whole-env (`ix check --anon`, the parallel out-of-circuit kernel) and
+# ignores <names_file>, keyed by the env.
 set -uo pipefail
 
 repo=${1:?repo_dir}; benv=${2:?env}; backend=${3:?backend}; mode=${4:?mode}
@@ -24,8 +25,11 @@ repo=$(cd "$repo" && pwd)
 : > "$out"
 emit_empty() { [ -s "$out" ] || echo '{}' > "$out"; }
 
-case "$benv" in
-  initStd) module=CompileInitStd ;;
+# `$benv` is used verbatim for the `.ixe` filename (bench-pr compiles `initStd.ixe`;
+# the bencher jobs reuse the compile job's cached `InitStd.ixe`), and lowercased
+# only to pick the Compile module.
+case "$(printf '%s' "$benv" | tr '[:upper:]' '[:lower:]')" in
+  initstd) module=CompileInitStd ;;
   lean)    module=CompileLean ;;
   mathlib) module=CompileMathlib ;;
   *) echo "unknown env: $benv" >&2; exit 2 ;;
@@ -40,32 +44,42 @@ else
   echo "::endgroup::"
 fi
 
+tmp=$(mktemp -d)
+
 case "$backend" in
   aiur)
-    # bench-typecheck runs Phase 1 (execute) always; Phase 2 (prove) unless
-    # --execute-only. It writes the neutral JSON itself via --json.
-    args=(--ixe "$ixe" --manifest "$names" --json "$out")
-    if [ "$mode" = execute ]; then
-      bench-typecheck "${args[@]}" --execute-only || true
-    else
-      bench-typecheck "${args[@]}" --texray 2> tx.log || true
-      # Fold texray's proving RSS high-water mark into every entry (max over
-      # spans; $2+0 stops at the first non-digit) — same parse as aiur-bench.yml.
-      rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>m {m=$2+0} END {if (m>0) print m}' tx.log)
-      if [ -n "${rss:-}" ] && [ -s "$out" ]; then
-        jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' "$out" > "$out.t" \
-          && mv "$out.t" "$out" || true
+    # One bench-typecheck subprocess per constant (isolation + per-constant peak-rss).
+    # Phase 1 (execute) always runs; Phase 2 (prove) unless --execute-only. Only the
+    # per-constant JSON may reach jq (fd 3); warnings/tool stdout go to stderr.
+    while IFS= read -r c; do
+      [ -z "$c" ] && continue
+      slug=$(printf '%s' "$c" | tr '/ .:' '____')
+      res="$tmp/$slug.json"
+      if [ "$mode" = execute ]; then
+        bench-typecheck --ixe "$ixe" "$c" --json "$res" --execute-only \
+          || { echo "::warning::aiur execute '$c' failed; dropping"; continue; }
+      else
+        bench-typecheck --ixe "$ixe" "$c" --json "$res" --texray 2> "$tmp/$slug.tx" \
+          || { echo "::warning::aiur prove '$c' failed (OOM/timeout); dropping"; continue; }
+        # Fold texray's proving RSS high-water mark (max over spans; awk's $2+0
+        # stops at the first non-digit) into this constant's entry.
+        rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>m {m=$2+0} END {if (m>0) print m}' "$tmp/$slug.tx")
+        if [ -n "${rss:-}" ] && [ -s "$res" ]; then
+          jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' "$res" > "$res.t" \
+            && mv "$res.t" "$res" || true
+        fi
       fi
-    fi
+      [ -s "$res" ] && cat "$res" >&3
+    done < "$names" 3>&1 >&2 | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null
     emit_empty
     ;;
 
   zisk|sp1)
     host="${backend}-host"; work="$repo/$backend"
-    # Build the host once so per-constant timing excludes compilation, and run the
-    # binary directly under `/usr/bin/time` — a `timeout`/`cargo run` wrapper would
-    # report the wrapper's RSS, not the host's. (No per-constant timeout in execute
-    # mode; the job-level timeout bounds a hang.)
+    # Build the host once so per-constant timing excludes compilation. Order is
+    # `timeout … /usr/bin/time … host`: timeout bounds a runaway constant while
+    # /usr/bin/time still measures the host directly (its child), so RSS/elapsed
+    # are the host's, not a wrapper's.
     echo "::group::cargo build $host"
     ( cd "$work" && cargo build --quiet --release --bin "$host" )
     echo "::endgroup::"
@@ -76,16 +90,14 @@ case "$backend" in
     # in-session as root; the host children inherit it. Without this the ASM
     # services die with `mmap(rom) errno=11`. SP1 needs no such raise.
     [ "$backend" = zisk ] && sudo prlimit --pid $$ --memlock=unlimited:unlimited
-    tmp=$(mktemp -d)
     while IFS= read -r c; do
       [ -z "$c" ] && continue
       slug=$(printf '%s' "$c" | tr '/ .:' '____')
       log="$tmp/$slug.out"; tmf="$tmp/$slug.time"
       if [ "$mode" = execute ]; then
-        # `/usr/bin/time -f '%e %M'` → elapsed seconds + max RSS (kB).
-        ( cd "$work" && /usr/bin/time -f '%e %M' -o "$tmf" \
+        ( cd "$work" && timeout 25m /usr/bin/time -f '%e %M' -o "$tmf" \
             "$bin" --execute --ixe "$ixe" --constant "$c" --skip-deps ) > "$log" 2>>"$log" \
-          || { echo "::warning::$backend execute '$c' failed; dropping"; continue; }
+          || { echo "::warning::$backend execute '$c' failed/timed out; dropping"; continue; }
         fail=$(grep -oE 'failures[:=] ?[0-9]+' "$log" | head -1 | grep -oE '[0-9]+')
         if [ "${fail:-1}" != 0 ]; then
           echo "::warning::$backend '$c': nonzero/missing failures; dropping"; continue
@@ -130,6 +142,29 @@ case "$backend" in
     emit_empty
     ;;
 
+  native)
+    # Native out-of-circuit Rust kernel, whole env across available_parallelism
+    # workers — far faster than proving; ignores <names_file>. ix check is a
+    # single multi-threaded process so /usr/bin/time -f '%M' is the true peak RSS.
+    # `##check## <elapsed_ms> <passed> <failures> <total>` is the machine line.
+    log="$tmp/native.out"; tmf="$tmp/native.time"
+    /usr/bin/time -f '%e %M' -o "$tmf" ix check "$ixe" --anon > "$log" 2>>"$log" \
+      || { echo "::warning::native check failed"; emit_empty; exit 0; }
+    line=$(grep '^##check##' "$log" | tail -1)
+    elapsed_ms=$(echo "$line" | awk '{print $2}')
+    failures=$(echo "$line" | awk '{print $4}'); total=$(echo "$line" | awk '{print $5}')
+    if [ -z "${total:-}" ] || [ "${failures:-1}" != 0 ]; then
+      echo "::warning::native check: nonzero/missing failures or no ##check## line"; emit_empty; exit 0
+    fi
+    rssk=$(awk 'NR==1{print $2}' "$tmf"); rss=$(( ${rssk:-0} * 1024 ))
+    check_s=$(awk -v e="$elapsed_ms" 'BEGIN{printf "%.3f", e/1000}')
+    tput=$(awk -v t="$total" -v e="$elapsed_ms" 'BEGIN{ if (e>0) printf "%.2f", t*1000/e; else print 0 }')
+    jq -n --arg n "$benv" --argjson c "$total" --argjson s "$check_s" \
+          --argjson tp "$tput" --argjson rss "$rss" \
+      '{($n): {constants:$c, "check-time":$s, throughput:$tp, "peak-rss":$rss}}' > "$out"
+    emit_empty
+    ;;
+
   *) echo "unknown backend: $backend" >&2; exit 2 ;;
 esac
 echo "rows in $out: $(jq 'length' "$out" 2>/dev/null || echo '?')"
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index 5ddda359..ff6c8c4e 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -214,37 +214,20 @@ jobs:
       # fold the max over spans in as `peak-rss` (bytes), the proving RSS
       # high-water mark.
       - name: Run Aiur typecheck benchmark
+        env:
+          REUSE_IXE: "1"
         run: |
-          measure() {
-            local c="$1" rss
-            timeout 20m bench-typecheck --ixe ${{ matrix.bench }}.ixe "$c" \
-              --json "res-$c.json" --texray 2>"tx-$c.log" \
-              || echo "warning: $c failed (OOM/timeout); dropping it from this report"
-            rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>max {max=$2+0} END {if (max>0) print max}' "tx-$c.log")
-            if [ -f "res-$c.json" ] && [ -n "$rss" ] && [ "$rss" -gt 0 ]; then
-              jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' \
-                "res-$c.json" > "res-$c.json.tmp" && mv "res-$c.json.tmp" "res-$c.json" || true
-            fi
-          }
           # The primary subset's prove tier (cheap primaries) — heavy primaries are
-          # execute-only (a single-shard prove would OOM the runner).
+          # execute-only (a single-shard prove would OOM the runner). run.sh drives
+          # bench-typecheck per constant (per-constant peak-rss), same as the
+          # !benchmark PR path.
           benv="${{ matrix.bench }}"; benv="${benv,}"   # InitStd→initStd, Mathlib→mathlib
           python3 .github/scripts/bench.py manifest \
-            --csv Benchmarks/Vectors.csv --env "$benv" --mode prove --primary \
-            --out names.txt
+            --csv Benchmarks/Vectors.csv --env "$benv" --mode prove --primary --out names.txt
           echo "proving $(wc -l < names.txt) primary constants:"; cat names.txt
-          while IFS= read -r c; do [ -n "$c" ] && measure "$c"; done < names.txt
-          # Merge the per-constant results; if none produced anything, emit `{}`.
-          jq -s 'reduce .[] as $o ({}; . + $o)' res-*.json > results.json 2>/dev/null \
-            || echo '{}' > results.json
-          [ -s results.json ] || echo '{}' > results.json
-          # Wrap each metric value as { "value": v } for Bencher Metric Format.
-          # bench-typecheck already emits slug keys (constants, fft-cost,
-          # execute-time, prove-time, throughput = constants/prove-time); peak-rss
-          # is injected above.
-          jq '
-            map_values(to_entries | map({(.key): {value: .value}}) | add)
-          ' results.json > aiur.json
+          bash .github/scripts/run.sh . "${{ matrix.bench }}" aiur prove names.txt neutral.json
+          # Wrap neutral { name: { metric: v } } → Bencher Metric Format.
+          jq 'map_values(map_values({value: .}))' neutral.json > aiur.json
           cat aiur.json
       # Upload Aiur metrics. Every measure shares the per-workload baseline
       # window (data points since the aiur reset tag). constants is deterministic
@@ -294,13 +277,12 @@ jobs:
       fail-fast: false
       matrix:
         backend: [zisk, sp1]
-        include:
-          - bench: InitStd
+        bench: [InitStd, Mathlib]
     steps:
       - uses: actions/checkout@v6
         with:
           fetch-depth: 0
-          fetch-tags: true   # bencher-track reads the baseline-reset tag
+          fetch-tags: true   # bencher-track reads the bencher-thresholds-reset tag
       - uses: actions-rust-lang/setup-rust-toolchain@v1
         with:
           cache-workspaces: ${{ matrix.backend }}
@@ -379,7 +361,7 @@ jobs:
       - uses: actions/checkout@v6
         with:
           fetch-depth: 0
-          fetch-tags: true   # bencher-track reads the baseline-reset tag
+          fetch-tags: true   # bencher-track reads the bencher-thresholds-reset tag
       - uses: actions/cache/restore@v5
         with:
           path: ~/.local/bin
@@ -400,26 +382,16 @@ jobs:
           path: ${{ matrix.bench }}.ixe
           key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
           fail-on-cache-miss: true
-      # `ix check --anon` checks every kernel-checkable address across
-      # available_parallelism workers and prints a machine-readable
-      # `##check## <elapsed_ms> <passed> <failures> <total>` line. ix check is a
-      # single multi-threaded process, so `/usr/bin/time -f '%M'` is the true peak RSS.
+      # run.sh native runs `ix check --anon` (whole env, parallel) and emits the
+      # neutral { <env>: { constants, check-time, throughput, peak-rss } } — same
+      # path as the !benchmark native backend.
       - name: Run native kernel check
+        env:
+          REUSE_IXE: "1"
         run: |
-          /usr/bin/time -f '%e %M' -o time.txt \
-            ix check ${{ matrix.bench }}.ixe --anon 2>&1 | tee out.txt
-          line=$(grep '^##check##' out.txt | tail -1)
-          elapsed_ms=$(echo "$line" | awk '{print $2}')
-          failures=$(echo "$line" | awk '{print $4}')
-          total=$(echo "$line" | awk '{print $5}')
-          [ "${failures:-1}" = 0 ] || { echo "kernel check reported ${failures:-?} failure(s)"; exit 1; }
-          rssk=$(awk 'NR==1{print $2}' time.txt); rss=$(( ${rssk:-0} * 1024 ))
-          check_s=$(awk -v e="$elapsed_ms" 'BEGIN{printf "%.3f", e/1000}')
-          tput=$(awk -v t="$total" -v e="$elapsed_ms" 'BEGIN{ if (e>0) printf "%.2f", t*1000/e; else print 0 }')
-          jq -n --arg n "${{ matrix.bench }}" --argjson c "$total" --argjson s "$check_s" \
-                --argjson tp "$tput" --argjson rss "$rss" \
-            '{($n): {constants:{value:$c}, "check-time":{value:$s}, throughput:{value:$tp}, "peak-rss":{value:$rss}}}' \
-            > bench.json
+          bash .github/scripts/run.sh . "${{ matrix.bench }}" native execute \
+            /dev/null neutral.json
+          jq 'map_values(map_values({value: .}))' neutral.json > bench.json
           cat bench.json
       # constants is deterministic → pinned (0/0); check-time / throughput /
       # peak-rss are noisy parallel wall-clock → percentage bounds.
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index 9f75a308..682e063f 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -1,7 +1,7 @@
 # `!benchmark` PR command: run the curated constant set (Benchmarks/Vectors.csv)
 # through chosen prover backend(s) and post a main-vs-PR comparison table.
 #
-#   !benchmark [aiur] [zisk] [sp1 | all]  [execute|prove]
+#   !benchmark [aiur] [zisk] [sp1] [native | all]  [execute|prove]
 #   BENCH_ENVS=initStd,mathlib     # which compiled envs (default initStd)
 #   BENCH_FULL=1                   # run the full curated set, not the ~11 primary
 #   BENCH_TIER=cheap|heavy|all     # override the mode default (execute=all, prove=cheap)
@@ -18,12 +18,6 @@ name: Benchmark pull requests
 on:
   issue_comment:
     types: [created]
-  # TEST: temporary trigger so this workflow can be exercised from a PR branch.
-  # issue_comment only fires once the workflow is on the default branch, so a
-  # `!benchmark` comment can't test it pre-merge. On pull_request there is no
-  # comment, so the run uses the parser defaults (aiur / initStd / execute).
-  # Remove this trigger before merge.
-  pull_request:
 
 permissions:
   contents: read
@@ -38,12 +32,11 @@ jobs:
   setup:
     name: Parse !benchmark comment
     if: >-
-      github.event_name == 'pull_request' ||
-      (github.event.issue.pull_request &&
-       github.event.issue.state == 'open' &&
-       contains(github.event.comment.body, '!benchmark') &&
-       (github.event.comment.author_association == 'MEMBER' ||
-        github.event.comment.author_association == 'OWNER'))
+      github.event.issue.pull_request &&
+      github.event.issue.state == 'open' &&
+      contains(github.event.comment.body, '!benchmark') &&
+      (github.event.comment.author_association == 'MEMBER' ||
+       github.event.comment.author_association == 'OWNER')
     runs-on: ubuntu-latest
     outputs:
       matrix: ${{ steps.parse.outputs.matrix }}
@@ -157,7 +150,7 @@ jobs:
       # zkVM cells additionally need the Rust toolchain + the backend's toolchain
       # and system deps (the shared composite install actions).
       - name: Set up zkVM Rust toolchain
-        if: matrix.cell.skip != 'true' && matrix.cell.backend != 'aiur'
+        if: matrix.cell.skip != 'true' && (matrix.cell.backend == 'zisk' || matrix.cell.backend == 'sp1')
         uses: actions-rust-lang/setup-rust-toolchain@v1
         with:
           cache-workspaces: ${{ matrix.cell.backend }}
@@ -167,6 +160,12 @@ jobs:
       - name: Install Zisk
         if: matrix.cell.skip != 'true' && matrix.cell.backend == 'zisk'
         uses: ./.github/actions/install-zisk
+      # native = whole-env `ix check --anon`; needs only GNU time (ix is built above).
+      - name: Install GNU time
+        if: matrix.cell.skip != 'true' && matrix.cell.backend == 'native'
+        run: |
+          sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true
+          sudo apt-get update && sudo apt-get install -y time
 
       # ---------- main side (cached by base SHA) ----------
       - name: Restore cached main results
diff --git a/.github/workflows/bencher-thresholds-reset.yml b/.github/workflows/bencher-thresholds-reset.yml
index 37a0213c..d3c3eef5 100644
--- a/.github/workflows/bencher-thresholds-reset.yml
+++ b/.github/workflows/bencher-thresholds-reset.yml
@@ -17,8 +17,9 @@ name: Bencher thresholds reset
 #     `bencher-thresholds-reset:<token>` label on the PR, whatever added it — so a
 #     Triage+ collaborator can queue a reset by applying the label directly, and
 #     cancel by removing it before merge. Naming convention: one label per token,
-#     `bencher-thresholds-reset:<token>` where <token> is `ix-compile`, `aiur`, or
-#     `all` (the merge step expands an `all` label into every workload). Labeling
+#     `bencher-thresholds-reset:<token>` where <token> is `ix-compile`, `aiur`,
+#     `zisk`, `sp1`, `native-check`, or `all` (the merge step expands an `all`
+#     label into every workload). Labeling
 #     requires Triage+, so PR authors from forks cannot self-queue a reset. The
 #     label shares the command/workflow name; the git tag it moves is the same
 #     stem with a dash: `bencher-thresholds-reset-<workload>`.
@@ -36,7 +37,7 @@ on:
         description: Workload baseline to reset
         required: true
         type: choice
-        options: [ix-compile, aiur, all]
+        options: [ix-compile, aiur, zisk, sp1, native-check, all]
       sha:
         description: "Commit to anchor to (default: HEAD)"
         required: false
@@ -65,7 +66,7 @@ jobs:
       MERGE_SHA: ${{ github.event.pull_request.merge_commit_sha }}
     steps:
       - run: |
-          valid="ix-compile aiur"
+          valid="ix-compile aiur zisk sp1 native-check"
           if [ "$EVENT" = workflow_dispatch ]; then
             # Reset the chosen workload(s) at the given commit; no PR scan.
             sha="${INPUT_SHA:-$HEAD_SHA}"
@@ -119,7 +120,7 @@ jobs:
       - run: |
           # Accepted command tokens — applied verbatim as labels (incl. `all`,
           # which the merge job expands into every workload).
-          accepted="ix-compile aiur all"
+          accepted="ix-compile aiur zisk sp1 native-check all"
           # Parse the workload token(s) after the command, lowercased.
           workloads=$(printf '%s' "$BODY" \
             | grep -oiE '!bencher-thresholds-reset[[:space:]]+[a-z0-9 -]+' \
@@ -142,5 +143,5 @@ jobs:
               --body "♻️ Baseline reset queued for:$ok — will anchor to the merge commit when this PR merges."
           else
             gh pr comment "$PR" --repo "$REPO" \
-              --body "⚠️ Reset command matched no known workload (expected \`ix-compile\`, \`aiur\`, or \`all\`). Nothing will reset on merge."
+              --body "⚠️ Reset command matched no known workload (expected \`ix-compile\`, \`aiur\`, \`zisk\`, \`sp1\`, \`native-check\`, or \`all\`). Nothing will reset on merge."
           fi
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
new file mode 100644
index 00000000..8677e4a8
--- /dev/null
+++ b/docs/benchmarking.md
@@ -0,0 +1,70 @@
+# Benchmarking
+
+Ix is benchmarked on two surfaces, both driven by one curated constant set and
+the same backend drivers:
+
+- **`!benchmark` PR comment** (`.github/workflows/bench-pr.yml`) — on demand,
+  posts a **main-vs-PR** comparison table on the pull request.
+- **Bencher.dev** (`.github/workflows/bench-main.yml`) — on every push to `main`,
+  tracks each measure over time at <https://bencher.dev> (project `ix`).
+
+## Backends
+
+| backend | what it measures | metrics |
+|---|---|---|
+| `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove) | `fft-cost`, `execute-time`, `prove-time`, `peak-rss`, `constants`, `throughput` |
+| `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `peak-rss` (+ `shards`, `max-shard-cycles` for sharded runs) |
+| `native` | the same kernel run **out-of-circuit and in parallel** (`ix check --anon`, whole env) — far faster | `throughput` (constants/s), `check-time`, `peak-rss`, `constants` |
+
+All four are driven by `.github/scripts/run.sh` (compile the env `.ixe`, run the
+backend, emit a neutral `{ "<name>": { "<metric>": n } }` JSON). The PR workflow
+compares two such JSONs; the bencher workflow wraps one in Bencher Metric Format.
+
+## Constant set — `Benchmarks/Vectors.csv`
+
+One CSV is the single source of truth: `name,env,tier,shard_target,primary,aiur_fft,zisk_cycles`.
+
+- `env` — compile target the constant resolves in (`initStd` / `lean` / `mathlib`).
+- `tier` — `cheap` (prove-feasible on a CI runner) or `heavy` (execute-only; a
+  single-shard prove would OOM).
+- `primary` — the **~11-constant default subset**, spanning shape and the
+  cheap→heavy cost range (3 are shard targets). Everything defaults to this;
+  the full ~60-constant set is opt-in.
+
+`bench.py manifest` selects names by env + mode (`prove`→cheap, `execute`→all) +
+`--primary`. `bench.py compare` renders the PR table.
+
+## `!benchmark` grammar
+
+Maintainer comment on a PR:
+
+```
+!benchmark [aiur] [zisk] [sp1] [native | all]  [execute|prove]
+BENCH_ENVS=initStd,mathlib     # which compiled envs (default initStd)
+BENCH_FULL=1                   # run the full curated set, not the ~11 primary
+BENCH_TIER=cheap|heavy|all     # override the mode default (execute=all, prove=cheap)
+BENCH_SHARD=1                  # restrict to the multi-shard target constants
+BENCH_GPU=1                    # allow zkVM prove on a self-hosted GPU runner
+RUST_LOG=info                  # passthrough env (allowlisted)
+```
+
+Defaults: `aiur`, `execute`, `initStd`, primary subset. Backends fan out as a
+matrix; `main` results are cached by base SHA. zkVM `prove` is skipped with a
+note unless a GPU runner is selected.
+
+## Bencher jobs (`bench-main.yml`)
+
+`build → compile → { prove, zkvm-execute, native-check }`, each reporting to its
+own testbed + **workload** (`aiur`, `zisk`, `sp1`, `native-check`, `ix-compile`).
+Deterministic measures (cycles, fft-cost, constants, …) are pinned exactly;
+noisy wall-clock measures (time, RAM, throughput) ride percentage bounds, both
+windowed to the per-workload `bencher-thresholds-reset-<workload>` tag.
+
+To re-baseline a workload after an intended step change, comment
+`!bencher-thresholds-reset <workload|all>` on the merging PR, or run the
+`bencher-thresholds-reset` workflow (`.github/workflows/bencher-thresholds-reset.yml`).
+
+## Not yet covered
+
+- **zkVM proving** (Zisk/SP1 `prove`) needs a self-hosted GPU runner; on CPU
+  runners it is execute-only.

From 0dbb0afbdf042cfeacbbd7153a727687cfe931ed Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Tue, 30 Jun 2026 21:42:45 +0000
Subject: [PATCH 06/27] feat(ci): Aiur proves all primaries that fit 128GB;
 native checks primaries too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Aiur prove now covers the whole primary subset: bench.py exempts --primary from
  the prove cheap-tier cap, and run.sh proves each constant whose Aiur fft-cost
  fits the prover RAM ceiling (AIUR_PROVE_MAX_FFT, ~128 GB at 2.34 GB per billion
  fft) and runs the rest in execute-only mode — so heavy primaries (only
  Vector.extract_append, at ~145 GB) still report execute metrics instead of
  being dropped. BENCH_FULL prove stays capped at the cheap tier so it doesn't
  balloon.
- native now reports two views per env: the whole env (`ix check --anon`, keyed by
  env) and a per-primary subject check (`ix check --consts`, keyed by constant) —
  apples-to-apples with the zkVM --skip-deps execute. Wired into both the
  native-check bencher job and the !benchmark native backend.
- run.sh: fix a stream-corruption bug — tool stdout and ::warning::/::notice:: now
  go to logs/stderr so only JSON reaches the per-constant `jq -s` merge.
---
 .github/scripts/bench.py         |   5 +-
 .github/scripts/run.sh           | 105 +++++++++++++++++++------------
 .github/workflows/bench-main.yml |  15 +++--
 docs/benchmarking.md             |   9 ++-
 4 files changed, 86 insertions(+), 48 deletions(-)

diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index 018a1b89..746b9ac7 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -100,7 +100,10 @@ def cmd_parse(_a):
 
 # ──────────────────────── manifest ────────────────────────
 def cmd_manifest(a):
-    tier = a.tier or ("cheap" if a.mode == "prove" else "all")
+    # prove defaults to the cheap tier to keep the full set bounded; the curated
+    # primary subset is exempt — run.sh proves each primary that fits the Aiur RAM
+    # ceiling and execute-only's the rest, so all primaries are selected here.
+    tier = a.tier or ("cheap" if (a.mode == "prove" and not a.primary) else "all")
     names = []
     with open(a.csv) as f:
         for line in f:
diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
index 165cbc69..2f31199d 100644
--- a/.github/scripts/run.sh
+++ b/.github/scripts/run.sh
@@ -6,15 +6,15 @@
 #
 #   run.sh <repo_dir> <env> <backend> <mode> <names_file> <out_json>
 #     repo_dir : checked-out worktree (has .lake/build/bin/{ix,bench-typecheck})
-#     env      : initStd | lean | mathlib
+#     env      : initStd | lean | mathlib  (any case; used verbatim for <env>.ixe)
 #     backend  : aiur | zisk | sp1 | native
 #     mode     : execute | prove
 #
 # `ix` / `bench-typecheck` come from <repo_dir> (so base measures base's code, PR
-# the PR's — the caller puts <repo_dir>/.lake/build/bin on PATH). aiur / zisk /
-# sp1 run one subprocess per constant so a failure/timeout drops only that row;
-# native is whole-env (`ix check --anon`, the parallel out-of-circuit kernel) and
-# ignores <names_file>, keyed by the env.
+# the PR's — the caller puts <repo_dir>/.lake/build/bin on PATH). Each constant
+# is its own subprocess (a failure/timeout drops only that row). Only JSON is
+# written to stdout — tool output and `::warning::`/`::notice::` go to logs /
+# stderr so they never corrupt the merged JSON stream.
 set -uo pipefail
 
 repo=${1:?repo_dir}; benv=${2:?env}; backend=${3:?backend}; mode=${4:?mode}
@@ -37,7 +37,7 @@ esac
 
 ixe="$repo/$benv.ixe"
 if [ "${REUSE_IXE:-0}" = 1 ] && [ -f "$ixe" ]; then
-  echo "reusing existing $ixe (REUSE_IXE)"
+  echo "reusing existing $ixe (REUSE_IXE)" >&2
 else
   echo "::group::ix compile $module → $benv.ixe ($backend/$mode)"
   "$repo/.lake/build/bin/ix" compile "$repo/Benchmarks/Compile/$module.lean" --out "$ixe"
@@ -48,19 +48,30 @@ tmp=$(mktemp -d)
 
 case "$backend" in
   aiur)
-    # One bench-typecheck subprocess per constant (isolation + per-constant
-    # peak-rss). Phase 1 (execute) always runs; Phase 2 (prove) unless
-    # --execute-only. bench-typecheck writes the neutral per-constant JSON.
+    # One bench-typecheck per constant (isolation + per-constant peak-rss).
+    # Execute mode → Phase 1 only (--execute-only). Prove mode → prove each
+    # constant whose Aiur fft-cost fits the prover RAM ceiling (~128 GB at
+    # 2.34 GB per billion fft), else fall back to execute-only so a too-large
+    # single-shard prove never OOM-kills the job.
+    csv="$repo/Benchmarks/Vectors.csv"
+    ceil=${AIUR_PROVE_MAX_FFT:-50000000000}
     while IFS= read -r c; do
       [ -z "$c" ] && continue
       slug=$(printf '%s' "$c" | tr '/ .:' '____')
       res="$tmp/$slug.json"
-      if [ "$mode" = execute ]; then
-        bench-typecheck --ixe "$ixe" "$c" --json "$res" --execute-only \
-          || { echo "::warning::aiur execute '$c' failed; dropping"; continue; }
-      else
-        bench-typecheck --ixe "$ixe" "$c" --json "$res" --texray 2> "$tmp/$slug.tx" \
-          || { echo "::warning::aiur prove '$c' failed (OOM/timeout); dropping"; continue; }
+      do_prove=0
+      if [ "$mode" = prove ]; then
+        fft=$(awk -F, -v n="$c" '$1==n {print $6}' "$csv" 2>/dev/null)
+        if [ -n "${fft:-}" ] && [ "$fft" -le "$ceil" ]; then
+          do_prove=1
+        else
+          echo "::notice::aiur: '$c' fft=${fft:-?} exceeds $ceil (~128 GB); execute-only" >&2
+        fi
+      fi
+      if [ "$do_prove" = 1 ]; then
+        bench-typecheck --ixe "$ixe" "$c" --json "$res" --texray \
+          > "$tmp/$slug.log" 2> "$tmp/$slug.tx" \
+          || { echo "::warning::aiur prove '$c' failed (OOM/timeout); dropping" >&2; continue; }
         # Fold texray's proving RSS high-water mark (max over spans; awk's $2+0
         # stops at the first non-digit) into this constant's entry.
         rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>m {m=$2+0} END {if (m>0) print m}' "$tmp/$slug.tx")
@@ -68,6 +79,9 @@ case "$backend" in
           jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' "$res" > "$res.t" \
             && mv "$res.t" "$res" || true
         fi
+      else
+        bench-typecheck --ixe "$ixe" "$c" --json "$res" --execute-only > "$tmp/$slug.log" 2>&1 \
+          || { echo "::warning::aiur execute '$c' failed; dropping" >&2; continue; }
       fi
       [ -s "$res" ] && cat "$res"
     done < "$names" | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null
@@ -97,15 +111,15 @@ case "$backend" in
       if [ "$mode" = execute ]; then
         ( cd "$work" && timeout 25m /usr/bin/time -f '%e %M' -o "$tmf" \
             "$bin" --execute --ixe "$ixe" --constant "$c" --skip-deps ) > "$log" 2>>"$log" \
-          || { echo "::warning::$backend execute '$c' failed/timed out; dropping"; continue; }
+          || { echo "::warning::$backend execute '$c' failed/timed out; dropping" >&2; continue; }
         fail=$(grep -oE 'failures[:=] ?[0-9]+' "$log" | head -1 | grep -oE '[0-9]+')
         if [ "${fail:-1}" != 0 ]; then
-          echo "::warning::$backend '$c': nonzero/missing failures; dropping"; continue
+          echo "::warning::$backend '$c': nonzero/missing failures; dropping" >&2; continue
         fi
         # Total cycles: sharded prints "total cycles: N", single prints "cycles: N".
         cyc=$(grep -oE 'total cycles: [0-9]+' "$log" | tail -1 | grep -oE '[0-9]+')
         [ -z "$cyc" ] && cyc=$(grep -oE 'cycles: [0-9]+' "$log" | tail -1 | grep -oE '[0-9]+')
-        [ -z "$cyc" ] && { echo "::warning::$backend '$c': no cycle count; dropping"; continue; }
+        [ -z "$cyc" ] && { echo "::warning::$backend '$c': no cycle count; dropping" >&2; continue; }
         secs=$(awk 'NR==1{print $1}' "$tmf"); rssk=$(awk 'NR==1{print $2}' "$tmf")
         rss=$(( ${rssk:-0} * 1024 ))
         tput=$(awk -v c="$cyc" -v s="${secs:-0}" 'BEGIN{ if (s>0) printf "%.0f", c/s; else print 0 }')
@@ -127,7 +141,7 @@ case "$backend" in
         # prove (single-leaf, GPU runner only — the workflow gates this cell).
         ( cd "$work" && timeout 60m cargo run --quiet --release --bin "$host" -- \
             --gpu --ixe "$ixe" --constant "$c" --skip-deps ) > "$log" 2>&1 \
-          || { echo "::warning::$backend prove '$c' failed/timed out; dropping"; continue; }
+          || { echo "::warning::$backend prove '$c' failed/timed out; dropping" >&2; continue; }
         secs=$(grep -oE 'prove [0-9.]+s'   "$log" | head -1 | grep -oE '[0-9.]+')
         steps=$(grep -oE '\([0-9]+ steps\)' "$log" | head -1 | grep -oE '[0-9]+')
         fail=$(grep -oE 'failures=[0-9]+'  "$log" | head -1 | grep -oE '[0-9]+')
@@ -135,7 +149,7 @@ case "$backend" in
           jq -n --arg n "$c" --argjson t "$secs" --argjson s "${steps:-0}" \
             '{($n): {"prove-time": $t, steps: $s}}'
         else
-          echo "::warning::$backend prove '$c': no clean prove line; dropping"
+          echo "::warning::$backend prove '$c': no clean prove line; dropping" >&2
         fi
       fi
     done < "$names" | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null
@@ -143,28 +157,39 @@ case "$backend" in
     ;;
 
   native)
-    # Native out-of-circuit Rust kernel, whole env across available_parallelism
-    # workers — far faster than proving; ignores <names_file>. ix check is a
-    # single multi-threaded process so /usr/bin/time -f '%M' is the true peak RSS.
-    # `##check## <elapsed_ms> <passed> <failures> <total>` is the machine line.
-    log="$tmp/native.out"; tmf="$tmp/native.time"
-    /usr/bin/time -f '%e %M' -o "$tmf" ix check "$ixe" --anon > "$log" 2>>"$log" \
-      || { echo "::warning::native check failed"; emit_empty; exit 0; }
-    line=$(grep '^##check##' "$log" | tail -1)
-    elapsed_ms=$(echo "$line" | awk '{print $2}')
-    failures=$(echo "$line" | awk '{print $4}'); total=$(echo "$line" | awk '{print $5}')
-    if [ -z "${total:-}" ] || [ "${failures:-1}" != 0 ]; then
-      echo "::warning::native check: nonzero/missing failures or no ##check## line"; emit_empty; exit 0
-    fi
-    rssk=$(awk 'NR==1{print $2}' "$tmf"); rss=$(( ${rssk:-0} * 1024 ))
-    check_s=$(awk -v e="$elapsed_ms" 'BEGIN{printf "%.3f", e/1000}')
-    tput=$(awk -v t="$total" -v e="$elapsed_ms" 'BEGIN{ if (e>0) printf "%.2f", t*1000/e; else print 0 }')
-    jq -n --arg n "$benv" --argjson c "$total" --argjson s "$check_s" \
-          --argjson tp "$tput" --argjson rss "$rss" \
-      '{($n): {constants:$c, "check-time":$s, throughput:$tp, "peak-rss":$rss}}' > "$out"
+    # Native out-of-circuit Rust kernel (far faster than proving). Two views,
+    # both via the `##check## <elapsed_ms> <passed> <failures> <total>` line
+    # (ix check is a single multi-threaded process so /usr/bin/time -f '%M' is the
+    # true peak RSS): the whole env in parallel (`--anon`, keyed by env), and a
+    # per-primary subject check (`--consts`, keyed by constant) for an
+    # apples-to-apples baseline next to the zkVM `--skip-deps` execute.
+    native_one() {  # <label> <ix-check-args…>  → prints one JSON object
+      local label="$1"; shift
+      local log="$tmp/n.out" tmf="$tmp/n.time"
+      /usr/bin/time -f '%e %M' -o "$tmf" ix check "$ixe" "$@" > "$log" 2>>"$log" \
+        || { echo "::warning::native '$label' check failed; dropping" >&2; return; }
+      local line ems fl tot
+      line=$(grep '^##check##' "$log" | tail -1)
+      ems=$(echo "$line" | awk '{print $2}'); fl=$(echo "$line" | awk '{print $4}'); tot=$(echo "$line" | awk '{print $5}')
+      { [ -n "${tot:-}" ] && [ "${fl:-1}" = 0 ]; } \
+        || { echo "::warning::native '$label': bad ##check## / failures; dropping" >&2; return; }
+      local rssk rss cs tp
+      rssk=$(awk 'NR==1{print $2}' "$tmf"); rss=$(( ${rssk:-0} * 1024 ))
+      cs=$(awk -v e="$ems" 'BEGIN{printf "%.3f", e/1000}')
+      tp=$(awk -v t="$tot" -v e="$ems" 'BEGIN{ if (e>0) printf "%.2f", t*1000/e; else print 0 }')
+      jq -n --arg n "$label" --argjson c "$tot" --argjson s "$cs" --argjson tp "$tp" --argjson rss "$rss" \
+        '{($n): {constants:$c, "check-time":$s, throughput:$tp, "peak-rss":$rss}}'
+    }
+    {
+      native_one "$benv" --anon
+      while IFS= read -r c; do
+        [ -z "$c" ] && continue
+        native_one "$c" --consts "$c"
+      done < "$names"
+    } | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null
     emit_empty
     ;;
 
   *) echo "unknown backend: $backend" >&2; exit 2 ;;
 esac
-echo "rows in $out: $(jq 'length' "$out" 2>/dev/null || echo '?')"
+echo "rows in $out: $(jq 'length' "$out" 2>/dev/null || echo '?')" >&2
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index ff6c8c4e..5fb5cfee 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -217,10 +217,9 @@ jobs:
         env:
           REUSE_IXE: "1"
         run: |
-          # The primary subset's prove tier (cheap primaries) — heavy primaries are
-          # execute-only (a single-shard prove would OOM the runner). run.sh drives
-          # bench-typecheck per constant (per-constant peak-rss), same as the
-          # !benchmark PR path.
+          # All primaries: run.sh proves each whose Aiur fft-cost fits ~128 GB and
+          # execute-only's the rest (so heavy primaries still report execute
+          # metrics). Per-constant peak-rss, same path as the !benchmark PR run.
           benv="${{ matrix.bench }}"; benv="${benv,}"   # InitStd→initStd, Mathlib→mathlib
           python3 .github/scripts/bench.py manifest \
             --csv Benchmarks/Vectors.csv --env "$benv" --mode prove --primary --out names.txt
@@ -389,8 +388,12 @@ jobs:
         env:
           REUSE_IXE: "1"
         run: |
-          bash .github/scripts/run.sh . "${{ matrix.bench }}" native execute \
-            /dev/null neutral.json
+          # Whole env (keyed by env) + the primary constants subject-checked
+          # (keyed by constant) for an apples-to-apples baseline next to zisk/sp1.
+          benv="${{ matrix.bench }}"; benv="${benv,}"
+          python3 .github/scripts/bench.py manifest \
+            --csv Benchmarks/Vectors.csv --env "$benv" --mode execute --primary --out names.txt
+          bash .github/scripts/run.sh . "${{ matrix.bench }}" native execute names.txt neutral.json
           jq 'map_values(map_values({value: .}))' neutral.json > bench.json
           cat bench.json
       # constants is deterministic → pinned (0/0); check-time / throughput /
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index 8677e4a8..a06a646f 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -14,7 +14,14 @@ the same backend drivers:
 |---|---|---|
 | `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove) | `fft-cost`, `execute-time`, `prove-time`, `peak-rss`, `constants`, `throughput` |
 | `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `peak-rss` (+ `shards`, `max-shard-cycles` for sharded runs) |
-| `native` | the same kernel run **out-of-circuit and in parallel** (`ix check --anon`, whole env) — far faster | `throughput` (constants/s), `check-time`, `peak-rss`, `constants` |
+| `native` | the same kernel run **out-of-circuit and in parallel** (`ix check`) — far faster | `throughput` (constants/s), `check-time`, `peak-rss`, `constants` |
+
+In **prove** mode, `run.sh` proves each constant whose Aiur fft-cost fits the
+prover RAM ceiling (`AIUR_PROVE_MAX_FFT`, ~128 GB at 2.34 GB per billion fft) and
+falls back to **execute-only** for the rest, so every primary still reports
+metrics. The `native` backend reports two views: the **whole env** (`ix check
+--anon`, keyed by env) and a **per-primary subject check** (`ix check --consts`,
+keyed by constant — apples-to-apples with the zkVM `--skip-deps` execute).
 
 All four are driven by `.github/scripts/run.sh` (compile the env `.ixe`, run the
 backend, emit a neutral `{ "<name>": { "<metric>": n } }` JSON). The PR workflow

From ea1926cb5d65d0db5163bcbc99fcd383c6222ef7 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 2 Jul 2026 01:15:43 +0000
Subject: [PATCH 07/27] feat(ci): texray per-phase drill-down +
 child-process-aware peak RAM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Route every backend's peak-rss and per-phase timings through tracing-texray:

- peak-rss now comes from the fork's process-tree sampler, so Zisk's ASM
  microservices' memory (tens of GB in separate PIDs) is captured — a bare
  /proc/self/status read of the host missed it. Verified on real execute
  runs: sp1 ~9GB, zisk ~49GB peak.

- zisk/sp1 hosts and bench-typecheck self-report cycles/time/throughput/
  peak-rss via --json, retiring run.sh's grep/awk/time parsing. They also
  emit per-phase span timings via --texray-json; native check-rs reports
  peak-rss as a 6th ##check## field.

- run.sh folds span timings into a per-constant `phases` object; bench.py
  renders a collapsible per-phase drill-down in the !benchmark comparison,
  and bench-main.yml tracks phase:<span> measures on bencher.

tracing-texray is pinned to the argumentcomputer/tracing-texray json-ram
branch (rev 15ae57c), which adds the process-tree RSS sampler + JSON sink.
---
 .github/actions/install-sp1/action.yml  |   2 +-
 .github/actions/install-zisk/action.yml |   2 +-
 .github/scripts/bench.py                |  42 +++++++-
 .github/scripts/run.sh                  | 121 +++++++++++-------------
 .github/workflows/bench-main.yml        |  21 ++--
 .github/workflows/bench-pr.yml          |   6 --
 Benchmarks/Typecheck.lean               |  22 ++++-
 Cargo.lock                              |   2 +-
 Cargo.toml                              |   2 +-
 Ix/Cli/CheckRsCmd.lean                  |  11 ++-
 Ix/TracingTexray.lean                   |  26 +++++
 crates/ffi/src/texray.rs                |  35 +++++++
 docs/benchmarking.md                    |  27 +++++-
 sp1/Cargo.lock                          |  58 ++++++++++++
 sp1/host/Cargo.toml                     |   5 +
 sp1/host/src/main.rs                    |  89 +++++++++++++++++
 zisk/Cargo.lock                         |  24 +++++
 zisk/host/Cargo.toml                    |   6 ++
 zisk/host/src/main.rs                   |  92 +++++++++++++++++-
 19 files changed, 496 insertions(+), 97 deletions(-)

diff --git a/.github/actions/install-sp1/action.yml b/.github/actions/install-sp1/action.yml
index 71aa2037..91ad4d6b 100644
--- a/.github/actions/install-sp1/action.yml
+++ b/.github/actions/install-sp1/action.yml
@@ -22,7 +22,7 @@ runs:
           nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \
           libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \
           openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \
-          pkg-config libssl-dev time
+          pkg-config libssl-dev
     - name: Install SP1 toolchain (sp1up, latest)
       shell: bash
       run: |
diff --git a/.github/actions/install-zisk/action.yml b/.github/actions/install-zisk/action.yml
index ff883957..7b01c6e3 100644
--- a/.github/actions/install-zisk/action.yml
+++ b/.github/actions/install-zisk/action.yml
@@ -23,7 +23,7 @@ runs:
           nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \
           libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \
           openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \
-          pkg-config libssl-dev time
+          pkg-config libssl-dev
     # `--version 0.18.0` pins the toolchain to match our deps. Our host links the
     # argumentcomputer/zisk `blake3-precompile` fork, which is based on v0.18.0
     # (its cargo-zisk has `check-setup`, used below to regenerate the key's
diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index 746b9ac7..a770fc07 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -136,7 +136,8 @@ def cmd_manifest(a):
     ("aiur", "prove"): ["prove-time", "peak-rss"],
     ("zisk", "execute"): ["cycles", "execute-time", "throughput", "peak-rss"],
     ("sp1", "execute"): ["cycles", "execute-time", "throughput", "peak-rss"],
-    ("zisk", "prove"): ["prove-time", "steps"], ("sp1", "prove"): ["prove-time", "steps"],
+    ("zisk", "prove"): ["prove-time", "steps", "peak-rss"],
+    ("sp1", "prove"): ["prove-time", "peak-rss"],
     # native is whole-env (one row per env); mode is ignored (it never proves).
     ("native", "execute"): ["throughput", "check-time", "peak-rss"],
     ("native", "prove"): ["throughput", "check-time", "peak-rss"],
@@ -171,6 +172,40 @@ def _load(path):
         return {}
 
 
+def _phases(entry):
+    """The `phases` object (span → seconds) on a constant's entry, or {}."""
+    p = entry.get("phases") if isinstance(entry, dict) else None
+    return p if isinstance(p, dict) else {}
+
+
+def _phase_details(main_d, pr_d, names):
+    """Collapsible per-constant phase (span) timing tables — the drill-down that
+    shows *where* time moved between main and PR. Emitted only for constants that
+    carry tracing-texray span data."""
+    blocks = []
+    for n in names:
+        mp, pp = _phases(main_d.get(n, {})), _phases(pr_d.get(n, {}))
+        # Only worth a drill-down when there's more than one phase; a lone phase
+        # (zisk/sp1 execute, native check) just restates the headline metric.
+        if len(set(mp) | set(pp)) < 2:
+            continue
+        rows = ["| phase | main (s) | PR (s) | Δ% |", "|---|--:|--:|--:|"]
+        # Slowest-on-PR (else main) first, so the dominant phase leads.
+        spans = sorted(set(mp) | set(pp),
+                       key=lambda s: -(pp.get(s) if isinstance(pp.get(s), (int, float))
+                                       else mp.get(s) if isinstance(mp.get(s), (int, float)) else 0))
+        for s in spans:
+            mv, pv = mp.get(s), pp.get(s)
+            mv = mv if isinstance(mv, (int, float)) else None
+            pv = pv if isinstance(pv, (int, float)) else None
+            dp = _delta(mv, pv)
+            rows.append(f"| `{s}` | {_human(mv)} | {_human(pv)} | "
+                        f"{'n/a' if dp is None else f'{dp:+.1f}%'} |")
+        blocks.append(f"<details><summary><code>{n}</code> — phase breakdown</summary>\n\n"
+                      + "\n".join(rows) + "\n\n</details>")
+    return blocks
+
+
 def cmd_compare(a):
     metrics = a.metric or METRICS.get((a.backend, a.mode))
     if not metrics:
@@ -228,6 +263,11 @@ def emit(text):
     if worst and worst[0] is not None and worst[0] > a.threshold:
         s += f" Worst: `{worst[1]}` {worst[0]:+.1f}%."
     out.append(s)
+    details = _phase_details(main_d, pr_d, names)
+    if details:
+        out += ["", "<details><summary>Per-phase timing drill-down</summary>", ""]
+        out += details
+        out += ["", "</details>"]
     emit("\n".join(out))
 
 
diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
index 2f31199d..58dd860d 100644
--- a/.github/scripts/run.sh
+++ b/.github/scripts/run.sh
@@ -25,6 +25,20 @@ repo=$(cd "$repo" && pwd)
 : > "$out"
 emit_empty() { [ -s "$out" ] || echo '{}' > "$out"; }
 
+# Fold a tool's per-phase span timings (tracing-texray JSONL, one
+# `{"span":"…","seconds":N}` per closed span, possibly repeated per shard) into
+# its per-constant results file under a `phases` object, summed by span name —
+# the source bench.py renders as the comparative drill-down. No-op if the tool
+# emitted no spans.
+merge_phases() {  # <results.json> <spans.jsonl>
+  local res="$1" spans="$2" ph
+  [ -s "$spans" ] || return 0
+  ph=$(jq -s 'reduce .[] as $o ({}; .[$o.span] += $o.seconds)' "$spans" 2>/dev/null)
+  [ -n "$ph" ] && [ "$ph" != "{}" ] || return 0
+  jq --argjson ph "$ph" 'map_values(. + {phases: $ph})' "$res" > "$res.p" \
+    && mv "$res.p" "$res" || true
+}
+
 # `$benv` is used verbatim for the `.ixe` filename (bench-pr compiles `initStd.ixe`;
 # the bencher jobs reuse the compile job's cached `InitStd.ixe`), and lowercased
 # only to pick the Compile module.
@@ -58,7 +72,7 @@ case "$backend" in
     while IFS= read -r c; do
       [ -z "$c" ] && continue
       slug=$(printf '%s' "$c" | tr '/ .:' '____')
-      res="$tmp/$slug.json"
+      res="$tmp/$slug.json"; spans="$tmp/$slug.spans"
       do_prove=0
       if [ "$mode" = prove ]; then
         fft=$(awk -F, -v n="$c" '$1==n {print $6}' "$csv" 2>/dev/null)
@@ -68,21 +82,18 @@ case "$backend" in
           echo "::notice::aiur: '$c' fft=${fft:-?} exceeds $ceil (~128 GB); execute-only" >&2
         fi
       fi
+      # bench-typecheck self-reports peak-rss (texray tree sampler) in its --json;
+      # --texray-json captures the per-phase aiur/*, stark/* timings.
       if [ "$do_prove" = 1 ]; then
-        bench-typecheck --ixe "$ixe" "$c" --json "$res" --texray \
-          > "$tmp/$slug.log" 2> "$tmp/$slug.tx" \
+        bench-typecheck --ixe "$ixe" "$c" --json "$res" --texray-json "$spans" \
+          > "$tmp/$slug.log" 2>&1 \
           || { echo "::warning::aiur prove '$c' failed (OOM/timeout); dropping" >&2; continue; }
-        # Fold texray's proving RSS high-water mark (max over spans; awk's $2+0
-        # stops at the first non-digit) into this constant's entry.
-        rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>m {m=$2+0} END {if (m>0) print m}' "$tmp/$slug.tx")
-        if [ -n "${rss:-}" ] && [ -s "$res" ]; then
-          jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' "$res" > "$res.t" \
-            && mv "$res.t" "$res" || true
-        fi
       else
-        bench-typecheck --ixe "$ixe" "$c" --json "$res" --execute-only > "$tmp/$slug.log" 2>&1 \
+        bench-typecheck --ixe "$ixe" "$c" --json "$res" --execute-only \
+          --texray-json "$spans" > "$tmp/$slug.log" 2>&1 \
           || { echo "::warning::aiur execute '$c' failed; dropping" >&2; continue; }
       fi
+      merge_phases "$res" "$spans"
       [ -s "$res" ] && cat "$res"
     done < "$names" | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null
     emit_empty
@@ -90,10 +101,11 @@ case "$backend" in
 
   zisk|sp1)
     host="${backend}-host"; work="$repo/$backend"
-    # Build the host once so per-constant timing excludes compilation. Order is
-    # `timeout … /usr/bin/time … host`: timeout bounds a runaway constant while
-    # /usr/bin/time still measures the host directly (its child), so RSS/elapsed
-    # are the host's, not a wrapper's.
+    # Build the host once so per-constant timing excludes compilation. The host
+    # self-measures and writes its own neutral results JSON via `--json`
+    # (cycles/execute-time/throughput/peak-rss for execute; prove-time/… for
+    # prove), so there is nothing to grep — `timeout` only bounds a runaway
+    # constant.
     echo "::group::cargo build $host"
     ( cd "$work" && cargo build --quiet --release --bin "$host" )
     echo "::endgroup::"
@@ -104,80 +116,55 @@ case "$backend" in
     # in-session as root; the host children inherit it. Without this the ASM
     # services die with `mmap(rom) errno=11`. SP1 needs no such raise.
     [ "$backend" = zisk ] && sudo prlimit --pid $$ --memlock=unlimited:unlimited
+    # zisk proves with `--gpu`; sp1 selects the GPU prover via its env/features.
+    gpu=; [ "$backend" = zisk ] && gpu=--gpu
     while IFS= read -r c; do
       [ -z "$c" ] && continue
       slug=$(printf '%s' "$c" | tr '/ .:' '____')
-      log="$tmp/$slug.out"; tmf="$tmp/$slug.time"
+      res="$tmp/$slug.json"; log="$tmp/$slug.log"; spans="$tmp/$slug.spans"
       if [ "$mode" = execute ]; then
-        ( cd "$work" && timeout 25m /usr/bin/time -f '%e %M' -o "$tmf" \
-            "$bin" --execute --ixe "$ixe" --constant "$c" --skip-deps ) > "$log" 2>>"$log" \
+        ( cd "$work" && timeout 25m "$bin" --execute --ixe "$ixe" \
+            --constant "$c" --skip-deps --json "$res" --texray-json "$spans" ) \
+          > "$log" 2>&1 \
           || { echo "::warning::$backend execute '$c' failed/timed out; dropping" >&2; continue; }
-        fail=$(grep -oE 'failures[:=] ?[0-9]+' "$log" | head -1 | grep -oE '[0-9]+')
-        if [ "${fail:-1}" != 0 ]; then
-          echo "::warning::$backend '$c': nonzero/missing failures; dropping" >&2; continue
-        fi
-        # Total cycles: sharded prints "total cycles: N", single prints "cycles: N".
-        cyc=$(grep -oE 'total cycles: [0-9]+' "$log" | tail -1 | grep -oE '[0-9]+')
-        [ -z "$cyc" ] && cyc=$(grep -oE 'cycles: [0-9]+' "$log" | tail -1 | grep -oE '[0-9]+')
-        [ -z "$cyc" ] && { echo "::warning::$backend '$c': no cycle count; dropping" >&2; continue; }
-        secs=$(awk 'NR==1{print $1}' "$tmf"); rssk=$(awk 'NR==1{print $2}' "$tmf")
-        rss=$(( ${rssk:-0} * 1024 ))
-        tput=$(awk -v c="$cyc" -v s="${secs:-0}" 'BEGIN{ if (s>0) printf "%.0f", c/s; else print 0 }')
-        # Per-shard "cycles=<n>" lines appear only for sharded runs.
-        mapfile -t sh < <(grep -oE 'cycles=[0-9]+' "$log" | grep -oE '[0-9]+')
-        base="cycles:\$cyc, \"execute-time\":\$secs, throughput:\$tput, \"peak-rss\":\$rss"
-        if [ "${#sh[@]}" -gt 0 ]; then
-          maxsh=$(printf '%s\n' "${sh[@]}" | sort -n | tail -1)
-          jq -n --arg n "$c" --argjson cyc "$cyc" --argjson secs "${secs:-0}" \
-                --argjson tput "$tput" --argjson rss "$rss" \
-                --argjson nsh "${#sh[@]}" --argjson maxsh "$maxsh" \
-            "{(\$n): {$base, shards:\$nsh, \"max-shard-cycles\":\$maxsh}}"
-        else
-          jq -n --arg n "$c" --argjson cyc "$cyc" --argjson secs "${secs:-0}" \
-                --argjson tput "$tput" --argjson rss "$rss" \
-            "{(\$n): {$base}}"
-        fi
       else
-        # prove (single-leaf, GPU runner only — the workflow gates this cell).
-        ( cd "$work" && timeout 60m cargo run --quiet --release --bin "$host" -- \
-            --gpu --ixe "$ixe" --constant "$c" --skip-deps ) > "$log" 2>&1 \
+        # prove (GPU runner only — the workflow gates this cell).
+        ( cd "$work" && timeout 60m "$bin" $gpu --ixe "$ixe" \
+            --constant "$c" --skip-deps --json "$res" --texray-json "$spans" ) \
+          > "$log" 2>&1 \
           || { echo "::warning::$backend prove '$c' failed/timed out; dropping" >&2; continue; }
-        secs=$(grep -oE 'prove [0-9.]+s'   "$log" | head -1 | grep -oE '[0-9.]+')
-        steps=$(grep -oE '\([0-9]+ steps\)' "$log" | head -1 | grep -oE '[0-9]+')
-        fail=$(grep -oE 'failures=[0-9]+'  "$log" | head -1 | grep -oE '[0-9]+')
-        if [ -n "${secs:-}" ] && [ "${fail:-1}" = 0 ]; then
-          jq -n --arg n "$c" --argjson t "$secs" --argjson s "${steps:-0}" \
-            '{($n): {"prove-time": $t, steps: $s}}'
-        else
-          echo "::warning::$backend prove '$c': no clean prove line; dropping" >&2
-        fi
       fi
+      # The host writes $res only on a clean (zero-failure) run.
+      merge_phases "$res" "$spans"
+      [ -s "$res" ] && cat "$res"
     done < "$names" | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null
     emit_empty
     ;;
 
   native)
     # Native out-of-circuit Rust kernel (far faster than proving). Two views,
-    # both via the `##check## <elapsed_ms> <passed> <failures> <total>` line
-    # (ix check is a single multi-threaded process so /usr/bin/time -f '%M' is the
-    # true peak RSS): the whole env in parallel (`--anon`, keyed by env), and a
-    # per-primary subject check (`--consts`, keyed by constant) for an
-    # apples-to-apples baseline next to the zkVM `--skip-deps` execute.
+    # both keyed off the structured line
+    #   `##check## <elapsed_ms> <passed> <failures> <total> <peak-rss-bytes>`
+    # (peak-rss from ix check-rs's tracing-texray tree sampler): the whole env in
+    # parallel (`--anon`, keyed by env), and a per-primary subject check
+    # (`--consts`, keyed by constant) for an apples-to-apples baseline next to
+    # the zkVM `--skip-deps` execute.
     native_one() {  # <label> <ix-check-args…>  → prints one JSON object
       local label="$1"; shift
-      local log="$tmp/n.out" tmf="$tmp/n.time"
-      /usr/bin/time -f '%e %M' -o "$tmf" ix check "$ixe" "$@" > "$log" 2>>"$log" \
+      local log="$tmp/n.out"
+      ix check-rs "$ixe" "$@" > "$log" 2>>"$log" \
         || { echo "::warning::native '$label' check failed; dropping" >&2; return; }
-      local line ems fl tot
+      local line ems fl tot rss
       line=$(grep '^##check##' "$log" | tail -1)
-      ems=$(echo "$line" | awk '{print $2}'); fl=$(echo "$line" | awk '{print $4}'); tot=$(echo "$line" | awk '{print $5}')
+      ems=$(echo "$line" | awk '{print $2}'); fl=$(echo "$line" | awk '{print $4}')
+      tot=$(echo "$line" | awk '{print $5}'); rss=$(echo "$line" | awk '{print $6}')
       { [ -n "${tot:-}" ] && [ "${fl:-1}" = 0 ]; } \
         || { echo "::warning::native '$label': bad ##check## / failures; dropping" >&2; return; }
-      local rssk rss cs tp
-      rssk=$(awk 'NR==1{print $2}' "$tmf"); rss=$(( ${rssk:-0} * 1024 ))
+      local cs tp
       cs=$(awk -v e="$ems" 'BEGIN{printf "%.3f", e/1000}')
       tp=$(awk -v t="$tot" -v e="$ems" 'BEGIN{ if (e>0) printf "%.2f", t*1000/e; else print 0 }')
-      jq -n --arg n "$label" --argjson c "$tot" --argjson s "$cs" --argjson tp "$tp" --argjson rss "$rss" \
+      jq -n --arg n "$label" --argjson c "$tot" --argjson s "$cs" --argjson tp "$tp" \
+            --argjson rss "${rss:-0}" \
         '{($n): {constants:$c, "check-time":$s, throughput:$tp, "peak-rss":$rss}}'
     }
     {
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index 5fb5cfee..d4134d00 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -225,8 +225,10 @@ jobs:
             --csv Benchmarks/Vectors.csv --env "$benv" --mode prove --primary --out names.txt
           echo "proving $(wc -l < names.txt) primary constants:"; cat names.txt
           bash .github/scripts/run.sh . "${{ matrix.bench }}" aiur prove names.txt neutral.json
-          # Wrap neutral { name: { metric: v } } → Bencher Metric Format.
-          jq 'map_values(map_values({value: .}))' neutral.json > aiur.json
+          # Wrap neutral { name: { metric: v } } → Bencher Metric Format,
+          # flattening the per-phase `phases` object into `phase:<span>` measures
+          # so span timings are tracked over time alongside the headline metrics.
+          jq 'map_values((.phases // {}) as $p | del(.phases) | (. + ($p | with_entries(.key |= "phase:" + .))) | map_values({value: .}))' neutral.json > aiur.json
           cat aiur.json
       # Upload Aiur metrics. Every measure shares the per-workload baseline
       # window (data points since the aiur reset tag). constants is deterministic
@@ -235,7 +237,10 @@ jobs:
       # let wins through) rather than a hard pin. prove-time/execute-time,
       # peak-rss (texray's proving RSS high-water mark), and throughput
       # (constants/prove-time, where a drop is the regression) are noisy
-      # wall-clock and ride percentage bounds.
+      # wall-clock and ride percentage bounds. The per-phase `phase:<span>`
+      # measures are uploaded for trend visibility but intentionally left
+      # un-thresholded (noisy and dynamically named; the PR-comment drill-down
+      # does the phase-level alerting).
       - uses: ./.github/actions/bencher-track
         with:
           testbed: aiur-typecheck-x64-32x
@@ -310,8 +315,10 @@ jobs:
           echo "executing $(wc -l < names.txt) primary constants:"; cat names.txt
           bash .github/scripts/run.sh . ${{ matrix.bench }} ${{ matrix.backend }} execute \
             names.txt neutral.json
-          # Wrap neutral { name: { metric: v } } → Bencher Metric Format.
-          jq 'map_values(map_values({value: .}))' neutral.json > bench.json
+          # Wrap neutral { name: { metric: v } } → Bencher Metric Format,
+          # flattening the per-phase `phases` object into `phase:<span>` measures
+          # so span timings are tracked over time alongside the headline metrics.
+          jq 'map_values((.phases // {}) as $p | del(.phases) | (. + ($p | with_entries(.key |= "phase:" + .))) | map_values({value: .}))' neutral.json > bench.json
           cat bench.json
       # cycles / shards / max-shard-cycles are deterministic per guest ELF →
       # pinned (0/0). execute-time / peak-rss / throughput are noisy wall-clock →
@@ -394,7 +401,9 @@ jobs:
           python3 .github/scripts/bench.py manifest \
             --csv Benchmarks/Vectors.csv --env "$benv" --mode execute --primary --out names.txt
           bash .github/scripts/run.sh . "${{ matrix.bench }}" native execute names.txt neutral.json
-          jq 'map_values(map_values({value: .}))' neutral.json > bench.json
+          # Wrap → Bencher Metric Format, flattening `phases` into `phase:<span>`
+          # measures (a no-op for native, which records no spans).
+          jq 'map_values((.phases // {}) as $p | del(.phases) | (. + ($p | with_entries(.key |= "phase:" + .))) | map_values({value: .}))' neutral.json > bench.json
           cat bench.json
       # constants is deterministic → pinned (0/0); check-time / throughput /
       # peak-rss are noisy parallel wall-clock → percentage bounds.
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index 682e063f..546bd351 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -160,12 +160,6 @@ jobs:
       - name: Install Zisk
         if: matrix.cell.skip != 'true' && matrix.cell.backend == 'zisk'
         uses: ./.github/actions/install-zisk
-      # native = whole-env `ix check --anon`; needs only GNU time (ix is built above).
-      - name: Install GNU time
-        if: matrix.cell.skip != 'true' && matrix.cell.backend == 'native'
-        run: |
-          sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true
-          sudo apt-get update && sudo apt-get install -y time
 
       # ---------- main side (cached by base SHA) ----------
       - name: Restore cached main results
diff --git a/Benchmarks/Typecheck.lean b/Benchmarks/Typecheck.lean
index 1196c055..aded506d 100644
--- a/Benchmarks/Typecheck.lean
+++ b/Benchmarks/Typecheck.lean
@@ -105,6 +105,9 @@ structure Result where
   fftCost : Float
   executeSec : Float
   proveSec : Option Float := none
+  /-- Peak resident-set size in bytes (tracing-texray tree sampler), read after
+      the constant's last measured phase (execute, or prove when proven). -/
+  peakRss : Option Nat := none
   deriving Inhabited
 
 /-- Round a Float to `d` decimal places, to keep the emitted JSON readable. -/
@@ -119,6 +122,9 @@ def Result.toJsonEntry (r : Result) : String × Json :=
     [ ("constants", Lean.toJson r.constants)
     , ("fft-cost", Lean.toJson (roundTo 0 r.fftCost))
     , ("execute-time", Lean.toJson (roundTo 6 r.executeSec)) ]
+  let base := match r.peakRss with
+    | some n => base ++ [ ("peak-rss", Lean.toJson n) ]
+    | none => base
   -- prove-time and the derived proving throughput (constants/prove-time, the
   -- proving analog of compile's constants/sec) are present only once proven.
   let fields := match r.proveSec with
@@ -161,6 +167,15 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
     IO.eprintln "error: provide a constant via --constant, positional name(s), and/or --manifest <path>"
     return 1
   let jsonOut : Option String := (p.flag? "json").map (·.as! String)
+  let texrayJson : Option String := (p.flag? "texray-json").map (·.as! String)
+  -- Start the process-tree RSS sampler so each Result's peak-rss reflects the
+  -- true high-water mark. When a drill-down path is given, install the streaming
+  -- subscriber and point the per-span sink at it, so the prover's aiur/* and
+  -- stark/* phase timings land as JSON Lines for the CI comparison.
+  TracingTexray.startSampler
+  match texrayJson with
+  | some path => TracingTexray.init {}; TracingTexray.jsonSink path
+  | none => pure ()
   -- skip-deps: check just the target (`verify_const`, trusting its deps)
   -- instead of re-checking the whole transitive closure (`verify_claim`).
   let skipDeps := p.hasFlag "skip-deps"
@@ -244,6 +259,8 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
   -- `--execute-only`: stop after Phase 1; the results JSON (if requested) is
   -- already complete with the execute metrics.
   if executeOnly then
+    let peak ← TracingTexray.peakTreeRssBytes
+    execed := execed.map (fun (r, a) => ({ r with peakRss := some peak }, a))
     writeJson (execed.map (·.1))
     match jsonOut with
     | some path => IO.println s!"wrote {execed.size} execute-only benchmarks to {path}"
@@ -280,8 +297,10 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
       | .error e => IO.eprintln s!"  prove {r.name} failed: {e}"; continue
       | .ok _ => pure ()
       spent := spent + proveSec
+      let peak ← TracingTexray.peakTreeRssBytes
       IO.println s!"  {r.name}: prove={proveSec}s (cumulative {spent}s)"
-      ordered := ordered.set! i ({ r with proveSec := some proveSec }, addr)
+      ordered := ordered.set! i
+        ({ r with proveSec := some proveSec, peakRss := some peak }, addr)
       writeJson (ordered.map (·.1))
     catch e =>
       IO.eprintln s!"  prove {r.name} threw: {e}"
@@ -304,6 +323,7 @@ def typecheckCmd : Cli.Cmd := `[Cli|
     "execute-only";       "Execute only (Phase 1: constants / fft-cost / execute-time) and skip proving. The fast per-PR `execute`-mode signal."
     texray;               "Force the tracing-texray timeline + RAM breakdown on (per-prove spans on stderr)."
     "no-texray";          "Force the breakdown off. Default: on iff --json was not given."
+    "texray-json" : String; "Write per-phase span timings (aiur/*, stark/*) as JSON Lines to this path, for the CI drill-down. Implies installing the streaming subscriber."
 
   ARGS:
     ...names : String;   "Fully-qualified constant name(s) to benchmark (e.g. `Nat.add_comm String.append`). Optional if `--manifest` is given."
diff --git a/Cargo.lock b/Cargo.lock
index 9db6079f..a8af9cc1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4022,7 +4022,7 @@ dependencies = [
 [[package]]
 name = "tracing-texray"
 version = "0.2.0"
-source = "git+https://github.com/argumentcomputer/tracing-texray?rev=31d194dd1bc50458d26f77c89bb68f67e5d1c149#31d194dd1bc50458d26f77c89bb68f67e5d1c149"
+source = "git+https://github.com/argumentcomputer/tracing-texray?rev=15ae57cfbcb234ff5911a306898e7d93d396d648#15ae57cfbcb234ff5911a306898e7d93d396d648"
 dependencies = [
  "loom",
  "parking_lot",
diff --git a/Cargo.toml b/Cargo.toml
index 456f0b7a..618aa969 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -58,7 +58,7 @@ sha2 = "0.10"
 tiny-keccak = { version = "2", features = ["keccak"] }
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
-tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "31d194dd1bc50458d26f77c89bb68f67e5d1c149" }
+tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "15ae57cfbcb234ff5911a306898e7d93d396d648" }
 
 [workspace.lints.rust]
 invalid_reference_casting = "warn"
diff --git a/Ix/Cli/CheckRsCmd.lean b/Ix/Cli/CheckRsCmd.lean
index 1cb0fbf7..596b3578 100644
--- a/Ix/Cli/CheckRsCmd.lean
+++ b/Ix/Cli/CheckRsCmd.lean
@@ -29,6 +29,7 @@ public import Cli
 public import Ix.Common
 public import Ix.KernelCheck
 public import Ix.Meta
+public import Ix.TracingTexray
 public import Ix.Cli.ValidateCmd
 public import Std.Internal.UV.System
 
@@ -168,7 +169,8 @@ private def runCheckAnon (envPath : String) (p : Cli.Parsed) : IO UInt32 := do
   if !failOutPath.isEmpty then
     IO.println s!"[check] streamed {failures.size} failure(s) to {failOutPath}"
 
-  IO.println s!"##check## {elapsed} {passed} {failures.size} {results.size}"
+  let peakRss ← TracingTexray.peakTreeRssBytes
+  IO.println s!"##check## {elapsed} {passed} {failures.size} {results.size} {peakRss}"
   return if failures.isEmpty then 0 else 1
 
 /-- Meta-mode runner: dispatch to `rsCheckIxonFFI` with seed filtering. -/
@@ -219,7 +221,8 @@ private def runCheckMeta (envPath : String) (p : Cli.Parsed) : IO UInt32 := do
   if !failOutPath.isEmpty then
     IO.println s!"[check] streamed {failures.size} failure(s) to {failOutPath}"
 
-  IO.println s!"##check## {elapsed} {passed} {failures.size} {seedNames.size}"
+  let peakRss ← TracingTexray.peakTreeRssBytes
+  IO.println s!"##check## {elapsed} {passed} {failures.size} {seedNames.size} {peakRss}"
   return if failures.isEmpty then 0 else 1
 
 def runCheckRsCmd (p : Cli.Parsed) : IO UInt32 := do
@@ -228,6 +231,10 @@ def runCheckRsCmd (p : Cli.Parsed) : IO UInt32 := do
       return 1
   let envPath := pathArg.as! String
 
+  -- Start the process-tree RSS sampler so the `##check##` line can report an
+  -- accurate peak-rss (the parallel kernel check's high-water mark).
+  TracingTexray.startSampler
+
   -- `--workers N` is plumbed through the existing
   -- `IX_KERNEL_CHECK_WORKERS` env var that `resolve_kernel_check_workers`
   -- (`src/ffi/kernel.rs`) reads. Setting `1` forces a single-threaded
diff --git a/Ix/TracingTexray.lean b/Ix/TracingTexray.lean
index c0c4c064..2aeac07f 100644
--- a/Ix/TracingTexray.lean
+++ b/Ix/TracingTexray.lean
@@ -41,6 +41,32 @@ private opaque initWith
 def init (s : Settings := {}) : IO Unit :=
   initWith s.namePrefixes s.trackRam s.streaming
 
+@[extern "rs_texray_start_sampler"]
+private opaque startSamplerFFI (intervalMs : UInt64) : IO Unit
+
+/-- Start the process-tree RSS sampler (idempotent). Unlike the per-span
+    `/proc/self/status` reads, this sums RSS across this process and all its
+    children, so [`peakTreeRssBytes`] captures memory resident in helper
+    processes (e.g. a zkVM host's services). `intervalMs` is the sample period. -/
+def startSampler (intervalMs : UInt64 := 50) : IO Unit :=
+  startSamplerFFI intervalMs
+
+@[extern "rs_texray_peak_tree_rss_bytes"]
+private opaque peakTreeRssBytesFFI : IO UInt64
+
+/-- Peak resident-set size in bytes across this process and its children. `0`
+    until [`startSampler`] has run or on non-Linux platforms. -/
+def peakTreeRssBytes : IO Nat := do
+  return (← peakTreeRssBytesFFI).toNat
+
+@[extern "rs_texray_json_sink"]
+private opaque jsonSinkFFI (path : @& String) : IO Unit
+
+/-- Direct the per-span timing sink to `path` (JSON Lines). Pair with `init`
+    (streaming) so the examined `aiur/*` / `stark/*` spans are recorded. -/
+def jsonSink (path : String) : IO Unit :=
+  jsonSinkFFI path
+
 end TracingTexray
 
 end
diff --git a/crates/ffi/src/texray.rs b/crates/ffi/src/texray.rs
index 77d91b5b..326f23a0 100644
--- a/crates/ffi/src/texray.rs
+++ b/crates/ffi/src/texray.rs
@@ -59,3 +59,38 @@ extern "C" fn rs_texray_init(
   let _ = Registry::default().with(layer.with_filter(filter)).try_init();
   LeanIOResult::ok(LeanOwned::box_usize(0))
 }
+
+/// Start tracing-texray's process-tree RSS sampler (idempotent). `interval_ms`
+/// is the sampling period in milliseconds. Runs on a background daemon thread;
+/// [`rs_texray_peak_tree_rss_bytes`] reads back the high-water mark. Captures
+/// child-process memory (e.g. a zkVM host's helper processes) that a bare
+/// `/proc/self/status` read misses.
+#[unsafe(no_mangle)]
+extern "C" fn rs_texray_start_sampler(
+  interval_ms: u64,
+) -> LeanIOResult<LeanOwned> {
+  tracing_texray::rss_sampler::start(std::time::Duration::from_millis(
+    interval_ms,
+  ));
+  LeanIOResult::ok(LeanOwned::box_usize(0))
+}
+
+/// Peak resident-set size (bytes) across this process and its children per the
+/// tree sampler. `0` until [`rs_texray_start_sampler`] has run or off Linux.
+#[unsafe(no_mangle)]
+extern "C" fn rs_texray_peak_tree_rss_bytes() -> LeanIOResult<LeanOwned> {
+  let bytes = tracing_texray::rss_sampler::peak_tree_rss_bytes();
+  LeanIOResult::ok(LeanOwned::box_u64(bytes))
+}
+
+/// Direct tracing-texray's per-span timing sink to `path` (one
+/// `{"span","seconds"}` JSON line per closed examined span). Combine with a
+/// `streaming`/examined subscriber so the prover's `aiur/*` + `stark/*` spans
+/// are recorded for the CI drill-down.
+#[unsafe(no_mangle)]
+extern "C" fn rs_texray_json_sink(
+  path: LeanString<LeanBorrowed<'_>>,
+) -> LeanIOResult<LeanOwned> {
+  let _ = tracing_texray::json_sink::to_file(&path.to_string());
+  LeanIOResult::ok(LeanOwned::box_usize(0))
+}
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index a06a646f..c7680829 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -13,20 +13,37 @@ the same backend drivers:
 | backend | what it measures | metrics |
 |---|---|---|
 | `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove) | `fft-cost`, `execute-time`, `prove-time`, `peak-rss`, `constants`, `throughput` |
-| `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `peak-rss` (+ `shards`, `max-shard-cycles` for sharded runs) |
-| `native` | the same kernel run **out-of-circuit and in parallel** (`ix check`) — far faster | `throughput` (constants/s), `check-time`, `peak-rss`, `constants` |
+| `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `peak-rss` |
+| `native` | the same kernel run **out-of-circuit and in parallel** (`ix check-rs`) — far faster | `throughput` (constants/s), `check-time`, `peak-rss`, `constants` |
 
 In **prove** mode, `run.sh` proves each constant whose Aiur fft-cost fits the
 prover RAM ceiling (`AIUR_PROVE_MAX_FFT`, ~128 GB at 2.34 GB per billion fft) and
 falls back to **execute-only** for the rest, so every primary still reports
-metrics. The `native` backend reports two views: the **whole env** (`ix check
---anon`, keyed by env) and a **per-primary subject check** (`ix check --consts`,
-keyed by constant — apples-to-apples with the zkVM `--skip-deps` execute).
+metrics. The `native` backend reports two views: the **whole env** (`ix check-rs
+--anon`, keyed by env) and a **per-primary subject check** (`ix check-rs
+--consts`, keyed by constant — apples-to-apples with the zkVM `--skip-deps`
+execute).
 
 All four are driven by `.github/scripts/run.sh` (compile the env `.ixe`, run the
 backend, emit a neutral `{ "<name>": { "<metric>": n } }` JSON). The PR workflow
 compares two such JSONs; the bencher workflow wraps one in Bencher Metric Format.
 
+### Peak RAM and the per-phase drill-down (tracing-texray)
+
+Every tool sources `peak-rss` from **tracing-texray's process-tree sampler** — a
+background thread that sums `VmRSS` across the process *and its children* and
+tracks the high-water mark. This captures memory that a bare `/proc/self/status`
+read misses, most importantly Zisk's ASM microservices (separate PIDs).
+
+Each tool also writes its per-phase span timings (tracing-texray's JSON-Lines
+sink, one `{"span","seconds"}` per closed span) to a side file, which `run.sh`
+aggregates into a `phases` object on the constant's entry. `aiur` yields a rich
+breakdown (`aiur/execute`, `aiur/witness`, `stark/fri_open`, …) since the prover
+instruments those spans; `zisk`/`sp1` record a single `execute`/`prove` phase;
+`native` records none. In a `!benchmark` comparison, `bench.py` renders any
+multi-phase constant as a collapsible **per-phase timing drill-down** (main vs
+PR seconds + Δ%), so a regression can be traced to the phase that moved.
+
 ## Constant set — `Benchmarks/Vectors.csv`
 
 One CSV is the single source of truth: `name,env,tier,shard_target,primary,aiur_fft,zisk_cycles`.
diff --git a/sp1/Cargo.lock b/sp1/Cargo.lock
index a5f9903d..eb4c6af9 100644
--- a/sp1/Cargo.lock
+++ b/sp1/Cargo.lock
@@ -1247,6 +1247,21 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
 
+[[package]]
+name = "generator"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b3b854b0e584ead1a33f18b2fcad7cf7be18b3875c78816b753639aa501513ae"
+dependencies = [
+ "cc",
+ "cfg-if",
+ "libc",
+ "log",
+ "rustversion",
+ "windows-link",
+ "windows-result",
+]
+
 [[package]]
 name = "generic-array"
 version = "0.14.9"
@@ -1894,6 +1909,19 @@ version = "0.4.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "616ec5685824bcc94416c6d4a7a446eea774a31efd7062c8480ba6fd06d7a6e5"
 
+[[package]]
+name = "loom"
+version = "0.7.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca"
+dependencies = [
+ "cfg-if",
+ "generator",
+ "scoped-tls",
+ "tracing",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "lru"
 version = "0.12.5"
@@ -3225,6 +3253,12 @@ dependencies = [
  "sdd",
 ]
 
+[[package]]
+name = "scoped-tls"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294"
+
 [[package]]
 name = "scopeguard"
 version = "1.2.0"
@@ -4072,9 +4106,11 @@ dependencies = [
  "ix-common",
  "ix-kernel",
  "ixon",
+ "serde_json",
  "sp1-build",
  "sp1-sdk",
  "tokio",
+ "tracing-texray",
 ]
 
 [[package]]
@@ -4636,6 +4672,16 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
+[[package]]
+name = "terminal_size"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874"
+dependencies = [
+ "rustix",
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "thiserror"
 version = "1.0.69"
@@ -5062,6 +5108,18 @@ dependencies = [
  "tracing-log",
 ]
 
+[[package]]
+name = "tracing-texray"
+version = "0.2.0"
+source = "git+https://github.com/argumentcomputer/tracing-texray?rev=15ae57cfbcb234ff5911a306898e7d93d396d648#15ae57cfbcb234ff5911a306898e7d93d396d648"
+dependencies = [
+ "loom",
+ "parking_lot",
+ "terminal_size",
+ "tracing",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "transpose"
 version = "0.2.3"
diff --git a/sp1/host/Cargo.toml b/sp1/host/Cargo.toml
index f45c1d3c..eed2beb0 100644
--- a/sp1/host/Cargo.toml
+++ b/sp1/host/Cargo.toml
@@ -30,6 +30,11 @@ sp1-sdk = { git = "https://github.com/argumentcomputer/sp1", branch = "blake3-pr
 tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
 clap = { version = "4.0", features = ["derive"] }
 anyhow = "1"
+# Neutral per-constant results JSON (`--json`), merged by the CI bench driver.
+serde_json = "1"
+# Process-tree RSS sampler (accurate peak RAM) + per-phase timing sink for the
+# CI drill-down.
+tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "15ae57cfbcb234ff5911a306898e7d93d396d648" }
 # Throughput formatting (e.g. `42.0 consts/s`).
 human-repr = "1"
 # Proof-size measurement: SP1's `SP1ProofWithPublicValues::bytes()` returns
diff --git a/sp1/host/src/main.rs b/sp1/host/src/main.rs
index 5c69741e..ded742b8 100644
--- a/sp1/host/src/main.rs
+++ b/sp1/host/src/main.rs
@@ -71,6 +71,43 @@ struct Args {
   /// `bench-typecheck --skip-deps`.
   #[arg(long, requires = "constant")]
   skip_deps: bool,
+
+  /// Write the neutral per-constant results JSON `{ "<name>": { … } }` to this
+  /// path (execute → cycles/execute-time/throughput/peak-rss; prove →
+  /// prove-time/peak-rss). Written only on a clean run (zero failures), so a
+  /// present file always holds a valid measurement. This is the machine
+  /// source the CI bench driver merges; the human summary still prints.
+  #[arg(long)]
+  json: Option<PathBuf>,
+
+  /// Write per-phase timings (`{"span","seconds"}` JSON Lines) to this path via
+  /// tracing-texray's sink, for the CI drill-down. The host records its
+  /// `execute` / `prove` phases here.
+  #[arg(long)]
+  texray_json: Option<PathBuf>,
+}
+
+/// Peak resident set size (bytes) across this process *and its children*, from
+/// tracing-texray's tree sampler. `None` when the sampler reports `0` (not yet
+/// started, or off Linux).
+fn peak_rss_bytes() -> Option<u64> {
+  match tracing_texray::rss_sampler::peak_tree_rss_bytes() {
+    0 => None,
+    n => Some(n),
+  }
+}
+
+/// Write the neutral per-constant entry `{ "<name>": <metrics> }` to `path`
+/// (the shape `run.sh` merges with `jq -s`). serde_json handles key escaping so
+/// arbitrary Lean names are safe.
+fn write_json_entry(
+  path: &PathBuf,
+  name: &str,
+  metrics: serde_json::Value,
+) -> Result<()> {
+  let entry = serde_json::json!({ name: metrics });
+  fs::write(path, serde_json::to_string(&entry)?)
+    .map_err(|e| anyhow::anyhow!("write {}: {e}", path.display()))
 }
 
 fn load_env_bytes(ixe: Option<&PathBuf>) -> Vec<u8> {
@@ -185,6 +222,23 @@ async fn main() -> Result<()> {
   sp1_sdk::utils::setup_logger();
 
   let args = Args::parse();
+
+  // Start the process-tree RSS sampler (accurate peak RAM) and point the
+  // per-phase timing sink at the drill-down file if requested — both
+  // independent of the SDK's global tracing logger.
+  //
+  // TODO(spans): the sink only receives the coarse `sp1/execute` / `sp1/prove`
+  // phases we `record_manual` below. For a finer drill-down, install a TeXRay
+  // subscriber and examine the sp1-sdk's own tracing spans — which requires
+  // composing it with the SDK's global logger (`sp1_sdk::utils::setup_logger`),
+  // currently the sole subscriber.
+  tracing_texray::rss_sampler::start(std::time::Duration::from_millis(50));
+  if let Some(path) = &args.texray_json {
+    if let Some(p) = path.to_str() {
+      let _ = tracing_texray::json_sink::to_file(p);
+    }
+  }
+
   let whole_env_bytes = load_env_bytes(args.ixe.as_ref());
 
   // `--constant` ships a closure sub-env + a check-list (Anon only); otherwise
@@ -219,6 +273,10 @@ async fn main() -> Result<()> {
     let (output, report) =
       client.execute(GUEST_ELF, stdin).await.expect("execute");
     let exec_duration = exec_start.elapsed();
+    tracing_texray::json_sink::record_manual(
+      "sp1/execute",
+      exec_duration.as_secs_f64(),
+    );
     let failures = u32::from_le_bytes(
       output.as_slice()[..4].try_into().expect("output too short"),
     );
@@ -258,6 +316,22 @@ async fn main() -> Result<()> {
     if failures > 0 {
       bail!("kernel typecheck produced {failures} failure(s)");
     }
+    if let Some(path) = &args.json {
+      let cycles = report.total_instruction_count();
+      let secs = exec_duration.as_secs_f64();
+      let tput = if secs > 0.0 { cycles as f64 / secs } else { 0.0 };
+      let key = args.constant.clone().unwrap_or_else(|| "env".to_string());
+      write_json_entry(
+        path,
+        &key,
+        serde_json::json!({
+          "cycles": cycles,
+          "execute-time": (secs * 1e6).round() / 1e6,
+          "throughput": tput.round(),
+          "peak-rss": peak_rss_bytes(),
+        }),
+      )?;
+    }
     return Ok(());
   }
 
@@ -269,6 +343,10 @@ async fn main() -> Result<()> {
   // var (see the module doc header and `Cargo.toml`). `--execute` doesn't.
   let proof = client.prove(&pk, stdin).compressed().await.expect("prove");
   let prove_duration = start.elapsed();
+  tracing_texray::json_sink::record_manual(
+    "sp1/prove",
+    prove_duration.as_secs_f64(),
+  );
   let throughput =
     const_count as f64 / prove_duration.as_secs_f64().max(f64::EPSILON);
   // `SP1ProofWithPublicValues::bytes()` is the onchain-verifier encoding
@@ -285,5 +363,16 @@ async fn main() -> Result<()> {
   client.verify(&proof, pk.verifying_key(), None).expect("verify");
   let verify_duration = verify_start.elapsed();
   println!("proof verified in {:.3}s", verify_duration.as_secs_f64());
+  if let Some(path) = &args.json {
+    let key = args.constant.clone().unwrap_or_else(|| "env".to_string());
+    write_json_entry(
+      path,
+      &key,
+      serde_json::json!({
+        "prove-time": (prove_duration.as_secs_f64() * 1e6).round() / 1e6,
+        "peak-rss": peak_rss_bytes(),
+      }),
+    )?;
+  }
   Ok(())
 }
diff --git a/zisk/Cargo.lock b/zisk/Cargo.lock
index 57b66b1c..dfcb2928 100644
--- a/zisk/Cargo.lock
+++ b/zisk/Cargo.lock
@@ -5410,6 +5410,16 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
+[[package]]
+name = "terminal_size"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874"
+dependencies = [
+ "rustix",
+ "windows-sys 0.60.2",
+]
+
 [[package]]
 name = "thiserror"
 version = "1.0.69"
@@ -5904,6 +5914,18 @@ dependencies = [
  "tracing-serde",
 ]
 
+[[package]]
+name = "tracing-texray"
+version = "0.2.0"
+source = "git+https://github.com/argumentcomputer/tracing-texray?rev=15ae57cfbcb234ff5911a306898e7d93d396d648#15ae57cfbcb234ff5911a306898e7d93d396d648"
+dependencies = [
+ "loom",
+ "parking_lot",
+ "terminal_size",
+ "tracing",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "try-lock"
 version = "0.2.5"
@@ -7257,7 +7279,9 @@ dependencies = [
  "ix-common",
  "ix-kernel",
  "ixon",
+ "serde_json",
  "tokio",
+ "tracing-texray",
  "zisk-sdk",
 ]
 
diff --git a/zisk/host/Cargo.toml b/zisk/host/Cargo.toml
index d77c0e7a..f9026384 100644
--- a/zisk/host/Cargo.toml
+++ b/zisk/host/Cargo.toml
@@ -17,6 +17,12 @@ ix-kernel = { path = "../../crates/kernel" }
 zisk-sdk = { workspace = true }
 anyhow = "1"
 clap = { version = "4.0", features = ["derive"] }
+# Neutral per-constant results JSON (`--json`), merged by the CI bench driver.
+serde_json = "1"
+# Accurate peak RAM via the process-tree sampler (captures the ASM
+# microservices' child-process memory that `/proc/self/status` misses) and the
+# per-phase timing sink feeding the CI drill-down.
+tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "15ae57cfbcb234ff5911a306898e7d93d396d648" }
 tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
 # Throughput formatting (e.g. `42.0 consts/s`).
 human-repr = "1"
diff --git a/zisk/host/src/main.rs b/zisk/host/src/main.rs
index 6bc6b11f..b3c89ae5 100644
--- a/zisk/host/src/main.rs
+++ b/zisk/host/src/main.rs
@@ -208,6 +208,46 @@ struct Args {
   /// `bench-typecheck --skip-deps`.
   #[arg(long, requires = "constant")]
   skip_deps: bool,
+
+  /// Write the neutral per-constant results JSON `{ "<name>": { … } }` to this
+  /// path (execute → cycles/execute-time/throughput/peak-rss; prove →
+  /// prove-time/steps/peak-rss). Written only on a clean run (zero failures),
+  /// so a present file always holds a valid measurement. Requires `--constant`.
+  /// This is the machine-readable source the CI bench driver merges; the
+  /// human-readable summary still prints regardless.
+  #[arg(long, requires = "constant")]
+  json: Option<PathBuf>,
+
+  /// Write per-phase timings (`{"span","seconds"}` JSON Lines) to this path via
+  /// tracing-texray's sink, for the CI drill-down. Only the host's coarse
+  /// `zisk/execute` / `zisk/prove` phases are recorded; zisk-sdk's own tracing
+  /// spans are not captured yet (no TeXRay subscriber is installed).
+  #[arg(long)]
+  texray_json: Option<PathBuf>,
+}
+
+/// Peak resident set size (bytes) across this process *and its children*, from
+/// tracing-texray's tree sampler. `None` while the sampler reports `0` (before
+/// it is started, or off Linux). Unlike a bare `/proc/self/status` read this
+/// includes Zisk's ASM microservices, which mmap large ROMs in separate PIDs.
+fn peak_rss_bytes() -> Option<u64> {
+  match tracing_texray::rss_sampler::peak_tree_rss_bytes() {
+    0 => None,
+    n => Some(n),
+  }
+}
+
+/// Write the neutral per-constant entry `{ "<name>": <metrics> }` to `path`
+/// (the shape `run.sh` merges with `jq -s`). serde_json handles key escaping so
+/// arbitrary Lean names are safe.
+fn write_json_entry(
+  path: &PathBuf,
+  name: &str,
+  metrics: serde_json::Value,
+) -> Result<()> {
+  let entry = serde_json::json!({ name: metrics });
+  std::fs::write(path, serde_json::to_string(&entry)?)
+    .map_err(|e| anyhow::anyhow!("write {}: {e}", path.display()))
 }
 
 /// 112-byte public output of one shard-guest proof.
@@ -880,18 +920,32 @@ async fn run_constant(
 
   // ---- Execute mode: cycles only, no proof. ----
   if args.execute {
+    let t0 = Instant::now();
     let result = client.execute(&SHARD_PROGRAM, stdin).run()?.await?;
+    let execute_secs = t0.elapsed().as_secs_f64();
+    tracing_texray::json_sink::record_manual("zisk/execute", execute_secs);
     let mut buf = [0u8; SHARD_PUBLICS_LEN];
     result.get_public_values_slice(&mut buf);
     let publics = ShardPublics::decode(&buf);
-    println!(
-      "cycles: {}, failures: {}",
-      result.get_execution_steps(),
-      publics.failures
-    );
+    let cycles = result.get_execution_steps();
+    println!("cycles: {cycles}, failures: {}", publics.failures);
     if publics.failures > 0 {
       bail!("kernel typecheck produced {} failure(s)", publics.failures);
     }
+    if let Some(path) = &args.json {
+      let tput =
+        if execute_secs > 0.0 { cycles as f64 / execute_secs } else { 0.0 };
+      write_json_entry(
+        path,
+        name,
+        serde_json::json!({
+          "cycles": cycles,
+          "execute-time": (execute_secs * 1e6).round() / 1e6,
+          "throughput": tput.round(),
+          "peak-rss": peak_rss_bytes(),
+        }),
+      )?;
+    }
     return Ok(());
   }
 
@@ -901,6 +955,7 @@ async fn run_constant(
   result.get_public_values_slice(&mut buf);
   let publics = ShardPublics::decode(&buf);
   let leaf_ms = result.get_proving_time();
+  tracing_texray::json_sink::record_manual("zisk/prove", leaf_ms as f64 / 1000.0);
   let expected = subject_of_cover(&cover);
   if *expected.as_bytes() != publics.subject_root {
     bail!(
@@ -929,6 +984,17 @@ async fn run_constant(
       publics.failures
     );
   }
+  if let Some(path) = &args.json {
+    write_json_entry(
+      path,
+      name,
+      serde_json::json!({
+        "prove-time": (leaf_ms as f64).round() / 1000.0,
+        "steps": result.get_execution_steps(),
+        "peak-rss": peak_rss_bytes(),
+      }),
+    )?;
+  }
   Ok(())
 }
 
@@ -1596,6 +1662,22 @@ async fn main() -> Result<()> {
 
   let args = Args::parse();
 
+  // Start the process-tree RSS sampler so `peak_rss_bytes()` reflects the ASM
+  // microservices' memory, and point the per-phase timing sink at the drill-down
+  // file if requested. Both are independent of the SDK's global tracing logger.
+  //
+  // TODO(spans): the sink only receives the coarse `zisk/execute` / `zisk/prove`
+  // phases we `record_manual` below. For a finer drill-down (setup, trace-gen,
+  // per-microservice), install a TeXRay subscriber and examine the zisk-sdk's
+  // own tracing spans — which requires composing it with the SDK's global logger
+  // (`zisk_sdk::setup_logger`), currently the sole subscriber.
+  tracing_texray::rss_sampler::start(std::time::Duration::from_millis(50));
+  if let Some(path) = &args.texray_json {
+    if let Some(p) = path.to_str() {
+      let _ = tracing_texray::json_sink::to_file(p);
+    }
+  }
+
   // Collect inputs. No `--ixe` → a single empty env (back-compat).
   let inputs: Vec<Option<PathBuf>> = if args.ixe.is_empty() {
     vec![None]

From 1839a6657ba6811e9327da83b2084c84437af1a6 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 2 Jul 2026 11:14:19 -0400
Subject: [PATCH 08/27] =?UTF-8?q?refactor(ci):=20rename=20backend=20native?=
 =?UTF-8?q?=E2=86=92ooc;=20suffix=20bencher=20workloads=20with=20-check?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Backend `native` is renamed to `ooc` (out-of-circuit) throughout: the
  BACKENDS tuple in bench.py, the run.sh case arm, the `native-check` job
  in bench-main.yml (now `ooc-check`), the `!benchmark` grammar in
  bench-pr.yml, and docs/benchmarking.md.

- Bencher workload names carry an explicit `-check` suffix so the tracked
  identity is self-describing: `aiur-check`, `zisk-check`, `sp1-check`,
  `ooc-check` (was `aiur`, `zisk`, `sp1`, `native-check`). Reset workflow
  tokens (options / valid / accepted / error text), tag-reference comments,
  and the bencher-track action's docstring updated to match.

- Clarify the `!benchmark` grammar: `([aiur] [zisk] [sp1] [ooc] | all)` —
  `all` is an alternative to the whole backend list, not just to `ooc`.
---
 .github/actions/bencher-track/action.yml      |  2 +-
 .github/scripts/bench.py                      | 12 +++---
 .github/scripts/run.sh                        | 18 ++++-----
 .github/workflows/bench-main.yml              | 38 +++++++++----------
 .github/workflows/bench-pr.yml                |  2 +-
 .../workflows/bencher-thresholds-reset.yml    | 14 +++----
 docs/benchmarking.md                          | 12 +++---
 7 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/.github/actions/bencher-track/action.yml b/.github/actions/bencher-track/action.yml
index 498f19af..c8127314 100644
--- a/.github/actions/bencher-track/action.yml
+++ b/.github/actions/bencher-track/action.yml
@@ -12,7 +12,7 @@ inputs:
     description: Bencher testbed slug.
     required: true
   workload:
-    description: Workload key for the `bencher-thresholds-reset-<workload>` tag (e.g. ix-compile, aiur).
+    description: Workload key for the `bencher-thresholds-reset-<workload>` tag (e.g. ix-compile, aiur-check).
     required: true
   file:
     description: Bencher Metric Format JSON file to upload.
diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index a770fc07..0d9ef8a1 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -18,7 +18,7 @@
 
 
 # ───────────────────────── parse ─────────────────────────
-BACKENDS = ("aiur", "zisk", "sp1", "native")
+BACKENDS = ("aiur", "zisk", "sp1", "ooc")
 MODES = ("execute", "prove")
 ENVS = ("initStd", "lean", "mathlib")
 CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_GPU", "BENCH_FULL"}
@@ -29,7 +29,7 @@ def runner_for(backend, mode, gpu):
     """(runs-on label, skip?) for a cell."""
     if backend == "aiur":
         return "warp-ubuntu-latest-x64-32x", False
-    if backend == "native":   # whole-env parallel check; no proving, never skips
+    if backend == "ooc":      # whole-env parallel check; no proving, never skips
         return "warp-ubuntu-latest-x64-32x", False
     if mode == "execute":
         return "warp-ubuntu-latest-x64-16x", False
@@ -138,9 +138,9 @@ def cmd_manifest(a):
     ("sp1", "execute"): ["cycles", "execute-time", "throughput", "peak-rss"],
     ("zisk", "prove"): ["prove-time", "steps", "peak-rss"],
     ("sp1", "prove"): ["prove-time", "peak-rss"],
-    # native is whole-env (one row per env); mode is ignored (it never proves).
-    ("native", "execute"): ["throughput", "check-time", "peak-rss"],
-    ("native", "prove"): ["throughput", "check-time", "peak-rss"],
+    # ooc is whole-env (one row per env); mode is ignored (it never proves).
+    ("ooc", "execute"): ["throughput", "check-time", "peak-rss"],
+    ("ooc", "prove"): ["throughput", "check-time", "peak-rss"],
 }
 
 
@@ -186,7 +186,7 @@ def _phase_details(main_d, pr_d, names):
     for n in names:
         mp, pp = _phases(main_d.get(n, {})), _phases(pr_d.get(n, {}))
         # Only worth a drill-down when there's more than one phase; a lone phase
-        # (zisk/sp1 execute, native check) just restates the headline metric.
+        # (zisk/sp1 execute, ooc check) just restates the headline metric.
         if len(set(mp) | set(pp)) < 2:
             continue
         rows = ["| phase | main (s) | PR (s) | Δ% |", "|---|--:|--:|--:|"]
diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
index 58dd860d..47a59cf5 100644
--- a/.github/scripts/run.sh
+++ b/.github/scripts/run.sh
@@ -7,7 +7,7 @@
 #   run.sh <repo_dir> <env> <backend> <mode> <names_file> <out_json>
 #     repo_dir : checked-out worktree (has .lake/build/bin/{ix,bench-typecheck})
 #     env      : initStd | lean | mathlib  (any case; used verbatim for <env>.ixe)
-#     backend  : aiur | zisk | sp1 | native
+#     backend  : aiur | zisk | sp1 | ooc
 #     mode     : execute | prove
 #
 # `ix` / `bench-typecheck` come from <repo_dir> (so base measures base's code, PR
@@ -141,25 +141,25 @@ case "$backend" in
     emit_empty
     ;;
 
-  native)
-    # Native out-of-circuit Rust kernel (far faster than proving). Two views,
-    # both keyed off the structured line
+  ooc)
+    # Out-of-circuit Rust kernel (far faster than proving). Two views, both keyed
+    # off the structured line
     #   `##check## <elapsed_ms> <passed> <failures> <total> <peak-rss-bytes>`
     # (peak-rss from ix check's tracing-texray tree sampler): the whole env in
     # parallel (`--anon`, keyed by env), and a per-primary subject check
     # (`--consts`, keyed by constant) for an apples-to-apples baseline next to
     # the zkVM `--skip-deps` execute.
-    native_one() {  # <label> <ix-check-args…>  → prints one JSON object
+    ooc_one() {  # <label> <ix-check-args…>  → prints one JSON object
       local label="$1"; shift
       local log="$tmp/n.out"
       ix check-rs "$ixe" "$@" > "$log" 2>>"$log" \
-        || { echo "::warning::native '$label' check failed; dropping" >&2; return; }
+        || { echo "::warning::ooc '$label' check failed; dropping" >&2; return; }
       local line ems fl tot rss
       line=$(grep '^##check##' "$log" | tail -1)
       ems=$(echo "$line" | awk '{print $2}'); fl=$(echo "$line" | awk '{print $4}')
       tot=$(echo "$line" | awk '{print $5}'); rss=$(echo "$line" | awk '{print $6}')
       { [ -n "${tot:-}" ] && [ "${fl:-1}" = 0 ]; } \
-        || { echo "::warning::native '$label': bad ##check## / failures; dropping" >&2; return; }
+        || { echo "::warning::ooc '$label': bad ##check## / failures; dropping" >&2; return; }
       local cs tp
       cs=$(awk -v e="$ems" 'BEGIN{printf "%.3f", e/1000}')
       tp=$(awk -v t="$tot" -v e="$ems" 'BEGIN{ if (e>0) printf "%.2f", t*1000/e; else print 0 }')
@@ -168,10 +168,10 @@ case "$backend" in
         '{($n): {constants:$c, "check-time":$s, throughput:$tp, "peak-rss":$rss}}'
     }
     {
-      native_one "$benv" --anon
+      ooc_one "$benv" --anon
       while IFS= read -r c; do
         [ -z "$c" ] && continue
-        native_one "$c" --consts "$c"
+        ooc_one "$c" --consts "$c"
       done < "$names"
     } | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null
     emit_empty
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index d4134d00..12380f3f 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -9,9 +9,9 @@ name: Benchmarks
 #   3. zkvm-execute — restore that `.ixe` and execute the same constants through
 #                     the Zisk and SP1 zkVM hosts (deterministic cycle counts +
 #                     time/throughput/RAM; proving needs a GPU, so execute-only).
-#   4. native-check — restore that `.ixe` and run the native Rust kernel (the same
-#                     kernel, out-of-circuit and parallel — far faster) over the
-#                     whole env via `ix check --anon`, tracking throughput.
+#   4. ooc-check    — restore that `.ixe` and run the out-of-circuit Rust kernel
+#                     (the same kernel, out-of-circuit and parallel — far faster)
+#                     over the whole env via `ix check --anon`, tracking throughput.
 # Each job reports to its own bencher testbed/workload so a threshold reset only
 # touches its own measures.
 
@@ -231,7 +231,7 @@ jobs:
           jq 'map_values((.phases // {}) as $p | del(.phases) | (. + ($p | with_entries(.key |= "phase:" + .))) | map_values({value: .}))' neutral.json > aiur.json
           cat aiur.json
       # Upload Aiur metrics. Every measure shares the per-workload baseline
-      # window (data points since the aiur reset tag). constants is deterministic
+      # window (data points since the aiur-check reset tag). constants is deterministic
       # → pinned exactly (0/0). fft-cost is deterministic but only ever drops on
       # a real Aiur win, so it rides an upper-only 5% bound (flag a regression,
       # let wins through) rather than a hard pin. prove-time/execute-time,
@@ -244,7 +244,7 @@ jobs:
       - uses: ./.github/actions/bencher-track
         with:
           testbed: aiur-typecheck-x64-32x
-          workload: aiur
+          workload: aiur-check
           file: aiur.json
           key: ${{ secrets.BENCHER_API_KEY }}
           github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -326,7 +326,7 @@ jobs:
       - uses: ./.github/actions/bencher-track
         with:
           testbed: ${{ matrix.backend }}-execute-x64-16x
-          workload: ${{ matrix.backend }}
+          workload: ${{ matrix.backend }}-check
           file: bench.json
           key: ${{ secrets.BENCHER_API_KEY }}
           github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -350,12 +350,12 @@ jobs:
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
             --threshold-lower-boundary 0.10
 
-  # Native Rust kernel typecheck — the same kernel as the zkVM guest, but run
-  # out-of-circuit and in parallel (`--workers` defaults to the core count), so
-  # far faster than proving. Checks the whole env via `ix check --anon`, tracking
-  # throughput (constants/sec), wall time, and peak RAM. Reuses the compile job's
-  # cached `.ixe` and the staged `ix` binary — no recompile.
-  native-check:
+  # Out-of-circuit Rust kernel typecheck — the same kernel as the zkVM guest, but
+  # run out-of-circuit and in parallel (`--workers` defaults to the core count),
+  # so far faster than proving. Checks the whole env via `ix check --anon`,
+  # tracking throughput (constants/sec), wall time, and peak RAM. Reuses the
+  # compile job's cached `.ixe` and the staged `ix` binary — no recompile.
+  ooc-check:
     needs: compile
     runs-on: warp-ubuntu-latest-x64-32x
     timeout-minutes: 60
@@ -388,10 +388,10 @@ jobs:
           path: ${{ matrix.bench }}.ixe
           key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
           fail-on-cache-miss: true
-      # run.sh native runs `ix check --anon` (whole env, parallel) and emits the
+      # run.sh ooc runs `ix check --anon` (whole env, parallel) and emits the
       # neutral { <env>: { constants, check-time, throughput, peak-rss } } — same
-      # path as the !benchmark native backend.
-      - name: Run native kernel check
+      # path as the !benchmark ooc backend.
+      - name: Run out-of-circuit kernel check
         env:
           REUSE_IXE: "1"
         run: |
@@ -400,17 +400,17 @@ jobs:
           benv="${{ matrix.bench }}"; benv="${benv,}"
           python3 .github/scripts/bench.py manifest \
             --csv Benchmarks/Vectors.csv --env "$benv" --mode execute --primary --out names.txt
-          bash .github/scripts/run.sh . "${{ matrix.bench }}" native execute names.txt neutral.json
+          bash .github/scripts/run.sh . "${{ matrix.bench }}" ooc execute names.txt neutral.json
           # Wrap → Bencher Metric Format, flattening `phases` into `phase:<span>`
-          # measures (a no-op for native, which records no spans).
+          # measures (a no-op for ooc, which records no spans).
           jq 'map_values((.phases // {}) as $p | del(.phases) | (. + ($p | with_entries(.key |= "phase:" + .))) | map_values({value: .}))' neutral.json > bench.json
           cat bench.json
       # constants is deterministic → pinned (0/0); check-time / throughput /
       # peak-rss are noisy parallel wall-clock → percentage bounds.
       - uses: ./.github/actions/bencher-track
         with:
-          testbed: native-check-x64-32x
-          workload: native-check
+          testbed: ooc-check-x64-32x
+          workload: ooc-check
           file: bench.json
           key: ${{ secrets.BENCHER_API_KEY }}
           github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index 546bd351..cf801b99 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -1,7 +1,7 @@
 # `!benchmark` PR command: run the curated constant set (Benchmarks/Vectors.csv)
 # through chosen prover backend(s) and post a main-vs-PR comparison table.
 #
-#   !benchmark [aiur] [zisk] [sp1] [native | all]  [execute|prove]
+#   !benchmark ([aiur] [zisk] [sp1] [ooc] | all)  [execute|prove]
 #   BENCH_ENVS=initStd,mathlib     # which compiled envs (default initStd)
 #   BENCH_FULL=1                   # run the full curated set, not the ~11 primary
 #   BENCH_TIER=cheap|heavy|all     # override the mode default (execute=all, prove=cheap)
diff --git a/.github/workflows/bencher-thresholds-reset.yml b/.github/workflows/bencher-thresholds-reset.yml
index d3c3eef5..f05db98a 100644
--- a/.github/workflows/bencher-thresholds-reset.yml
+++ b/.github/workflows/bencher-thresholds-reset.yml
@@ -17,9 +17,9 @@ name: Bencher thresholds reset
 #     `bencher-thresholds-reset:<token>` label on the PR, whatever added it — so a
 #     Triage+ collaborator can queue a reset by applying the label directly, and
 #     cancel by removing it before merge. Naming convention: one label per token,
-#     `bencher-thresholds-reset:<token>` where <token> is `ix-compile`, `aiur`,
-#     `zisk`, `sp1`, `native-check`, or `all` (the merge step expands an `all`
-#     label into every workload). Labeling
+#     `bencher-thresholds-reset:<token>` where <token> is `ix-compile`,
+#     `aiur-check`, `zisk-check`, `sp1-check`, `ooc-check`, or `all` (the merge
+#     step expands an `all` label into every workload). Labeling
 #     requires Triage+, so PR authors from forks cannot self-queue a reset. The
 #     label shares the command/workflow name; the git tag it moves is the same
 #     stem with a dash: `bencher-thresholds-reset-<workload>`.
@@ -37,7 +37,7 @@ on:
         description: Workload baseline to reset
         required: true
         type: choice
-        options: [ix-compile, aiur, zisk, sp1, native-check, all]
+        options: [ix-compile, aiur-check, zisk-check, sp1-check, ooc-check, all]
       sha:
         description: "Commit to anchor to (default: HEAD)"
         required: false
@@ -66,7 +66,7 @@ jobs:
       MERGE_SHA: ${{ github.event.pull_request.merge_commit_sha }}
     steps:
       - run: |
-          valid="ix-compile aiur zisk sp1 native-check"
+          valid="ix-compile aiur-check zisk-check sp1-check ooc-check"
           if [ "$EVENT" = workflow_dispatch ]; then
             # Reset the chosen workload(s) at the given commit; no PR scan.
             sha="${INPUT_SHA:-$HEAD_SHA}"
@@ -120,7 +120,7 @@ jobs:
       - run: |
           # Accepted command tokens — applied verbatim as labels (incl. `all`,
           # which the merge job expands into every workload).
-          accepted="ix-compile aiur zisk sp1 native-check all"
+          accepted="ix-compile aiur-check zisk-check sp1-check ooc-check all"
           # Parse the workload token(s) after the command, lowercased.
           workloads=$(printf '%s' "$BODY" \
             | grep -oiE '!bencher-thresholds-reset[[:space:]]+[a-z0-9 -]+' \
@@ -143,5 +143,5 @@ jobs:
               --body "♻️ Baseline reset queued for:$ok — will anchor to the merge commit when this PR merges."
           else
             gh pr comment "$PR" --repo "$REPO" \
-              --body "⚠️ Reset command matched no known workload (expected \`ix-compile\`, \`aiur\`, \`zisk\`, \`sp1\`, \`native-check\`, or \`all\`). Nothing will reset on merge."
+              --body "⚠️ Reset command matched no known workload (expected \`ix-compile\`, \`aiur-check\`, \`zisk-check\`, \`sp1-check\`, \`ooc-check\`, or \`all\`). Nothing will reset on merge."
           fi
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index c7680829..59f94b37 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -14,12 +14,12 @@ the same backend drivers:
 |---|---|---|
 | `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove) | `fft-cost`, `execute-time`, `prove-time`, `peak-rss`, `constants`, `throughput` |
 | `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `peak-rss` |
-| `native` | the same kernel run **out-of-circuit and in parallel** (`ix check-rs`) — far faster | `throughput` (constants/s), `check-time`, `peak-rss`, `constants` |
+| `ooc` | the same kernel run **out-of-circuit and in parallel** (`ix check-rs`) — far faster | `throughput` (constants/s), `check-time`, `peak-rss`, `constants` |
 
 In **prove** mode, `run.sh` proves each constant whose Aiur fft-cost fits the
 prover RAM ceiling (`AIUR_PROVE_MAX_FFT`, ~128 GB at 2.34 GB per billion fft) and
 falls back to **execute-only** for the rest, so every primary still reports
-metrics. The `native` backend reports two views: the **whole env** (`ix check-rs
+metrics. The `ooc` backend reports two views: the **whole env** (`ix check-rs
 --anon`, keyed by env) and a **per-primary subject check** (`ix check-rs
 --consts`, keyed by constant — apples-to-apples with the zkVM `--skip-deps`
 execute).
@@ -40,7 +40,7 @@ sink, one `{"span","seconds"}` per closed span) to a side file, which `run.sh`
 aggregates into a `phases` object on the constant's entry. `aiur` yields a rich
 breakdown (`aiur/execute`, `aiur/witness`, `stark/fri_open`, …) since the prover
 instruments those spans; `zisk`/`sp1` record a single `execute`/`prove` phase;
-`native` records none. In a `!benchmark` comparison, `bench.py` renders any
+`ooc` records none. In a `!benchmark` comparison, `bench.py` renders any
 multi-phase constant as a collapsible **per-phase timing drill-down** (main vs
 PR seconds + Δ%), so a regression can be traced to the phase that moved.
 
@@ -63,7 +63,7 @@ One CSV is the single source of truth: `name,env,tier,shard_target,primary,aiur_
 Maintainer comment on a PR:
 
 ```
-!benchmark [aiur] [zisk] [sp1] [native | all]  [execute|prove]
+!benchmark ([aiur] [zisk] [sp1] [ooc] | all)  [execute|prove]
 BENCH_ENVS=initStd,mathlib     # which compiled envs (default initStd)
 BENCH_FULL=1                   # run the full curated set, not the ~11 primary
 BENCH_TIER=cheap|heavy|all     # override the mode default (execute=all, prove=cheap)
@@ -78,8 +78,8 @@ note unless a GPU runner is selected.
 
 ## Bencher jobs (`bench-main.yml`)
 
-`build → compile → { prove, zkvm-execute, native-check }`, each reporting to its
-own testbed + **workload** (`aiur`, `zisk`, `sp1`, `native-check`, `ix-compile`).
+`build → compile → { prove, zkvm-execute, ooc-check }`, each reporting to its
+own testbed + **workload** (`aiur-check`, `zisk-check`, `sp1-check`, `ooc-check`, `ix-compile`).
 Deterministic measures (cycles, fft-cost, constants, …) are pinned exactly;
 noisy wall-clock measures (time, RAM, throughput) ride percentage bounds, both
 windowed to the per-workload `bencher-thresholds-reset-<workload>` tag.

From 0797729f1797017a142b5e67ede3fda51e690665 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 2 Jul 2026 14:16:42 -0400
Subject: [PATCH 09/27] refactor(ci): bencher-first !benchmark, unified
 --consts CLI, RAM harness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**bench.py !benchmark surface**
- `fetch-main`: queries bencher.dev's public reports API for `branch=main` at
  the base SHA, filters results by `--names`, and reshapes into the neutral
  `{ "<name>": { "<metric>": v } }` shape. Replaces the actions/cache layer;
  bench-pr.yml falls back to a local base run only when bencher hasn't
  ingested the base SHA yet. TODO left for non-main base branches.
- Mode is fixed per backend (`aiur=prove`, others=`execute`); the optional
  bare `execute` token in `!benchmark` flips aiur to `--execute-only`.
  Grammar drops `[execute|prove]` and `BENCH_GPU`; zkVM prove paths removed.
- Single runner `warp-ubuntu-latest-x64-32x` for every cell; testbeds
  unified to `<backend>-check-x64-32x`; `MAIN_TESTBEDS` maps (backend, mode)
  to the slug.
- `compare` renders the OOM sentinel `{"oom": true}` as `OOM` cells + `n/a`
  Δ%. Regression/improvement flags fire on every metric column (previously
  only the first); the summary counts distinct constants, and its "worst"
  line names both the constant and the metric. `_human` is unit-aware:
  bytes → GiB/MiB, seconds → µs/ms/s/m, counts → K/M/B/T.
- Sub-span (per-constant phase) drill-down removed with a TODO; `run.sh`'s
  `merge_phases` and `bench-main.yml`'s `phase:<span>` flattening still
  populate the neutral JSON for later reinstatement.

**CLI unification** (bench-typecheck, zisk-host, sp1-host)
- Dropped `--constant`; added `--consts <n1,n2,…>` (comma-list, clap
  `value_delimiter` on the Rust hosts) and `--consts-file <path>`.
  Multi-const runs loop the single-const path per name, accumulating one
  entry per name via a merge-safe `write_json_entry`.
- Dropped `--texray-json <path>` and `--no-texray`. `--texray` (bool) alone
  toggles the tracing-texray subscriber; combined with `--json <path>`,
  span timings auto-write to `<json>.spans`.
- Inline `#[cfg(test)] mod cli_tests` in each Rust host covers
  value-delimiter parsing, `--consts` `requires`, and `collect_consts`
  union/dedup.

**Vectors.csv**
- Removed `aiur_fft`, `zisk_cycles` (measurements live in bencher, never
  here). Rows can now omit trailing zero fields; parser tolerates 3+ cols.
- Primary set curated: renamed `Vector.extract_append` →
  `Vector.extract_append._proof_1` (kept as shard_target); promoted
  `Vector.append` and `Nat.sub_le_of_le_add` to primary; added the
  unshardable Init constants from the Zisk cost-model doc's canonical 12
  (`Char.ofOrdinal_le_of_le`, `Array.extract_append._proof_1_1`, the
  `SInt.Int{8,16,32,64}.instRxcHasSize_eq` family with the case corrected),
  plus `ByteArray.utf8DecodeChar?_utf8EncodeChar_append`, `String.append`,
  `IxVMPrim.nat_pow_big`, and
  `Std.Tactic.BVDecide.BVExpr.bitblast.goCache_Inv_of_Inv._mutual`.
  Trimmed obvious duplicates (`Array.qsort`, `Int.ediv`, `List.dropLast`,
  `List.range`, `UInt32.toNat`, `Std.Time.Month.Offset.ofNat`).

**RAM harness** (run.sh, aiur prove)
- Tier gate removed. Every constant attempts a full prove.
- `watch_ram_kill` samples `ps -eo pid,ppid,rss` every ~3 s and SIGKILLs
  the tree if it exceeds `AIUR_PROVE_MAX_RSS_GB` (default 120 GB — 8 GB
  headroom under 128 GB). Killed constants record `{"oom": true}` for the
  compare table.

**Misc**
- `install-zisk`'s description now correctly says the proving key is
  installed (`client.setup()` loads const-trees before execute too).
- `bencher-track`'s workload description enumerates all options.
- `zisk/Cargo.toml`'s patch example uses `/path/to/…`, not `/home/ubuntu`.
- `docs/zisk-cycle-cost-model.md` finding #4 disambiguates
  "not-shardable" (mutual blocks only) from "not full-closure-single-leaf
  provable" (the canonical 12).
- `riscv-bench.yml` still on the temporary `sb/ci-benchmarks` push
  trigger; drop before merge.
---
 .github/actions/bencher-track/action.yml |   2 +-
 .github/actions/install-zisk/action.yml  |   8 +-
 .github/scripts/bench.py                 | 334 ++++++++++++++++-------
 .github/scripts/run.sh                   | 114 +++++---
 .github/workflows/bench-main.yml         |  12 +-
 .github/workflows/bench-pr.yml           |  85 +++---
 .github/workflows/riscv-bench.yml        |   8 +-
 Benchmarks/Typecheck.lean                |  98 +++----
 Benchmarks/Vectors.csv                   | 172 ++++++------
 README.md                                |  27 +-
 docs/benchmarking.md                     |  41 ++-
 docs/zisk-cycle-cost-model.md            |  29 +-
 sp1/host/src/main.rs                     | 220 ++++++++++-----
 zisk/Cargo.toml                          |   2 +-
 zisk/host/src/main.rs                    | 202 ++++++++++----
 15 files changed, 848 insertions(+), 506 deletions(-)

diff --git a/.github/actions/bencher-track/action.yml b/.github/actions/bencher-track/action.yml
index c8127314..e77b2b9e 100644
--- a/.github/actions/bencher-track/action.yml
+++ b/.github/actions/bencher-track/action.yml
@@ -12,7 +12,7 @@ inputs:
     description: Bencher testbed slug.
     required: true
   workload:
-    description: Workload key for the `bencher-thresholds-reset-<workload>` tag (e.g. ix-compile, aiur-check).
+    description: Workload key for the `bencher-thresholds-reset-<workload>` tag (ix-compile, aiur-check, zisk-check, sp1-check, ooc-check).
     required: true
   file:
     description: Bencher Metric Format JSON file to upload.
diff --git a/.github/actions/install-zisk/action.yml b/.github/actions/install-zisk/action.yml
index 7b01c6e3..e3b1162b 100644
--- a/.github/actions/install-zisk/action.yml
+++ b/.github/actions/install-zisk/action.yml
@@ -1,8 +1,10 @@
 name: Install Zisk
 description: >-
-  Install the system build deps and the ZisK zkVM toolchain (ziskup, CPU build,
-  no proving keys) needed to build and run the Zisk host. Assumes a Rust
-  toolchain is already set up.
+  Install the system build deps, the ZisK zkVM toolchain (ziskup, CPU build),
+  and the fork-matching proving key needed to build and run the Zisk host.
+  Execute needs the key too — zisk-host's `client.setup()` loads the circuit's
+  const-tree files before either the execute or the prove branch. Assumes a
+  Rust toolchain is already set up.
 
 runs:
   using: composite
diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index 0d9ef8a1..b6990d31 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -1,10 +1,11 @@
 #!/usr/bin/env python3
 """All data-wrangling for the `!benchmark` PR workflow, as subcommands:
 
-  parse     COMMENT_BODY env → matrix + config (writes $GITHUB_OUTPUT)
-  manifest  Benchmarks/Vectors.csv → the constant names for one cell
-  compare   main.json + pr.json   → a Markdown main-vs-PR table
-  comment   per-cell table files  → the final PR comment body
+  parse       COMMENT_BODY env → matrix + config (writes $GITHUB_OUTPUT)
+  manifest    Benchmarks/Vectors.csv → the constant names for one cell
+  fetch-main  base SHA + cell → main.json pulled from bencher.dev
+  compare     main.json + pr.json → a Markdown main-vs-PR table
+  comment     per-cell table files → the final PR comment body
 
 The neutral results JSON every backend normalises to (see run.sh) is
 `{ "<name>": { "<metric>": <number>, ... }, ... }`. All metrics are
@@ -12,30 +13,25 @@
 """
 import argparse
 import glob
-import hashlib
 import json
 import os
+import urllib.parse
+import urllib.request
 
 
 # ───────────────────────── parse ─────────────────────────
-BACKENDS = ("aiur", "zisk", "sp1", "ooc")
-MODES = ("execute", "prove")
+# Default mode per backend. Aiur is the only backend with a real choice:
+# `prove` (default) is the full pipeline; `execute` skips Phase 2 (--execute-only)
+# and reports the fft-cost / execute-time subset. Users opt in with the bare
+# `execute` token in `!benchmark`. The zkVMs and ooc are always execute.
+DEFAULT_MODE = {"aiur": "prove", "zisk": "execute", "sp1": "execute", "ooc": "execute"}
+BACKENDS = tuple(DEFAULT_MODE)
 ENVS = ("initStd", "lean", "mathlib")
-CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_GPU", "BENCH_FULL"}
+CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_FULL"}
 PASSTHROUGH_KEYS = {"RUST_LOG", "WITHOUT_VK_VERIFICATION", "RUSTFLAGS"}
 
 
-def runner_for(backend, mode, gpu):
-    """(runs-on label, skip?) for a cell."""
-    if backend == "aiur":
-        return "warp-ubuntu-latest-x64-32x", False
-    if backend == "ooc":      # whole-env parallel check; no proving, never skips
-        return "warp-ubuntu-latest-x64-32x", False
-    if mode == "execute":
-        return "warp-ubuntu-latest-x64-16x", False
-    if gpu:                       # zkVM prove needs a GPU
-        return "self-hosted-gpu", False
-    return "ubuntu-latest", True
+RUNNER = "warp-ubuntu-latest-x64-32x"
 
 
 def cmd_parse(_a):
@@ -44,14 +40,14 @@ def cmd_parse(_a):
     cmd = next((ln for ln in lines if "!benchmark" in ln), "")
     toks = cmd.split("!benchmark", 1)[1].split() if "!benchmark" in cmd else []
 
-    backends, mode = [], "execute"
+    backends, execute_flag = [], False
     for t in (t.lower() for t in toks):
         if t == "all":
             backends = list(BACKENDS)
         elif t in BACKENDS and t not in backends:
             backends.append(t)
-        elif t in MODES:
-            mode = t
+        elif t == "execute":
+            execute_flag = True
     if not backends:
         backends = ["aiur"]
 
@@ -73,24 +69,29 @@ def cmd_parse(_a):
         tier = ""             # empty ⇒ derived from mode at manifest time
     shard = "1" if cfg.get("BENCH_SHARD") == "1" else "0"
     full = "1" if cfg.get("BENCH_FULL") == "1" else "0"  # full set vs primary subset
-    gpu = cfg.get("BENCH_GPU") == "1"
+
+    def mode_for(b):
+        # `execute` only affects aiur (the zkVMs and ooc are execute-only anyway).
+        return "execute" if (b == "aiur" and execute_flag) else DEFAULT_MODE[b]
 
     cells = []
     for b in backends:
+        m = mode_for(b)
         for e in envs:
-            runner, skip = runner_for(b, mode, gpu)
-            cells.append({"backend": b, "env": e, "mode": mode, "runner": runner,
-                          "skip": "true" if skip else "false", "label": f"{b}-{e}-{mode}"})
-
-    summary = (f"backends: `{' '.join(backends)}` · mode: `{mode}` · "
-               f"envs: `{','.join(envs)}` · set: `{'full' if full == '1' else 'primary'}` · "
-               f"tier: `{tier or 'auto'}` · shard: `{shard}` · gpu: `{int(gpu)}`")
+            cells.append({"backend": b, "env": e, "mode": m,
+                          "runner": RUNNER,
+                          "label": f"{b}-{e}-{m}"})
+
+    modes = " ".join(f"{b}={mode_for(b)}" for b in backends)
+    summary = (f"backends: `{modes}` · envs: `{','.join(envs)}` · "
+               f"set: `{'full' if full == '1' else 'primary'}` · "
+               f"tier: `{tier or 'auto'}` · shard: `{shard}`")
     if passthrough:
         summary += " · env: `" + " ".join(passthrough) + "`"
 
     with open(os.environ.get("GITHUB_OUTPUT", "/dev/stdout"), "a") as f:
         f.write(f"matrix={json.dumps(cells)}\n")
-        f.write(f"mode={mode}\ntier={tier}\nshard={shard}\nfull={full}\n")
+        f.write(f"tier={tier}\nshard={shard}\nfull={full}\n")
         f.write(f"config-summary={summary}\n")
         f.write("passthrough-env<<PTENV\n" + "\n".join(passthrough)
                 + ("\n" if passthrough else "") + "PTENV\n")
@@ -101,8 +102,9 @@ def cmd_parse(_a):
 # ──────────────────────── manifest ────────────────────────
 def cmd_manifest(a):
     # prove defaults to the cheap tier to keep the full set bounded; the curated
-    # primary subset is exempt — run.sh proves each primary that fits the Aiur RAM
-    # ceiling and execute-only's the rest, so all primaries are selected here.
+    # primary subset is exempt — run.sh attempts a full prove of every selected
+    # constant (its RAM watchdog records an OOM sentinel rather than gating on
+    # tier), so all primaries are selected here, heavy ones included.
     tier = a.tier or ("cheap" if (a.mode == "prove" and not a.primary) else "all")
     names = []
     with open(a.csv) as f:
@@ -111,9 +113,13 @@ def cmd_manifest(a):
             if not row or row.startswith("#"):
                 continue
             cols = row.split(",")
-            if cols[0] == "name" or len(cols) < 4:
+            if cols[0] == "name" or len(cols) < 3:
                 continue
-            name, env, ctier, shard = cols[:4]
+            # `shard_target` and `primary` default to "0" when the column is
+            # omitted, so rows can drop trailing zero fields (most only carry
+            # the first three).
+            name, env, ctier = cols[:3]
+            shard = cols[3] if len(cols) >= 4 else "0"
             rep = cols[4] if len(cols) >= 5 else "0"
             if env != a.env:
                 continue
@@ -126,21 +132,20 @@ def cmd_manifest(a):
             names.append(name)
     with open(a.out, "w") as f:
         f.write("\n".join(names) + ("\n" if names else ""))
-    vhash = hashlib.sha256(open(a.csv, "rb").read()).hexdigest()[:16]
-    print(f"count={len(names)}\nvhash={vhash}\ntier={tier}")
+    print(f"count={len(names)}\ntier={tier}")
 
 
 # ───────────────────────── compare ─────────────────────────
+# Compare-column set per (backend, mode). Aiur has both modes: `prove` shows
+# the full execute+prove metric set; `execute` is a subset (Phase 1 only, no
+# prove-time / no throughput). Bencher stores only the prove set for main —
+# `execute` mode filters that same JSON down to the execute-side columns.
 METRICS = {
-    ("aiur", "execute"): ["fft-cost", "execute-time"],
-    ("aiur", "prove"): ["prove-time", "peak-rss"],
-    ("zisk", "execute"): ["cycles", "execute-time", "throughput", "peak-rss"],
-    ("sp1", "execute"): ["cycles", "execute-time", "throughput", "peak-rss"],
-    ("zisk", "prove"): ["prove-time", "steps", "peak-rss"],
-    ("sp1", "prove"): ["prove-time", "peak-rss"],
-    # ooc is whole-env (one row per env); mode is ignored (it never proves).
-    ("ooc", "execute"): ["throughput", "check-time", "peak-rss"],
-    ("ooc", "prove"): ["throughput", "check-time", "peak-rss"],
+    ("aiur", "prove"):    ["fft-cost", "execute-time", "prove-time", "peak-rss"],
+    ("aiur", "execute"):  ["fft-cost", "execute-time", "peak-rss"],
+    ("zisk", "execute"):  ["cycles", "execute-time", "throughput", "peak-rss"],
+    ("sp1",  "execute"):  ["cycles", "execute-time", "throughput", "peak-rss"],
+    ("ooc",  "execute"):  ["throughput", "check-time", "peak-rss"],
 }
 
 
@@ -149,14 +154,78 @@ def _num(d, name, metric):
     return v if isinstance(v, (int, float)) else None
 
 
-def _human(v):
-    if v is None:
-        return "n/a"
+# Per-metric formatting kind. Metric names are the neutral-JSON keys the tools
+# emit (see METRICS above). Unknown metrics fall through to `_human_auto`.
+_METRIC_KIND = {
+    # bytes
+    "peak-rss": "bytes",
+    "file-size": "bytes",
+    # seconds
+    "execute-time": "seconds",
+    "prove-time": "seconds",
+    "check-time": "seconds",
+    "compile-time": "seconds",
+    # large counts (10^6+ typical)
+    "fft-cost": "count",
+    "cycles": "count",
+    "steps": "count",
+    "max-shard-cycles": "count",
+    "throughput": "count",
+    # small integers
+    "constants": "int",
+    "shards": "int",
+}
+
+
+def _human_bytes(v):
+    v = float(v)
+    for unit in ("B", "KiB", "MiB", "GiB", "TiB"):
+        if abs(v) < 1024:
+            return f"{int(v):,} {unit}" if unit == "B" else f"{v:,.2f} {unit}"
+        v /= 1024
+    return f"{v:,.2f} PiB"
+
+
+def _human_seconds(v):
+    v = float(v)
+    if abs(v) < 1e-3:
+        return f"{v * 1e6:.1f} µs"
+    if abs(v) < 1:
+        return f"{v * 1e3:.1f} ms"
+    if abs(v) < 60:
+        return f"{v:.3f} s"
+    m, s = divmod(v, 60)
+    return f"{int(m)}m {s:.1f}s"
+
+
+def _human_count(v):
+    v = float(v)
+    if abs(v) < 1000:
+        return f"{int(v):,}" if v == int(v) else f"{v:.3f}"
+    for unit in ("K", "M", "B", "T"):
+        v /= 1000
+        if abs(v) < 1000:
+            return f"{v:.2f}{unit}"
+    return f"{v:.2f}Q"
+
+
+def _human_auto(v):
     if isinstance(v, int) or (isinstance(v, float) and v.is_integer()):
         return f"{int(v):,}"
     return f"{v:,.3f}"
 
 
+def _human(v, metric=None):
+    if v is None:
+        return "n/a"
+    kind = _METRIC_KIND.get(metric, "auto")
+    if kind == "bytes":   return _human_bytes(v)
+    if kind == "seconds": return _human_seconds(v)
+    if kind == "count":   return _human_count(v)
+    if kind == "int":     return f"{int(v):,}"
+    return _human_auto(v)
+
+
 def _delta(main, pr):
     if main is None or pr is None or main == 0:
         return None
@@ -172,38 +241,13 @@ def _load(path):
         return {}
 
 
-def _phases(entry):
-    """The `phases` object (span → seconds) on a constant's entry, or {}."""
-    p = entry.get("phases") if isinstance(entry, dict) else None
-    return p if isinstance(p, dict) else {}
-
-
-def _phase_details(main_d, pr_d, names):
-    """Collapsible per-constant phase (span) timing tables — the drill-down that
-    shows *where* time moved between main and PR. Emitted only for constants that
-    carry tracing-texray span data."""
-    blocks = []
-    for n in names:
-        mp, pp = _phases(main_d.get(n, {})), _phases(pr_d.get(n, {}))
-        # Only worth a drill-down when there's more than one phase; a lone phase
-        # (zisk/sp1 execute, ooc check) just restates the headline metric.
-        if len(set(mp) | set(pp)) < 2:
-            continue
-        rows = ["| phase | main (s) | PR (s) | Δ% |", "|---|--:|--:|--:|"]
-        # Slowest-on-PR (else main) first, so the dominant phase leads.
-        spans = sorted(set(mp) | set(pp),
-                       key=lambda s: -(pp.get(s) if isinstance(pp.get(s), (int, float))
-                                       else mp.get(s) if isinstance(mp.get(s), (int, float)) else 0))
-        for s in spans:
-            mv, pv = mp.get(s), pp.get(s)
-            mv = mv if isinstance(mv, (int, float)) else None
-            pv = pv if isinstance(pv, (int, float)) else None
-            dp = _delta(mv, pv)
-            rows.append(f"| `{s}` | {_human(mv)} | {_human(pv)} | "
-                        f"{'n/a' if dp is None else f'{dp:+.1f}%'} |")
-        blocks.append(f"<details><summary><code>{n}</code> — phase breakdown</summary>\n\n"
-                      + "\n".join(rows) + "\n\n</details>")
-    return blocks
+# TODO: re-add the per-constant Aiur phase (sub-span) drill-down. run.sh's
+# `merge_phases` still folds tracing-texray JSON-Lines into `phases: { span:
+# seconds }` on each entry; the compare renderer previously emitted a
+# collapsible `<details>` block per constant showing main-vs-PR per-span deltas
+# so a regression could be traced to `aiur/execute`, `aiur/witness`,
+# `stark/fri_open`, etc. Removed while the compare surface is being stabilised;
+# reinstate once we've settled on the primary table's flag/threshold semantics.
 
 
 def cmd_compare(a):
@@ -212,9 +256,9 @@ def cmd_compare(a):
         raise SystemExit("compare: pass --metric or a known --backend/--mode")
     title = a.title
     if title is None and a.backend:
-        hit = "hit (cached)" if a.cache_hit == "true" else "miss (ran main)"
+        src = a.main_source or "unknown"
         cnt = f"{a.count} constants · " if a.count else ""
-        title = f"### `{a.backend}` · `{a.env}` · `{a.mode}` — {cnt}main cache: {hit}"
+        title = f"### `{a.backend}` · `{a.env}` · `{a.mode}` — {cnt}main from: {src}"
 
     def emit(text):
         if a.out:
@@ -239,35 +283,40 @@ def emit(text):
         head += [f"{m} (main)", f"{m} (PR)", "Δ%"]
     rows = ["| " + " | ".join(head) + " |", "|" + "|".join(["---"] * len(head)) + "|"]
 
-    n_reg = n_imp = 0
-    worst = None
+    def _oom(d, n):
+        return isinstance(d.get(n), dict) and d[n].get("oom") is True
+
+    regressed, improved = set(), set()
+    worst = None  # (dp, name, metric)
     for n in names:
         cells = [f"`{n}`"]
-        for i, m in enumerate(metrics):
+        main_oom, pr_oom = _oom(main_d, n), _oom(pr_d, n)
+        for m in metrics:
             mv, pv = _num(main_d, n, m), _num(pr_d, n, m)
+            mv_h = "OOM" if main_oom else _human(mv, m)
+            pv_h = "OOM" if pr_oom else _human(pv, m)
+            if main_oom or pr_oom:
+                cells += [mv_h, pv_h, "n/a"]
+                continue
             dp = _delta(mv, pv)
             cell = "n/a" if dp is None else f"{dp:+.1f}%"
-            if i == 0 and dp is not None:
+            if dp is not None:
                 if dp > a.threshold:
-                    cell += " ⚠️"; n_reg += 1
+                    cell += " ⚠️"; regressed.add(n)
                 elif dp < -a.threshold:
-                    cell += " 🟢"; n_imp += 1
+                    cell += " 🟢"; improved.add(n)
                 if worst is None or dp > worst[0]:
-                    worst = (dp, n)
-            cells += [_human(mv), _human(pv), cell]
+                    worst = (dp, n, m)
+            cells += [mv_h, pv_h, cell]
         rows.append("| " + " | ".join(cells) + " |")
 
     out = ([title, ""] if title else []) + rows + [""]
-    s = (f"_{len(names)} constants · {n_reg} regressed · {n_imp} improved "
-         f"(|Δ| > {a.threshold:g}% on `{primary}`)._")
+    s = (f"_{len(names)} constants · {len(regressed)} regressed · "
+         f"{len(improved)} improved (|Δ| > {a.threshold:g}% on any metric)._")
     if worst and worst[0] is not None and worst[0] > a.threshold:
-        s += f" Worst: `{worst[1]}` {worst[0]:+.1f}%."
+        s += f" Worst: `{worst[1]}` `{worst[2]}` {worst[0]:+.1f}%."
     out.append(s)
-    details = _phase_details(main_d, pr_d, names)
-    if details:
-        out += ["", "<details><summary>Per-phase timing drill-down</summary>", ""]
-        out += details
-        out += ["", "</details>"]
+    # TODO: emit per-constant phase drill-down (see the TODO above cmd_compare).
     emit("\n".join(out))
 
 
@@ -286,6 +335,83 @@ def cmd_comment(a):
     print(open(a.out).read())
 
 
+# ─────────────────────── fetch-main ──────────────────────
+# Testbeds bench-main.yml uploads to, keyed by (backend, mode). Only pairs
+# whose columns main's uploads can supply land here — anything else (e.g.
+# `zisk prove`, `sp1 prove`) has no bencher data; fetch-main exits non-zero
+# for those and bench-pr.yml falls back to running main locally.
+MAIN_TESTBEDS = {
+    # `aiur execute` uses the same testbed as `aiur prove` — bencher stores
+    # only prove and the execute columns are extracted from that JSON.
+    ("aiur", "prove"):    "aiur-check-x64-32x",
+    ("aiur", "execute"):  "aiur-check-x64-32x",
+    ("zisk", "execute"):  "zisk-check-x64-32x",
+    ("sp1",  "execute"):  "sp1-check-x64-32x",
+    ("ooc",  "execute"):  "ooc-check-x64-32x",
+}
+
+
+def cmd_fetch_main(a):
+    """Pull the base SHA's neutral results JSON from bencher.dev.
+
+    Exits 2 if (backend, mode) isn't a combination main runs. Exits 3 if
+    bencher has no report at that hash yet (freshly-pushed main whose CI is
+    still ingesting) or the request failed. Callers fall back to running
+    main locally on any non-zero exit.
+    """
+    testbed = MAIN_TESTBEDS.get((a.backend, a.mode))
+    if not testbed:
+        print(f"fetch-main: no main testbed for {a.backend}/{a.mode}")
+        raise SystemExit(2)
+    wanted = set(open(a.names).read().split()) if a.names else None
+    # TODO: support any base/PR branch, not just `main`. Today bench-main.yml
+    # only runs on push to main and this query hardcodes `branch=main`, so a PR
+    # against a non-main base branch (e.g. a long-running feature branch) always
+    # falls through to the local base-run path. To generalise: (1) let
+    # bench-main.yml (or a sibling) upload reports for other tracked branches,
+    # (2) plumb `--branch` here from `github.base_ref` in bench-pr.yml, (3) fall
+    # back to `main` when the base branch has no bencher data.
+    params = {"branch": "main", "testbed": testbed, "per_page": 255}
+    url = "https://api.bencher.dev/v0/projects/ix/reports?" + urllib.parse.urlencode(params)
+    try:
+        with urllib.request.urlopen(url, timeout=15) as f:
+            reports = json.load(f)
+    except Exception as e:
+        print(f"fetch-main: bencher API error: {e}")
+        raise SystemExit(3)
+    # Bencher stores the git hash at `branch.head.version.hash`.
+    at_sha = [
+        r for r in reports
+        if (((r.get("branch") or {}).get("head") or {}).get("version") or {}).get("hash") == a.sha
+    ]
+    if not at_sha:
+        print(f"fetch-main: no reports for {a.backend}/{a.mode} @ {a.sha[:8]}")
+        raise SystemExit(3)
+    # Matrix envs upload separately to the same testbed at the same commit,
+    # each contributing its own benchmark subset — aggregate across reports.
+    # Filter/emit by `name` (Bencher's `slug` is a lower-kebab-cased derivation
+    # that would mangle Lean names like `Nat.add_comm` → `nat-add-comm`).
+    out = {}
+    for r in at_sha:
+        for iteration in r.get("results", []):
+            for bench in iteration:
+                name = bench["benchmark"]["name"]
+                if wanted is not None and name not in wanted:
+                    continue
+                metrics = {
+                    m["measure"]["name"]: m["metric"]["value"]
+                    for m in bench.get("measures", [])
+                }
+                if metrics:
+                    out[name] = metrics
+    if not out:
+        print(f"fetch-main: reports found but no matching benchmarks in --names")
+        raise SystemExit(3)
+    with open(a.out, "w") as f:
+        json.dump(out, f)
+    print(f"fetch-main: {len(out)} constant(s) from bencher for {a.backend}/{a.mode}")
+
+
 # ────────────────────────── cli ──────────────────────────
 def main():
     ap = argparse.ArgumentParser(description=__doc__)
@@ -301,12 +427,20 @@ def main():
                    help="Restrict to the primary subset (the primary=1 column).")
     m.set_defaults(fn=cmd_manifest)
 
+    fm = sub.add_parser("fetch-main")
+    fm.add_argument("--sha", required=True)
+    fm.add_argument("--backend", required=True)
+    fm.add_argument("--mode", required=True)
+    fm.add_argument("--names", help="Only fetch benchmarks whose names appear in this file.")
+    fm.add_argument("--out", required=True)
+    fm.set_defaults(fn=cmd_fetch_main)
+
     c = sub.add_parser("compare")
     c.add_argument("--main", required=True); c.add_argument("--pr", required=True)
     c.add_argument("--metric", action="append", default=[])
     c.add_argument("--threshold", type=float, default=3.0)
     c.add_argument("--title"); c.add_argument("--backend"); c.add_argument("--env")
-    c.add_argument("--mode"); c.add_argument("--count"); c.add_argument("--cache-hit", default="")
+    c.add_argument("--mode"); c.add_argument("--count"); c.add_argument("--main-source", default="")
     c.add_argument("--out")
     c.set_defaults(fn=cmd_compare)
 
diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
index 47a59cf5..10992130 100644
--- a/.github/scripts/run.sh
+++ b/.github/scripts/run.sh
@@ -39,6 +39,38 @@ merge_phases() {  # <results.json> <spans.jsonl>
     && mv "$res.p" "$res" || true
 }
 
+# Background RAM watchdog. Every ~3 s, sum RSS across `root_pid` and every
+# descendant (via `ps -eo pid,ppid,rss` + a small BFS); when the total exceeds
+# `max_gb`, touch `marker` and SIGKILL the sampled tree. Callers detect the
+# kill by testing `-f "$marker"` after wait. Descendants are signalled before
+# their ancestors (root last) so none is reparented and left running; anything
+# spawned after the sample was taken can still escape.
+watch_ram_kill() {  # <root_pid> <max_gb> <marker>
+  local root_pid=$1 max_gb=$2 marker=$3
+  local max_kb=$((max_gb * 1024 * 1024)) total_kb sample
+  while kill -0 "$root_pid" 2>/dev/null; do
+    sample=$(ps -eo pid,ppid,rss --no-headers 2>/dev/null | awk -v root="$root_pid" '
+      { rss[$1]=$3; parent[$1]=$2 }
+      END {  # "in" tests membership without creating entries, so only tree members are summed
+        alive[root]=1; changed=1
+        while (changed) {
+          changed=0
+          for (p in parent) if ((parent[p] in alive) && !(p in alive)) { alive[p]=1; pids=p " " pids; changed=1 }
+        }
+        s=0; for (p in alive) s += rss[p]+0
+        print s, pids root
+      }')
+    total_kb=${sample%% *}   # first field: summed RSS in kB; remaining fields: pids, root last
+    if [ -n "$total_kb" ] && [ "$total_kb" -gt "$max_kb" ]; then
+      echo "::warning::RAM watchdog: killing pid=$root_pid tree-RSS=${total_kb}kB > ${max_kb}kB (~${max_gb} GB)" >&2
+      : > "$marker"
+      kill -KILL ${sample#* } 2>/dev/null || true   # unquoted on purpose: one argument per pid
+      return
+    fi
+    sleep 3
+  done
+}
+
 # `$benv` is used verbatim for the `.ixe` filename (bench-pr compiles `initStd.ixe`;
 # the bencher jobs reuse the compile job's cached `InitStd.ixe`), and lowercased
 # only to pick the Compile module.
@@ -63,35 +95,40 @@ tmp=$(mktemp -d)
 case "$backend" in
   aiur)
     # One bench-typecheck per constant (isolation + per-constant peak-rss).
-    # Execute mode → Phase 1 only (--execute-only). Prove mode → prove each
-    # constant whose Aiur fft-cost fits the prover RAM ceiling (~128 GB at
-    # 2.34 GB per billion fft), else fall back to execute-only so a too-large
-    # single-shard prove never OOM-kills the job.
-    csv="$repo/Benchmarks/Vectors.csv"
-    ceil=${AIUR_PROVE_MAX_FFT:-50000000000}
+    # Execute mode → Phase 1 only (--execute-only). Prove mode → always attempt
+    # a full prove (no tier gate). A RAM watchdog SIGKILLs the process tree if
+    # its tree-RSS approaches the runner's ceiling; the constant then records
+    # the neutral OOM sentinel `{ "<name>": {"oom": true} }` so bench.py compare
+    # renders `OOM` in that row instead of dropping it.
+    ceiling_gb=${AIUR_PROVE_MAX_RSS_GB:-120}
     while IFS= read -r c; do
       [ -z "$c" ] && continue
       slug=$(printf '%s' "$c" | tr '/ .:' '____')
-      res="$tmp/$slug.json"; spans="$tmp/$slug.spans"
-      do_prove=0
-      if [ "$mode" = prove ]; then
-        fft=$(awk -F, -v n="$c" '$1==n {print $6}' "$csv" 2>/dev/null)
-        if [ -n "${fft:-}" ] && [ "$fft" -le "$ceil" ]; then
-          do_prove=1
-        else
-          echo "::notice::aiur: '$c' fft=${fft:-?} exceeds $ceil (~128 GB); execute-only" >&2
-        fi
-      fi
+      res="$tmp/$slug.json"; spans="$res.spans"; oom="$tmp/$slug.oom"
+      rm -f "$oom"
       # bench-typecheck self-reports peak-rss (texray tree sampler) in its --json;
-      # --texray-json captures the per-phase aiur/*, stark/* timings.
-      if [ "$do_prove" = 1 ]; then
-        bench-typecheck --ixe "$ixe" "$c" --json "$res" --texray-json "$spans" \
+      # with --texray + --json it also writes per-phase aiur/*, stark/* timings
+      # to `<json>.spans` for the drill-down.
+      if [ "$mode" = execute ]; then
+        bench-typecheck --ixe "$ixe" --consts "$c" --json "$res" --execute-only --texray \
           > "$tmp/$slug.log" 2>&1 \
-          || { echo "::warning::aiur prove '$c' failed (OOM/timeout); dropping" >&2; continue; }
-      else
-        bench-typecheck --ixe "$ixe" "$c" --json "$res" --execute-only \
-          --texray-json "$spans" > "$tmp/$slug.log" 2>&1 \
           || { echo "::warning::aiur execute '$c' failed; dropping" >&2; continue; }
+      else
+        ( bench-typecheck --ixe "$ixe" --consts "$c" --json "$res" --texray ) \
+          > "$tmp/$slug.log" 2>&1 &
+        bt_pid=$!
+        watch_ram_kill "$bt_pid" "$ceiling_gb" "$oom" &
+        w_pid=$!
+        wait "$bt_pid" 2>/dev/null; bt_exit=$?
+        kill "$w_pid" 2>/dev/null || true
+        wait "$w_pid" 2>/dev/null || true
+        if [ -f "$oom" ]; then
+          echo "::warning::aiur prove '$c' OOM-killed at ${ceiling_gb} GB" >&2
+          jq -n --arg n "$c" '{($n): {oom: true}}' > "$res"
+        elif [ "$bt_exit" -ne 0 ]; then
+          echo "::warning::aiur prove '$c' failed (exit $bt_exit); dropping" >&2
+          continue
+        fi
       fi
       merge_phases "$res" "$spans"
       [ -s "$res" ] && cat "$res"
@@ -100,12 +137,17 @@ case "$backend" in
     ;;
 
   zisk|sp1)
+    # zkVM prove is not currently wired up (no GPU runner), so this branch runs
+    # execute only. The workflow filters `zisk|sp1 prove` at parse time.
+    if [ "$mode" != execute ]; then
+      echo "::error::$backend $mode: only execute mode is supported" >&2
+      emit_empty; exit 2
+    fi
     host="${backend}-host"; work="$repo/$backend"
     # Build the host once so per-constant timing excludes compilation. The host
     # self-measures and writes its own neutral results JSON via `--json`
-    # (cycles/execute-time/throughput/peak-rss for execute; prove-time/… for
-    # prove), so there is nothing to grep — `timeout` only bounds a runaway
-    # constant.
+    # (cycles/execute-time/throughput/peak-rss), so there is nothing to grep —
+    # `timeout` only bounds a runaway constant.
     echo "::group::cargo build $host"
     ( cd "$work" && cargo build --quiet --release --bin "$host" )
     echo "::endgroup::"
@@ -116,24 +158,14 @@ case "$backend" in
     # in-session as root; the host children inherit it. Without this the ASM
     # services die with `mmap(rom) errno=11`. SP1 needs no such raise.
     [ "$backend" = zisk ] && sudo prlimit --pid $$ --memlock=unlimited:unlimited
-    # zisk proves with `--gpu`; sp1 selects the GPU prover via its env/features.
-    gpu=; [ "$backend" = zisk ] && gpu=--gpu
     while IFS= read -r c; do
       [ -z "$c" ] && continue
       slug=$(printf '%s' "$c" | tr '/ .:' '____')
-      res="$tmp/$slug.json"; log="$tmp/$slug.log"; spans="$tmp/$slug.spans"
-      if [ "$mode" = execute ]; then
-        ( cd "$work" && timeout 25m "$bin" --execute --ixe "$ixe" \
-            --constant "$c" --skip-deps --json "$res" --texray-json "$spans" ) \
-          > "$log" 2>&1 \
-          || { echo "::warning::$backend execute '$c' failed/timed out; dropping" >&2; continue; }
-      else
-        # prove (GPU runner only — the workflow gates this cell).
-        ( cd "$work" && timeout 60m "$bin" $gpu --ixe "$ixe" \
-            --constant "$c" --skip-deps --json "$res" --texray-json "$spans" ) \
-          > "$log" 2>&1 \
-          || { echo "::warning::$backend prove '$c' failed/timed out; dropping" >&2; continue; }
-      fi
+      res="$tmp/$slug.json"; log="$tmp/$slug.log"; spans="$res.spans"
+      ( cd "$work" && timeout 25m "$bin" --execute --ixe "$ixe" \
+          --consts "$c" --skip-deps --json "$res" --texray ) \
+        > "$log" 2>&1 \
+        || { echo "::warning::$backend execute '$c' failed/timed out; dropping" >&2; continue; }
       # The host writes $res only on a clean (zero-failure) run.
       merge_phases "$res" "$spans"
       [ -s "$res" ] && cat "$res"
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index 12380f3f..228e60dd 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -217,9 +217,9 @@ jobs:
         env:
           REUSE_IXE: "1"
         run: |
-          # All primaries: run.sh proves each whose Aiur fft-cost fits ~128 GB and
-          # execute-only's the rest (so heavy primaries still report execute
-          # metrics). Per-constant peak-rss, same path as the !benchmark PR run.
+          # All primaries: run.sh attempts a full prove of each; its RAM watchdog
+          # kills any that outgrow the runner and records an OOM sentinel instead.
+          # Per-constant peak-rss, same path as the !benchmark PR run.
           benv="${{ matrix.bench }}"; benv="${benv,}"   # InitStd→initStd, Mathlib→mathlib
           python3 .github/scripts/bench.py manifest \
             --csv Benchmarks/Vectors.csv --env "$benv" --mode prove --primary --out names.txt
@@ -243,7 +243,7 @@ jobs:
       # does the phase-level alerting).
       - uses: ./.github/actions/bencher-track
         with:
-          testbed: aiur-typecheck-x64-32x
+          testbed: aiur-check-x64-32x
           workload: aiur-check
           file: aiur.json
           key: ${{ secrets.BENCHER_API_KEY }}
@@ -275,7 +275,7 @@ jobs:
   # execute-only. Toolchain + deps come from the shared install-{zisk,sp1} actions.
   zkvm-execute:
     needs: compile
-    runs-on: warp-ubuntu-latest-x64-16x
+    runs-on: warp-ubuntu-latest-x64-32x
     timeout-minutes: 60
     strategy:
       fail-fast: false
@@ -325,7 +325,7 @@ jobs:
       # percentage bounds (throughput's regression is a drop).
       - uses: ./.github/actions/bencher-track
         with:
-          testbed: ${{ matrix.backend }}-execute-x64-16x
+          testbed: ${{ matrix.backend }}-check-x64-32x
           workload: ${{ matrix.backend }}-check
           file: bench.json
           key: ${{ secrets.BENCHER_API_KEY }}
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index cf801b99..c15ea302 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -1,18 +1,20 @@
 # `!benchmark` PR command: run the curated constant set (Benchmarks/Vectors.csv)
 # through chosen prover backend(s) and post a main-vs-PR comparison table.
 #
-#   !benchmark ([aiur] [zisk] [sp1] [ooc] | all)  [execute|prove]
+#   !benchmark ([aiur] [zisk] [sp1] [ooc] | all) [execute]
 #   BENCH_ENVS=initStd,mathlib     # which compiled envs (default initStd)
 #   BENCH_FULL=1                   # run the full curated set, not the ~11 primary
 #   BENCH_TIER=cheap|heavy|all     # override the mode default (execute=all, prove=cheap)
 #   BENCH_SHARD=1                  # restrict to the multi-shard target constants
-#   BENCH_GPU=1                    # allow zkVM prove on a self-hosted GPU runner
 #   RUST_LOG=info                  # passthrough env (allowlisted)
 #
-# Per-PR scope: Aiur runs execute (fast, --execute-only) or prove on CPU; the
-# Zisk/SP1 zkVM hosts run execute (deterministic cycle counts). zkVM prove needs
-# a GPU and is skipped with a note unless BENCH_GPU=1 selects a GPU runner. main's
-# numbers are cached by base SHA so they are not recomputed on every comment.
+# Mode is fixed per backend: `aiur` runs `prove` by default (its report also
+# carries the execute-side columns `fft-cost` / `execute-time`); `zisk` / `sp1`
+# / `ooc` always run `execute`. The optional `execute` token flips aiur to
+# execute-only (skips Phase 2); it's a no-op on other backends. main's numbers
+# come from bencher.dev; the workflow falls back to re-running the base SHA
+# locally only when bencher hasn't ingested it yet (freshly-pushed main whose
+# CI is still running).
 name: Benchmark pull requests
 
 on:
@@ -40,7 +42,6 @@ jobs:
     runs-on: ubuntu-latest
     outputs:
       matrix: ${{ steps.parse.outputs.matrix }}
-      mode: ${{ steps.parse.outputs.mode }}
       tier: ${{ steps.parse.outputs.tier }}
       shard: ${{ steps.parse.outputs.shard }}
       full: ${{ steps.parse.outputs.full }}
@@ -93,39 +94,22 @@ jobs:
       SHARD: ${{ needs.setup.outputs.shard }}
       FULL: ${{ needs.setup.outputs.full }}
     steps:
-      # ---------- skipped cell (zkVM prove without a GPU runner) ----------
-      - name: Skip note
-        if: matrix.cell.skip == 'true'
-        run: |
-          mkdir -p out
-          {
-            echo "### \`$BACKEND\` · \`$BENV\` · \`$MODE\`"
-            echo
-            echo "_Skipped: zkVM proving needs a GPU runner. Re-run with \`BENCH_GPU=1\` on a GPU-enabled runner._"
-          } > "out/table-$LABEL.md"
-
-      # ---------- real cell ----------
       # PR checked out at the workspace root so the local install actions and the
-      # helper scripts resolve; base (cache-miss only) goes under base/.
+      # helper scripts resolve; base (bencher-miss fallback only) goes under base/.
       - name: Checkout PR
-        if: matrix.cell.skip != 'true'
         uses: actions/checkout@v6
         with:
           ref: ${{ env.HEAD_SHA }}
       - name: Apply passthrough env
-        if: matrix.cell.skip != 'true'
         run: |
           while IFS= read -r line; do
             [ -n "$line" ] && printf '%s\n' "$line" >> "$GITHUB_ENV"
           done <<'PTENV'
           ${{ needs.setup.outputs.passthrough-env }}
           PTENV
-      # Select the constants for this cell → names.txt; emit count/vhash/tier
-      # (vhash is part of the main cache key, so editing Vectors.csv invalidates
-      # stale main results). Defaults to the primary subset; BENCH_FULL=1
-      # (→ full=1) runs the whole curated set.
+      # Select the constants for this cell → names.txt. Defaults to the primary
+      # subset; BENCH_FULL=1 (→ full=1) runs the whole curated set.
       - name: Select constants from Benchmarks/Vectors.csv
-        if: matrix.cell.skip != 'true'
         id: man
         run: |
           PRIMARY=--primary; [ "$FULL" = 1 ] && PRIMARY=
@@ -138,7 +122,6 @@ jobs:
       # backend needs `ix` to compile the env to a `.ixe`). Mathlib cache only
       # pulled for the mathlib env.
       - name: Build PR (ix, bench-typecheck)
-        if: matrix.cell.skip != 'true'
         uses: leanprover/lean-action@v1
         with:
           lake-package-directory: .
@@ -150,33 +133,38 @@ jobs:
       # zkVM cells additionally need the Rust toolchain + the backend's toolchain
       # and system deps (the shared composite install actions).
       - name: Set up zkVM Rust toolchain
-        if: matrix.cell.skip != 'true' && (matrix.cell.backend == 'zisk' || matrix.cell.backend == 'sp1')
+        if: matrix.cell.backend == 'zisk' || matrix.cell.backend == 'sp1'
         uses: actions-rust-lang/setup-rust-toolchain@v1
         with:
           cache-workspaces: ${{ matrix.cell.backend }}
       - name: Install SP1
-        if: matrix.cell.skip != 'true' && matrix.cell.backend == 'sp1'
+        if: matrix.cell.backend == 'sp1'
         uses: ./.github/actions/install-sp1
       - name: Install Zisk
-        if: matrix.cell.skip != 'true' && matrix.cell.backend == 'zisk'
+        if: matrix.cell.backend == 'zisk'
         uses: ./.github/actions/install-zisk
 
-      # ---------- main side (cached by base SHA) ----------
-      - name: Restore cached main results
-        if: matrix.cell.skip != 'true'
-        id: main-cache
-        uses: actions/cache/restore@v4
-        with:
-          path: main.json
-          key: bench-${{ matrix.cell.label }}-${{ env.BASE_SHA }}-${{ steps.man.outputs.vhash }}
-      - name: Checkout base
-        if: matrix.cell.skip != 'true' && steps.main-cache.outputs.cache-hit != 'true'
+      # ---------- main side ----------
+      # Try bencher.dev first (bench-main.yml has uploaded main's numbers).
+      # continue-on-error so bench.py's exit 2 (unsupported backend/mode) or 3
+      # (no data yet / API error) doesn't fail the job — we fall back to a
+      # local base run below.
+      - name: Fetch main from bencher
+        id: bencher
+        continue-on-error: true
+        run: |
+          python3 .github/scripts/bench.py fetch-main \
+            --sha "$BASE_SHA" --backend "$BACKEND" --mode "$MODE" \
+            --names "$GITHUB_WORKSPACE/names.txt" \
+            --out "$GITHUB_WORKSPACE/main.json"
+      - name: Checkout base (bencher had no data)
+        if: steps.bencher.outcome != 'success'
         uses: actions/checkout@v6
         with:
           ref: ${{ env.BASE_SHA }}
           path: base
       - name: Build base (ix, bench-typecheck)
-        if: matrix.cell.skip != 'true' && steps.main-cache.outputs.cache-hit != 'true'
+        if: steps.bencher.outcome != 'success'
         uses: leanprover/lean-action@v1
         with:
           lake-package-directory: base
@@ -186,21 +174,14 @@ jobs:
           use-github-cache: false
           use-mathlib-cache: ${{ matrix.cell.env == 'mathlib' && 'true' || 'false' }}
       - name: Run backend on base → main.json
-        if: matrix.cell.skip != 'true' && steps.main-cache.outputs.cache-hit != 'true'
+        if: steps.bencher.outcome != 'success'
         run: |
           export PATH="$PWD/base/.lake/build/bin:$PATH"
           bash .github/scripts/run.sh base "$BENV" "$BACKEND" "$MODE" \
             "$GITHUB_WORKSPACE/names.txt" "$GITHUB_WORKSPACE/main.json"
-      - name: Save cached main results
-        if: matrix.cell.skip != 'true' && steps.main-cache.outputs.cache-hit != 'true'
-        uses: actions/cache/save@v4
-        with:
-          path: main.json
-          key: bench-${{ matrix.cell.label }}-${{ env.BASE_SHA }}-${{ steps.man.outputs.vhash }}
 
       # ---------- PR side ----------
       - name: Run backend on PR → pr.json
-        if: matrix.cell.skip != 'true'
         run: |
           export PATH="$PWD/.lake/build/bin:$PATH"
           bash .github/scripts/run.sh . "$BENV" "$BACKEND" "$MODE" \
@@ -208,14 +189,14 @@ jobs:
 
       # ---------- compare ----------
       - name: Build comparison table
-        if: matrix.cell.skip != 'true'
         run: |
+          src=ran; [ "${{ steps.bencher.outcome }}" = success ] && src=bencher
           mkdir -p out
           python3 .github/scripts/bench.py compare \
             --main main.json --pr pr.json --out "out/table-$LABEL.md" \
             --backend "$BACKEND" --env "$BENV" --mode "$MODE" \
             --count "${{ steps.man.outputs.count }}" \
-            --cache-hit "${{ steps.main-cache.outputs.cache-hit }}"
+            --main-source "$src"
           cat "out/table-$LABEL.md"
 
       - name: Upload table
diff --git a/.github/workflows/riscv-bench.yml b/.github/workflows/riscv-bench.yml
index 79ab4950..2ac4db24 100644
--- a/.github/workflows/riscv-bench.yml
+++ b/.github/workflows/riscv-bench.yml
@@ -6,7 +6,7 @@ name: RISC-V bench
 # kernel typecheck of one constant in the SP1 and Zisk VMs — in parallel jobs.
 on:
   push:
-    branches: main
+    branches: [main, sb/ci-benchmarks]   # TEMPORARY: test on this branch
   workflow_dispatch:
 
 permissions:
@@ -40,7 +40,7 @@ jobs:
   # no proof, no GPU). SP1 and Zisk run as independent jobs so they parallelize;
   # each installs only its own toolchain via sp1up / ziskup (prebuilt binaries)
   # and downloads the shared fixture. minimal.ixe carries the full Init closure,
-  # so we scope execution with `--constant myReflEq --skip-deps`: that
+  # so we scope execution with `--consts myReflEq --skip-deps`: that
   # subject-only-typechecks just the named constant, trusting its Init
   # dependencies as Claim assumptions, instead of typechecking all of Init (which
   # never finishes in the emulator). Each host bails non-zero on any typecheck
@@ -69,7 +69,7 @@ jobs:
       - name: SP1 — execute minimal.ixe (assert failures == 0)
         run: |
           cd sp1
-          cargo run --bin sp1-host -- --execute --ixe ../minimal.ixe --constant myReflEq --skip-deps | tee only.txt
+          cargo run --bin sp1-host -- --execute --ixe ../minimal.ixe --consts myReflEq --skip-deps | tee only.txt
           grep -qE "failures: 0\b" only.txt
 
   zisk-execute:
@@ -97,5 +97,5 @@ jobs:
           # inherit it. Without this the services die with
           # `mmap(rom) errno=11` / "shmem creation ... failed".
           sudo prlimit --pid $$ --memlock=unlimited:unlimited
-          cargo run --bin zisk-host -- --execute --ixe ../minimal.ixe --constant myReflEq --skip-deps | tee only.txt
+          cargo run --bin zisk-host -- --execute --ixe ../minimal.ixe --consts myReflEq --skip-deps | tee only.txt
           grep -qE "failures: 0\b" only.txt
diff --git a/Benchmarks/Typecheck.lean b/Benchmarks/Typecheck.lean
index aded506d..d36ab53d 100644
--- a/Benchmarks/Typecheck.lean
+++ b/Benchmarks/Typecheck.lean
@@ -18,22 +18,21 @@ runtime. Useful standalone (per-constant timeline + RAM breakdown via
 tracing-texray) and as a machine source (neutral results JSON).
 
 ```
-lake exe bench-typecheck --ixe <path> [--constant <name>] [names…] [flags]
-
-  --ixe <path>       serialized `Ixon.Env`, e.g. from `ix compile Foo.lean`
-                     (writes `foo.ixe`). Required.
-  --constant <name>  constant to benchmark, by fully-qualified Lean name. The
-                     canonical single-target flag, shared with the Zisk
-                     `zisk-host --constant`. Unions with names / manifest.
-  [names…]           zero or more additional constant names to benchmark,
-                     e.g. `Nat.add_comm String.append`.
-  --manifest <path>  additionally read names from a file: one per line, blank
-                     lines and `#` comments ignored. Unions with [names…].
+lake exe bench-typecheck --ixe <path> [--consts <n1,n2,…>] [--consts-file <p>] [flags]
+
+  --ixe <path>          serialized `Ixon.Env`, e.g. from `ix compile Foo.lean`
+                        (writes `foo.ixe`). Required.
+  --consts <n1,n2,…>    comma-separated fully-qualified constant names to
+                        benchmark (e.g. `Nat.add_comm,String.append`). Same
+                        flag/shape as `ix check --consts`, `zisk-host --consts`,
+                        and `sp1-host --consts`.
+  --consts-file <path>  additionally read names from a file: one per line, blank
+                        lines and `#` comments ignored. Unions with --consts.
 
   Names (from any source) resolve against the env's named map via
   `String.toName` plus a `toString` fallback (mirrors `ix check --ixe`), so
   numeric / private components round-trip (`Foo.0.Bar`, `_private.M.0.foo`).
-  Pass at least one name via --constant, positional args, or --manifest.
+  Pass at least one name via --consts or --consts-file.
 
   --skip-deps    check only each target itself (verify_const, trusting its
                  deps) instead of its whole transitive closure (verify_claim,
@@ -41,9 +40,9 @@ lake exe bench-typecheck --ixe <path> [--constant <name>] [names…] [flags]
                  for targets too expensive to full-closure-check.
   --json <path>  write per-constant results JSON to <path>. Off by default:
                  normal CLI usage prints only the human-readable summary.
-  --texray       force the tracing-texray timeline + RAM breakdown on.
-  --no-texray    force it off. Default: on iff `--json` was NOT given, so a
-                 plain local run gets the breakdown while a JSON run stays quiet.
+  --texray       enable the tracing-texray timeline + RAM breakdown. With
+                 --json <path>, per-phase span timings are also written to
+                 <path>.spans (JSON Lines) for the CI drill-down. Off by default.
   --execute-only run only Phase 1 (constants / fft-cost / execute-time) and skip
                  proving — the fast `execute`-mode signal.
 ```
@@ -86,12 +85,12 @@ def friParameters : Aiur.FriParameters := {
   queryProofOfWorkBits := 0
 }
 
-/-- Manifest lines as raw strings: one name per line. Everything from a `#` to
-    end of line is a comment (whole-line or inline); blank lines are dropped.
-    `#` never appears in a Lean name, so splitting on it is safe. Resolution
-    against the env happens later (so the `toString` fallback can see the
-    displayed form the user wrote). -/
-def parseManifest (contents : String) : Array String :=
+/-- `--consts-file` lines as raw strings: one name per line. Everything from a
+    `#` to end of line is a comment (whole-line or inline); blank lines are
+    dropped. `#` never appears in a Lean name, so splitting on it is safe.
+    Resolution against the env happens later (so the `toString` fallback can
+    see the displayed form the user wrote). -/
+def parseConstsFile (contents : String) : Array String :=
   (contents.splitOn "\n").filterMap (fun line =>
     let s := ((line.splitOn "#").head?.getD "").trimAscii
     if s.isEmpty then none else some s.toString) |>.toArray
@@ -145,48 +144,43 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
   let some ixeArg := p.flag? "ixe"
     | IO.eprintln "error: --ixe <path> is required"; return 1
   let ixePath := ixeArg.as! String
-  -- Names come from `--constant`, the variadic positional args, and/or a
-  -- `--manifest` file. `--constant` is the canonical single-target flag shared
-  -- with the Zisk `zisk-host --constant`; positional names and `--manifest`
-  -- remain for benchmarking many constants at once.
-  let constName : Array String := match p.flag? "constant" with
-    | some f => #[f.as! String]
+  -- Names come from `--consts` (comma-list) and/or a `--consts-file` file.
+  let cliNames : Array String := match p.flag? "consts" with
+    | some f => ((f.as! String).splitOn ",").toArray.filterMap (fun s =>
+        let t := s.trim
+        if t.isEmpty then none else some t)
     | none => #[]
-  let cliNames := p.variableArgsAs! String
-  let fileNames ← match p.flag? "manifest" with
-    | some f => pure (parseManifest (← IO.FS.readFile (f.as! String)))
+  let fileNames ← match p.flag? "consts-file" with
+    | some f => pure (parseConstsFile (← IO.FS.readFile (f.as! String)))
     | none => pure #[]
   -- Union, preserving first-seen order, so the same const isn't proven twice.
   let nameArgs := Id.run do
     let mut seen : Std.HashSet String := {}
     let mut acc : Array String := #[]
-    for n in constName ++ cliNames ++ fileNames do
+    for n in cliNames ++ fileNames do
       if !seen.contains n then seen := seen.insert n; acc := acc.push n
     return acc
   if nameArgs.isEmpty then
-    IO.eprintln "error: provide a constant via --constant, positional name(s), and/or --manifest <path>"
+    IO.eprintln "error: provide at least one constant via --consts <n1,n2,…> and/or --consts-file <path>"
     return 1
   let jsonOut : Option String := (p.flag? "json").map (·.as! String)
-  let texrayJson : Option String := (p.flag? "texray-json").map (·.as! String)
-  -- Start the process-tree RSS sampler so each Result's peak-rss reflects the
-  -- true high-water mark. When a drill-down path is given, install the streaming
-  -- subscriber and point the per-span sink at it, so the prover's aiur/* and
-  -- stark/* phase timings land as JSON Lines for the CI comparison.
-  TracingTexray.startSampler
-  match texrayJson with
-  | some path => TracingTexray.init {}; TracingTexray.jsonSink path
-  | none => pure ()
   -- skip-deps: check just the target (`verify_const`, trusting its deps)
   -- instead of re-checking the whole transitive closure (`verify_claim`).
   let skipDeps := p.hasFlag "skip-deps"
   -- Execute-only: run just Phase 1 (constants / fft-cost / execute-time) and
   -- skip the Phase 2 prove loop.
   let executeOnly := p.hasFlag "execute-only"
-  -- Default: trace iff we're not in JSON/bencher mode.
-  let useTexray :=
-    if p.hasFlag "texray" then true
-    else if p.hasFlag "no-texray" then false
-    else jsonOut.isNone
+  -- Off by default; CI passes --texray explicitly.
+  let useTexray := p.hasFlag "texray"
+  -- Start the process-tree RSS sampler so each Result's peak-rss reflects the
+  -- true high-water mark. When --texray + --json are both on, also install the
+  -- streaming subscriber and point the per-span sink at `<json>.spans`, so the
+  -- prover's aiur/* and stark/* phase timings land as JSON Lines for the CI
+  -- comparison.
+  TracingTexray.startSampler
+  match useTexray, jsonOut with
+  | true, some path => TracingTexray.init {}; TracingTexray.jsonSink s!"{path}.spans"
+  | _, _ => pure ()
 
   -- Compile the IxVM kernel once; build the prover system once.
   let .ok toplevel := IxVM.ixVM
@@ -315,18 +309,14 @@ def typecheckCmd : Cli.Cmd := `[Cli|
   "Benchmark IxVM-kernel execution + proving of `Ix.Claim.check` over `.ixe` constants"
 
   FLAGS:
-    "ixe"       : String; "Path to a serialized `Ixon.Env` (e.g. produced by `ix compile`). Required."
-    "constant"  : String; "Constant to benchmark, by fully-qualified Lean name. The canonical single-target flag (shared with `zisk-host --constant`). Unions with positional names / --manifest."
-    "manifest"  : String; "Additionally read constant names from a file (one per line; `#` comments and blank lines ignored). Unions with the positional names."
+    "ixe"          : String; "Path to a serialized `Ixon.Env` (e.g. produced by `ix compile`). Required."
+    "consts"       : String; "Comma-separated fully-qualified constant names to benchmark (e.g. `Nat.add_comm,String.append`). Same flag/shape as `ix check --consts`, `zisk-host --consts`, and `sp1-host --consts`."
+    "consts-file"  : String; "Additionally read constant names from a file (one per line; `#` comments and blank lines ignored). Unions with --consts."
     "json"      : String; "Write per-constant results JSON to this path. Off by default; normal CLI usage prints only the human-readable summary."
     "skip-deps";          "Check only each target itself (verify_const, trusting its deps) instead of re-checking its whole transitive closure (verify_claim). Same flag as `zisk-host --skip-deps`."
     "execute-only";       "Execute only (Phase 1: constants / fft-cost / execute-time) and skip proving. The fast per-PR `execute`-mode signal."
-    texray;               "Force the tracing-texray timeline + RAM breakdown on (per-prove spans on stderr)."
-    "no-texray";          "Force the breakdown off. Default: on iff --json was not given."
-    "texray-json" : String; "Write per-phase span timings (aiur/*, stark/*) as JSON Lines to this path, for the CI drill-down. Implies installing the streaming subscriber."
+    texray;               "Enable the tracing-texray timeline + RAM breakdown (per-prove spans on stderr). Combined with --json, per-phase span timings are additionally written to `<json>.spans` as JSON Lines for the CI drill-down. Off by default."
 
-  ARGS:
-    ...names : String;   "Fully-qualified constant name(s) to benchmark (e.g. `Nat.add_comm String.append`). Optional if `--manifest` is given."
 ]
 
 def main (args : List String) : IO UInt32 :=
diff --git a/Benchmarks/Vectors.csv b/Benchmarks/Vectors.csv
index 65fe4dcf..35057169 100644
--- a/Benchmarks/Vectors.csv
+++ b/Benchmarks/Vectors.csv
@@ -1,98 +1,90 @@
-# Benchmark constant vectors -- single shared source of truth.
-# Consumed identically by Aiur (bench-typecheck --manifest), the Zisk/SP1 hosts
-# (--constant loop), and CI shell (awk). One row per curated constant.
+# Benchmark constant vectors -- single shared source of truth for which
+# constants to run. Consumed identically by Aiur (bench-typecheck --consts-file),
+# the Zisk/SP1 hosts (--consts loop), and CI shell (awk). One row per curated
+# constant. Measurements (fft, cycles, prove-time, …) live in the neutral
+# results JSON each tool emits and in bencher.dev — never in this file.
 #
-# Provenance: measured across this box's benchmarking dirs, de-duplicated --
-#   aiur_fft / env : ~/ix-aiur/Benchmarks/Statistics/data/aiur/cost.csv
-#   zisk_cycles    : ~/ix-aiur/Benchmarks/Statistics/data/zisk/single_shard.csv
-# 27 IxVM kernel-primitive constants from cost.csv are intentionally excluded
-# (not Init/Std/Mathlib library constants).
-#
-# Columns:
+# Columns (the trailing shard_target and primary fields default to 0 when
+# omitted; most rows only carry the first three):
 #   name         fully-qualified Lean name (resolves via NameResolve.resolveIxeAddr).
 #   env          compile target / .ixe it resolves in: initStd | lean | mathlib.
 #   tier         cheap = prove-feasible per-PR; heavy = execute-only / sharded.
-#                Boundary: Aiur fft >= 1e9 => heavy.
+#                run.sh's prove-loop reads this to gate prove vs execute-only.
 #   shard_target 1 = heavy constant designated as a multi-shard prove target.
-#   primary 1 = part of the ~11-constant primary subset spanning
-#                shape + the cheap->heavy cost range. Default for the !benchmark
-#                PR comment and the bencher prove / zkVM jobs (full set via
-#                BENCH_FULL=1). Heavy reps run only in execute/native; prove
-#                mode keeps the cheap primaries (tier filter).
-#   aiur_fft     measured Aiur fft-cost (proving-cost proxy). Informational.
-#   zisk_cycles  measured Zisk ziskemu step count; empty if unmeasured. Informational.
+#   primary      1 = part of the primary subset spanning shape + the cheap->heavy
+#                cost range. Default for the !benchmark PR comment and the
+#                bencher prove / zkVM jobs (full set via BENCH_FULL=1). Heavy
+#                reps run only in execute/native; prove mode keeps the cheap
+#                primaries (tier filter).
 #
 # CI filters on the env column (not line number), so these '#' lines and the
 # header are skipped by: awk -F, '$1!~/^#/ && $1!="name" && $2==env ...'
-name,env,tier,shard_target,primary,aiur_fft,zisk_cycles
-HEq,initStd,cheap,0,0,1716582,
-Nat,initStd,cheap,0,0,1857523,975244
-Eq.rec,initStd,cheap,0,0,2575400,2348520
-HEq.rec,initStd,cheap,0,0,2692988,2727278
-Trans.mk,initStd,cheap,0,0,2911629,7229214
-Array.toList,initStd,cheap,0,0,3332563,2580844
-Acc.rec,initStd,cheap,0,0,3505064,5105888
-Std.Time.Month.Offset.ofNat,initStd,cheap,0,0,3607673,1493508
-Sum.elim,initStd,cheap,0,0,5589905,6618130
-Prod.map,initStd,cheap,0,0,6904183,8177456
-Option.bind,initStd,cheap,0,0,7329183,7440608
-Except.bind,initStd,cheap,0,0,7667869,9427477
-WellFounded.fix,initStd,cheap,0,0,10125144,13415585
-Nat.add,initStd,cheap,0,0,13343000,10606339
-List.foldr,initStd,cheap,0,1,18579757,16707100
-List.dropLast,initStd,cheap,0,0,19509718,17522863
-List.range,initStd,cheap,0,0,20251801,13666491
-List.zipWith,initStd,cheap,0,0,20439088,20229977
-List.filterMap,initStd,cheap,0,0,25335779,21435279
-List.foldlM,initStd,cheap,0,0,39202740,
-Int.add,initStd,cheap,0,0,44714703,27635032
-BitVec.toFin,initStd,cheap,0,0,50437466,28681028
-Nat.add_comm,initStd,cheap,0,1,56084908,53239676
-UInt32.toNat,initStd,cheap,0,0,59331806,29980254
-USize.toNat,initStd,cheap,0,0,71607481,35811906
-Nat.decEq,initStd,cheap,0,0,71921625,57411966
-ByteSlice.ofByteArray,initStd,cheap,0,0,107574377,53555107
-Nat.decLe,initStd,cheap,0,0,209641496,143391161
-Nat.strongRecOn,initStd,cheap,0,0,273068854,190849758
-Int.emod,initStd,cheap,0,0,422940733,269380418
-Int.ediv,initStd,cheap,0,0,430476738,270987292
-Array.foldlM,initStd,cheap,0,0,434577494,
-Array.foldl,initStd,cheap,0,1,449323126,278537034
-Array.filter,initStd,cheap,0,0,464818232,285847515
-Nat.sub_le_of_le_add,initStd,cheap,0,0,567575653,373184538
-BitVec.add,initStd,cheap,0,0,617113462,373772656
-Int.gcd,initStd,cheap,0,1,657502637,409112011
-Nat.toDigits,initStd,cheap,0,0,663606297,357145741
-Array.map,initStd,cheap,0,0,734574964,443199245
-Array.zipWith,initStd,cheap,0,0,736658636,445195121
-String.Internal.append,initStd,cheap,0,1,793580333,
-Lean.Name.hash,initStd,cheap,0,0,861742653,447441591
-BitVec.umod,initStd,cheap,0,0,926177790,526467117
-Nat.repr,initStd,cheap,0,0,966452765,498729913
-Int.repr,initStd,cheap,0,0,993792541,504234535
-String.intercalate,initStd,heavy,0,0,1089240518,599428829
-_private.Init.Prelude.0.Lean.extractMainModule._unsafe_rec,initStd,heavy,0,0,1197925029,
-Char.toLower,initStd,heavy,0,0,1198467414,665920824
-Nat.gcd_comm,initStd,heavy,0,1,1954958779,1144352360
-Int.emod_emod_of_dvd,initStd,heavy,0,0,3856852693,2201588182
-Array.append_assoc,initStd,heavy,0,0,3938574533,1570256148
-Vector.append,initStd,heavy,0,0,4023268168,1614275115
-Fin.foldl,initStd,heavy,0,0,10853255199,5110854190
-List.mergeSort,initStd,heavy,1,1,13825318985,6706906294
-Array.binSearch,initStd,heavy,1,0,14397133548,6785827470
-Array.qsort,initStd,heavy,0,0,15781689533,7199288749
-Array.qsortOrd,initStd,heavy,0,0,15841062472,7206704674
-String.split,initStd,heavy,0,1,19578088286,8657387499
-Std.Time.Week.Offset.ofMilliseconds,initStd,heavy,0,0,24577209792,6653972854
-Vector.extract_append,initStd,heavy,1,1,61830646478,
-Lean.Expr.replace,lean,cheap,0,0,859625514,
-List.Sorted,mathlib,cheap,0,0,9578666,
-Nat.choose,mathlib,cheap,0,0,29018862,
-Nat.factorial,mathlib,cheap,0,1,33562426,
-Nat.fib,mathlib,cheap,0,0,34171209,
-GCDMonoid.gcd,mathlib,heavy,0,0,1005736276,
-Nat.Prime.two_le,mathlib,heavy,0,0,1504045298,
-Finset.prod,mathlib,heavy,0,0,3045165822,
-Finset.sum,mathlib,heavy,0,0,3045189408,
-Polynomial.eval,mathlib,heavy,0,0,5342731754,
-Multiset.sort,mathlib,heavy,1,1,18670960624,
+name,env,tier,shard_target,primary
+HEq,initStd,cheap
+Nat,initStd,cheap
+Eq.rec,initStd,cheap
+HEq.rec,initStd,cheap
+Trans.mk,initStd,cheap
+Array.toList,initStd,cheap
+Acc.rec,initStd,cheap
+Sum.elim,initStd,cheap
+Prod.map,initStd,cheap
+Option.bind,initStd,cheap
+Except.bind,initStd,cheap
+WellFounded.fix,initStd,cheap
+Nat.add,initStd,cheap
+List.filterMap,initStd,cheap
+Int.add,initStd,cheap
+BitVec.toFin,initStd,cheap
+Nat.add_comm,initStd,cheap,0,1
+USize.toNat,initStd,cheap
+Nat.decEq,initStd,cheap
+ByteSlice.ofByteArray,initStd,cheap
+Nat.decLe,initStd,cheap
+Nat.strongRecOn,initStd,cheap
+Int.emod,initStd,cheap
+Array.foldlM,initStd,cheap
+Array.filter,initStd,cheap
+Nat.sub_le_of_le_add,initStd,cheap,0,1
+BitVec.add,initStd,cheap
+Int.gcd,initStd,cheap,0,1
+Nat.toDigits,initStd,cheap
+Array.map,initStd,cheap
+Lean.Name.hash,initStd,cheap
+BitVec.umod,initStd,cheap
+Nat.repr,initStd,cheap
+String.intercalate,initStd,heavy
+_private.Init.Prelude.0.Lean.extractMainModule._unsafe_rec,initStd,heavy
+Char.toLower,initStd,heavy
+Nat.gcd_comm,initStd,heavy,0,1
+Int.emod_emod_of_dvd,initStd,heavy
+Array.append_assoc,initStd,heavy
+Vector.append,initStd,heavy,0,1
+Fin.foldl,initStd,heavy
+List.mergeSort,initStd,heavy,1,1
+Array.binSearch,initStd,heavy,1
+Array.qsortOrd,initStd,heavy
+String.split,initStd,heavy,0,1
+Std.Time.Week.Offset.ofMilliseconds,initStd,heavy
+Vector.extract_append._proof_1,initStd,heavy,1,1
+ByteArray.utf8DecodeChar?_utf8EncodeChar_append,initStd,heavy,0,1
+String.append,initStd,cheap,0,1
+_private.Init.Data.Range.Polymorphic.SInt.0.Int8.instRxcHasSize_eq,initStd,heavy,0,1
+_private.Init.Data.Range.Polymorphic.SInt.0.Int16.instRxcHasSize_eq,initStd,heavy,0,1
+_private.Init.Data.Range.Polymorphic.SInt.0.Int32.instRxcHasSize_eq,initStd,heavy,0,1
+_private.Init.Data.Range.Polymorphic.SInt.0.Int64.instRxcHasSize_eq,initStd,heavy,0,1
+Char.ofOrdinal_le_of_le,initStd,heavy,0,1
+Array.extract_append._proof_1_1,initStd,heavy,0,1
+IxVMPrim.nat_pow_big,initStd,heavy,0,1
+Std.Tactic.BVDecide.BVExpr.bitblast.goCache_Inv_of_Inv._mutual,initStd,heavy,0,1
+Lean.Expr.replace,lean,cheap
+List.Sorted,mathlib,cheap
+Nat.choose,mathlib,cheap
+Nat.factorial,mathlib,cheap,0,1
+Nat.fib,mathlib,cheap
+GCDMonoid.gcd,mathlib,heavy
+Nat.Prime.two_le,mathlib,heavy
+Finset.prod,mathlib,heavy
+Finset.sum,mathlib,heavy
+Polynomial.eval,mathlib,heavy
+Multiset.sort,mathlib,heavy,1,1
diff --git a/README.md b/README.md
index 892b7162..a975ad0d 100644
--- a/README.md
+++ b/README.md
@@ -231,7 +231,7 @@ Non-Nix users: install the SP1 toolchain manually per the
    ```
 
    For a larger, realistic env compile one of the `Benchmarks/Compile`
-   targets, then scope proving to a single constant with `--constant`
+   targets, then scope proving to one or more constants with `--consts`
    (step 2):
 
    ```
@@ -250,7 +250,7 @@ Non-Nix users: install the SP1 toolchain manually per the
    # Prove a single constant out of a larger env (Anon-only): the host resolves
    # the name and ships only that constant's closure sub-env. Full-closure by
    # default; add --skip-deps for a subject-only check (deps trusted).
-   WITHOUT_VK_VERIFICATION=1 RUST_LOG=info cargo run --release -- --ixe ../../init.ixe --constant Nat.add_comm
+   WITHOUT_VK_VERIFICATION=1 RUST_LOG=info cargo run --release -- --ixe ../../init.ixe --consts Nat.add_comm
    ```
 
    With no `--ixe`, the host runs against an empty `Ixon.Env`.
@@ -354,19 +354,20 @@ Non-Nix users: install Zisk manually per the
    RUST_LOG=info cargo run --release -- --verify-constraints --ixe ../minimal.ixe
    # Generate and verify a VadcopFinal proof of the same typecheck (CPU)
    RUST_LOG=info cargo run --release -- --ixe ../minimal.ixe
-   # Check a single named constant out of a larger env. The host resolves the
+   # Check one or more named constants out of a larger env. The host resolves each
    # name and ships only its closure sub-env (lazy fault-in, no whole-env load).
    # By default this is the FULL-CLOSURE typecheck — the constant and its whole
-   # dependency closure (matching the Aiur `bench-typecheck --constant`).
+   # dependency closure (matching the Aiur `bench-typecheck --consts <names>`).
    # Composes with --execute (cycles only) and plain prove.
-   RUST_LOG=info cargo run --release -- --execute --ixe ../init.ixe --constant Nat.add_comm
-   RUST_LOG=info cargo run --release -- --ixe ../init.ixe --constant Nat.add_comm
+   RUST_LOG=info cargo run --release -- --execute --ixe ../init.ixe --consts Nat.add_comm
+   RUST_LOG=info cargo run --release -- --ixe ../init.ixe --consts Nat.add_comm,Nat.succ
    # Add --skip-deps for a subject-only check (deps trusted, not re-checked):
-   RUST_LOG=info cargo run --release -- --execute --ixe ../init.ixe --constant Vector.extract_append --skip-deps
+   RUST_LOG=info cargo run --release -- --execute --ixe ../init.ixe --consts Vector.extract_append --skip-deps
    ```
 
-   `--constant` / `--skip-deps` are the same flags the Aiur `bench-typecheck`
-   uses, so the two backends share one vocabulary. `--skip-deps` trusts
+   `--consts` / `--skip-deps` are the same flags `ix check`, `sp1-host`, and the
+   Aiur `bench-typecheck` use, so all four share one vocabulary. `--skip-deps`
+   trusts
    dependencies rather than re-checking them, so it is far cheaper than the
    full-closure default — reserve it for constants too expensive to
    full-closure-check that also can't be sharded (e.g. `Vector.extract_append`
@@ -477,10 +478,10 @@ Non-Nix users: install Zisk manually per the
    [`DEFAULT_MEMORY_LIMIT`](https://github.com/succinctlabs/sp1/blob/v6.2.0/crates/core/executor/src/opts.rs#L25),
    configurable via `MEMORY_LIMIT` env var up to a ~1 TB JIT ceiling
    [`MAX_JIT_LOG_ADDR`](https://github.com/succinctlabs/sp1/blob/v6.2.0/crates/primitives/src/consts.rs#L11)),
-   or scope to a single constant with `--constant <name>` (all backends),
-   which resolves the name and ships only that constant's closure sub-env to the
-   guest. By default it re-checks the full dependency closure; add `--skip-deps`
-   to check it **subject-only** (dependencies trusted and lazily faulted in, not
+   or scope to one or more constants with `--consts <n1,n2,…>` (all backends),
+   which resolves each name and ships only that constant's closure sub-env to the
+   guest. By default each constant is checked along with its full dependency closure;
+   add `--skip-deps` to check them **subject-only** (dependencies trusted and lazily faulted in, not
    re-typechecked) — so individual constants of a large env still fit the cap,
    even ones whose full-closure typecheck would not. To prove a large env in
    full under Zisk, shard it (see
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index 59f94b37..28b0d634 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -4,9 +4,18 @@ Ix is benchmarked on two surfaces, both driven by one curated constant set and
 the same backend drivers:
 
 - **`!benchmark` PR comment** (`.github/workflows/bench-pr.yml`) — on demand,
-  posts a **main-vs-PR** comparison table on the pull request.
+  posts a **main-vs-PR** comparison table on the pull request. main's numbers
+  are pulled from bencher.dev via its public reports API (`bench.py fetch-main`);
+  the PR side is measured fresh. Design-level skips (`aiur execute` — redundant
+  with `aiur prove`; `zkVM prove` — no GPU on main; `ooc prove` — no in-circuit
+  prove) are filtered from the matrix at parse time and post a note explaining
+  why. For supported combinations, if bencher hasn't ingested the base SHA yet
+  (freshly-pushed main whose push CI is still running), the workflow falls
+  back to re-running the base SHA locally.
 - **Bencher.dev** (`.github/workflows/bench-main.yml`) — on every push to `main`,
-  tracks each measure over time at <https://bencher.dev> (project `ix`).
+  tracks each measure over time at <https://bencher.dev> (project `ix`). This is
+  the canonical store for main-branch measurements; the `!benchmark` PR path
+  reads from it.
 
 ## Backends
 
@@ -16,10 +25,9 @@ the same backend drivers:
 | `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `peak-rss` |
 | `ooc` | the same kernel run **out-of-circuit and in parallel** (`ix check-rs`) — far faster | `throughput` (constants/s), `check-time`, `peak-rss`, `constants` |
 
-In **prove** mode, `run.sh` proves each constant whose Aiur fft-cost fits the
-prover RAM ceiling (`AIUR_PROVE_MAX_FFT`, ~128 GB at 2.34 GB per billion fft) and
-falls back to **execute-only** for the rest, so every primary still reports
-metrics. The `ooc` backend reports two views: the **whole env** (`ix check-rs
+In **prove** mode, `run.sh` proves each `cheap`-tier primary and falls back to
+**execute-only** for the `heavy`-tier ones (a single-shard prove would exceed
+Aiur's ~128 GB RAM ceiling), so every primary still reports metrics. The `ooc` backend reports two views: the **whole env** (`ix check-rs
 --anon`, keyed by env) and a **per-primary subject check** (`ix check-rs
 --consts`, keyed by constant — apples-to-apples with the zkVM `--skip-deps`
 execute).
@@ -46,7 +54,7 @@ PR seconds + Δ%), so a regression can be traced to the phase that moved.
 
 ## Constant set — `Benchmarks/Vectors.csv`
 
-One CSV is the single source of truth: `name,env,tier,shard_target,primary,aiur_fft,zisk_cycles`.
+One CSV is the single source of truth for *which* constants to run: `name,env,tier,shard_target,primary`. Measurements never live here — they're in each tool's neutral results JSON and in bencher.dev.
 
 - `env` — compile target the constant resolves in (`initStd` / `lean` / `mathlib`).
 - `tier` — `cheap` (prove-feasible on a CI runner) or `heavy` (execute-only; a
@@ -63,23 +71,28 @@ One CSV is the single source of truth: `name,env,tier,shard_target,primary,aiur_
 Maintainer comment on a PR:
 
 ```
-!benchmark ([aiur] [zisk] [sp1] [ooc] | all)  [execute|prove]
+!benchmark ([aiur] [zisk] [sp1] [ooc] | all) [execute]
 BENCH_ENVS=initStd,mathlib     # which compiled envs (default initStd)
 BENCH_FULL=1                   # run the full curated set, not the ~11 primary
 BENCH_TIER=cheap|heavy|all     # override the mode default (execute=all, prove=cheap)
 BENCH_SHARD=1                  # restrict to the multi-shard target constants
-BENCH_GPU=1                    # allow zkVM prove on a self-hosted GPU runner
 RUST_LOG=info                  # passthrough env (allowlisted)
 ```
 
-Defaults: `aiur`, `execute`, `initStd`, primary subset. Backends fan out as a
-matrix; `main` results are cached by base SHA. zkVM `prove` is skipped with a
-note unless a GPU runner is selected.
+Mode is fixed per backend: `aiur` runs `prove` by default (its report also
+carries the execute-side columns `fft-cost` / `execute-time` alongside
+`prove-time`), while `zisk` / `sp1` / `ooc` always run `execute`. The optional
+bare `execute` token flips `aiur` to execute-only (`bench-typecheck
+--execute-only`; skips Phase 2); on the other three backends it's a no-op.
+Defaults: `aiur`, `initStd`, primary subset. Backends fan out as a matrix;
+`main` results are pulled from bencher.dev.
 
 ## Bencher jobs (`bench-main.yml`)
 
 `build → compile → { prove, zkvm-execute, ooc-check }`, each reporting to its
 own testbed + **workload** (`aiur-check`, `zisk-check`, `sp1-check`, `ooc-check`, `ix-compile`).
+All four typecheck testbeds share the same slug shape:
+`<backend>-check-x64-32x`.
 Deterministic measures (cycles, fft-cost, constants, …) are pinned exactly;
 noisy wall-clock measures (time, RAM, throughput) ride percentage bounds, both
 windowed to the per-workload `bencher-thresholds-reset-<workload>` tag.
@@ -90,5 +103,5 @@ To re-baseline a workload after an intended step change, comment
 
 ## Not yet covered
 
-- **zkVM proving** (Zisk/SP1 `prove`) needs a self-hosted GPU runner; on CPU
-  runners it is execute-only.
+- **zkVM proving** (Zisk/SP1 `prove`) is not wired up yet — it needs a self-hosted
+  GPU runner. Currently the zkVM backends are execute-only on both surfaces.
diff --git a/docs/zisk-cycle-cost-model.md b/docs/zisk-cycle-cost-model.md
index 86e18822..3c961ccf 100644
--- a/docs/zisk-cycle-cost-model.md
+++ b/docs/zisk-cycle-cost-model.md
@@ -68,7 +68,7 @@ objective for a cost spanning 50M–9B cycles.
 **Full-closure typecheck** — a self-contained closure: a small program, or a
 constant checked with all its dependencies. Calibrated **cross-library** on
 n=76: 12 small programs + 64 diverse constants checked full-closure via
-`--constant`, spanning **Init (51), Std (3), and Mathlib (10)**:
+`--consts`, spanning **Init (51), Std (3), and Mathlib (10)**:
 ```
 cycles ≈ 0.68M + 96,989·hb + 4,151·subst        R² 0.987, MAPE 6%
 ```
@@ -99,11 +99,12 @@ cycles ≈ 0.39M + 6,740·block_bytes + 14·subenv_bytes + 4,070·subst   R² 0.
 Reserved for constants too expensive to full-closure-check that can't be sharded
 (Finding 4).
 
-**Measuring one constant.** `zisk-host --execute --ixe initstd.ixe --constant
+**Measuring one constant.** `zisk-host --execute --ixe initstd.ixe --consts
 <NAME>` resolves the name, builds just its closure sub-env (deps lazily faulted
 in — no separate `.ixe`, no whole-env ingress), and checks it. Full-closure by
 default (`Ix.Claim.check addr none`); add `--skip-deps` for subject-only. Same
-flags as the Aiur `bench-typecheck`.
+`--consts` / `--skip-deps` vocabulary as `ix check`, `sp1-host`, and the Aiur
+`bench-typecheck`.
 
 ---
 
@@ -147,12 +148,18 @@ side.
    the cap still executes under it. Each leaf pays the ~180M fixed floor and adds
    an aggregation node, so cheap constants are batched rather than proven one at
    a time. (`--shards N` still does balanced bisection, for manual control.)
-4. **A few constants can't be full-closure-proven on a 250 GiB box and aren't
-   shardable** (single atomic constants): `Vector/Array.extract_append._proof_1`,
-   the `instRxcHasSize_eq` family. The escape hatch is `--skip-deps`:
-   `Vector.extract_append` is the 143 GiB OOM case under full-closure but checks
-   subject-only in 74M cycles. The planner flags these via
-   `infeasible_atomic_floor`.
+4. **A few constants can't be proven as a single full-closure leaf on a 250 GiB
+   box**: `Vector/Array.extract_append._proof_1`, the `instRxcHasSize_eq` family.
+   This is a per-leaf ingress/RAM ceiling on the `--consts <name>` (full-closure,
+   `Ix.Claim.check addr none`) mode — not a global unshardability. In
+   env-sharding mode (`--shard-plan`) these same constants are fine: their
+   subject checks fit in one work item and their deps are proved in other
+   shards, folded in through the assumptions root. The only work unit the
+   env-sharding planner truly can't split is a **mutual block** (`build_anon_work`
+   emits one item per Muts block, checked atomically). The escape hatch in
+   single-constant mode is `--skip-deps`: `Vector.extract_append` is the 143 GiB
+   OOM case under full-closure but checks subject-only in 74M cycles. The
+   planner flags these via `infeasible_atomic_floor`.
 5. **The packing order comes from min-cut.** Whole-env profiling of
    Init/Std/**Mathlib** (mathlib = 631k blocks) shows Lean typecheck is uniformly
    reduction-dominated: own-bytes is only **2.6–7% of member cost** (mathlib
@@ -221,7 +228,7 @@ Shards 630/634 are one atomic constant each (`Int*.instRxcHasSize_eq`): tiny
 `bytes`/`subst` but huge cycles, driven by `hb` (deep nat-range def-eq) — the
 "expensive atomic" case (Finding 4).
 
-### Full-closure single constants — `--constant`, diverse shapes
+### Full-closure single constants — `--consts`, diverse shapes
 
 One library constant each, checked full-closure (the constant and its whole
 dependency closure). The 35 Init constants below (over `initstd.ixe`) are shown
@@ -288,7 +295,7 @@ lake exe ix shard env.ixprof --max-ram 256        # or --max-cycles C / --shards
 cargo run --release --bin zisk-host -- --execute --ixe <env.ixe> \
   [--shard-plan <plan.ixes> --only-shard K]
 cargo run --release --bin zisk-host -- --execute --ixe initstd.ixe \
-  --constant "Nat.add_comm" [--skip-deps]      # full-closure / subject-only
+  --consts "Nat.add_comm" [--skip-deps]        # full-closure / subject-only
 
 # fits
 python3 ~/benchdata/prof/fit_xlib.py         # full-closure model (76 pts, Init+Std+Mathlib)
diff --git a/sp1/host/src/main.rs b/sp1/host/src/main.rs
index ded742b8..3d050ff4 100644
--- a/sp1/host/src/main.rs
+++ b/sp1/host/src/main.rs
@@ -6,7 +6,7 @@
 //! RUST_LOG=info cargo run --release -- --execute --ixe ../../minimal.ixe
 //! WITHOUT_VK_VERIFICATION=1 RUST_LOG=info cargo run --release  # prove (compressed)
 //! # prove a single constant out of a large env (Anon-only):
-//! WITHOUT_VK_VERIFICATION=1 RUST_LOG=info cargo run --release -- --ixe ../../init.ixe --constant Nat.add_comm
+//! WITHOUT_VK_VERIFICATION=1 RUST_LOG=info cargo run --release -- --ixe ../../init.ixe --consts Nat.add_comm
 //! ```
 //!
 //! Proving (any non-`--execute` run) requires `WITHOUT_VK_VERIFICATION=1` in
@@ -37,54 +37,37 @@ pub const GUEST_ELF: Elf = include_elf!("sp1-guest");
 #[derive(Parser, Debug)]
 #[command(author, version, about, long_about = None)]
 struct Args {
-  /// Run the program in the VM only - no proof.
+  /// Execute in the VM only — no proof.
   #[arg(long)]
   execute: bool,
 
-  /// Run the kernel in Meta mode (preserves names + dup-level-param-name
-  /// check). Default is Anon mode, which matches Aiur's `kernel_check_test`
-  /// semantics. Both modes prove the same structural typecheck; Meta is
-  /// strictly more constrained but slightly more expensive.
+  /// Run the kernel in Meta mode (default: Anon). Meta preserves names.
   #[arg(long)]
   meta: bool,
 
-  /// Path to a `.ixe` file produced by `lake exe ix compile`. If omitted, an
-  /// empty `IxonEnv` is used.
+  /// Path to a `.ixe` (default: empty env).
   #[arg(long)]
   ixe: Option<PathBuf>,
 
-  /// Check a single constant selected by its Lean NAME (e.g. "Nat.add_comm").
-  /// The name resolves through the env's `named` metadata to its ingress
-  /// block; the guest receives only that block's closure sub-env, so one
-  /// constant can be proved out of a large env without shipping (or
-  /// typechecking) the whole thing. By default this is the **full-closure**
-  /// typecheck (the constant and its whole dependency closure, matching
-  /// `Ix.Claim.check addr none` / the Aiur `bench-typecheck --constant`); pass
-  /// `--skip-deps` for a subject-only check (deps trusted). Anon-only
-  /// (incompatible with `--meta`). Requires `--ixe`.
+  /// Comma-separated Lean names to check (Anon-only; each is one guest run).
+  #[arg(long, value_delimiter = ',')]
+  consts: Vec<String>,
+
+  /// Additional names from a file (one per line, `#` comments); unions with --consts.
   #[arg(long)]
-  constant: Option<String>,
+  consts_file: Option<PathBuf>,
 
-  /// Modifies `--constant`: check only the named constant itself, trusting its
-  /// dependencies (subject-only), instead of re-checking its whole transitive
-  /// closure. Same flag/semantics as `zisk-host --skip-deps` and the Aiur
-  /// `bench-typecheck --skip-deps`.
-  #[arg(long, requires = "constant")]
+  /// With --consts: check each subject only, trusting its deps.
+  #[arg(long, requires = "consts")]
   skip_deps: bool,
 
-  /// Write the neutral per-constant results JSON `{ "<name>": { … } }` to this
-  /// path (execute → cycles/execute-time/throughput/peak-rss; prove →
-  /// prove-time/peak-rss). Written only on a clean run (zero failures), so a
-  /// present file always holds a valid measurement. This is the machine
-  /// source the CI bench driver merges; the human summary still prints.
+  /// Write per-constant results JSON `{ "<name>": { … } }` (accumulated across --consts).
   #[arg(long)]
   json: Option<PathBuf>,
 
-  /// Write per-phase timings (`{"span","seconds"}` JSON Lines) to this path via
-  /// tracing-texray's sink, for the CI drill-down. The host records its
-  /// `execute` / `prove` phases here.
+  /// Enable tracing-texray; with --json, per-phase spans are written to <json>.spans.
   #[arg(long)]
-  texray_json: Option<PathBuf>,
+  texray: bool,
 }
 
 /// Peak resident set size (bytes) across this process *and its children*, from
@@ -97,19 +80,51 @@ fn peak_rss_bytes() -> Option<u64> {
   }
 }
 
-/// Write the neutral per-constant entry `{ "<name>": <metrics> }` to `path`
-/// (the shape `run.sh` merges with `jq -s`). serde_json handles key escaping so
-/// arbitrary Lean names are safe.
+/// Merge the per-constant entry `{ "<name>": <metrics> }` into the results JSON
+/// at `path`: an existing object is kept (unparsable contents are dropped), so
+/// a multi-const run (`--consts a,b,c`) builds up one map, one entry per name.
 fn write_json_entry(
   path: &PathBuf,
   name: &str,
   metrics: serde_json::Value,
 ) -> Result<()> {
-  let entry = serde_json::json!({ name: metrics });
-  fs::write(path, serde_json::to_string(&entry)?)
+  let mut map: serde_json::Map<String, serde_json::Value> = match fs::read(path)
+  {
+    Ok(bytes) => serde_json::from_slice(&bytes).unwrap_or_default(),
+    Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+      serde_json::Map::new()
+    },
+    Err(e) => return Err(anyhow::anyhow!("read {}: {e}", path.display())),
+  };
+  map.insert(name.to_string(), metrics);
+  fs::write(path, serde_json::to_string(&serde_json::Value::Object(map))?)
     .map_err(|e| anyhow::anyhow!("write {}: {e}", path.display()))
 }
 
+/// Union `--consts` with names from `--consts-file`, preserving first-seen order.
+fn collect_consts(args: &Args) -> Result<Vec<String>> {
+  let mut seen: std::collections::HashSet<String> =
+    std::collections::HashSet::new();
+  let mut out: Vec<String> = Vec::new();
+  for name in &args.consts {
+    let trimmed = name.trim();
+    if !trimmed.is_empty() && seen.insert(trimmed.to_string()) {
+      out.push(trimmed.to_string());
+    }
+  }
+  if let Some(path) = &args.consts_file {
+    let contents = fs::read_to_string(path)
+      .map_err(|e| anyhow::anyhow!("read {}: {e}", path.display()))?;
+    for line in contents.lines() {
+      let name = line.split('#').next().unwrap_or("").trim();
+      if !name.is_empty() && seen.insert(name.to_string()) {
+        out.push(name.to_string());
+      }
+    }
+  }
+  Ok(out)
+}
+
 fn load_env_bytes(ixe: Option<&PathBuf>) -> Vec<u8> {
   match ixe {
     Some(path) => fs::read(path).expect("read ixe input"),
@@ -149,7 +164,7 @@ fn count_checkable(env_bytes: &[u8], meta_mode: bool) -> usize {
   }
 }
 
-/// Resolve `--constant <name>` to the guest inputs for that one constant: its
+/// Resolve one `--consts` name to the guest inputs for that constant: its
 /// closure sub-env and a check-list. The name resolves through the full env's
 /// `named` metadata to a constant address, which maps to the `build_anon_work`
 /// item whose ingress block owns it (standalone → itself; a mutual-block member
@@ -233,41 +248,56 @@ async fn main() -> Result<()> {
   // composing it with the SDK's global logger (`sp1_sdk::utils::setup_logger`),
   // currently the sole subscriber.
   tracing_texray::rss_sampler::start(std::time::Duration::from_millis(50));
-  if let Some(path) = &args.texray_json {
-    if let Some(p) = path.to_str() {
-      let _ = tracing_texray::json_sink::to_file(p);
+  // With --texray + --json, per-phase span timings land at `<json>.spans` as
+  // JSON Lines — the CI drill-down input.
+  if args.texray {
+    if let Some(json) = args.json.as_ref().and_then(|p| p.to_str()) {
+      let _ = tracing_texray::json_sink::to_file(format!("{json}.spans"));
     }
   }
 
   let whole_env_bytes = load_env_bytes(args.ixe.as_ref());
+  let client = ProverClient::from_env().await;
+  let consts = collect_consts(&args)?;
+  if !consts.is_empty() && args.meta {
+    bail!("--consts is Anon-only and cannot be combined with --meta");
+  }
 
-  // `--constant` ships a closure sub-env + a check-list (Anon only); otherwise
-  // the whole env ships with an empty check-list (= check everything).
-  let (env_bytes, check_list, const_count) =
-    if let Some(name) = &args.constant {
-      if args.meta {
-        bail!("--constant is Anon-only and cannot be combined with --meta");
-      }
-      constant_inputs(&whole_env_bytes, name, args.skip_deps)?
-    } else {
-      let cc = count_checkable(&whole_env_bytes, args.meta);
-      (whole_env_bytes, Vec::new(), cc)
-    };
+  if consts.is_empty() {
+    run_one(&client, &args, &whole_env_bytes, None).await?;
+  } else {
+    for name in &consts {
+      run_one(&client, &args, &whole_env_bytes, Some(name)).await?;
+    }
+  }
+  Ok(())
+}
+
+async fn run_one<C: Prover + Sync>(
+  client: &C,
+  args: &Args,
+  whole_env_bytes: &[u8],
+  name: Option<&str>,
+) -> Result<()> {
+  // A name ships a closure sub-env + a check-list (Anon only); otherwise the
+  // whole env ships with an empty check-list (= check everything).
+  let (env_bytes, check_list, const_count) = match name {
+    Some(n) => constant_inputs(whole_env_bytes, n, args.skip_deps)?,
+    None => {
+      let cc = count_checkable(whole_env_bytes, args.meta);
+      (whole_env_bytes.to_vec(), Vec::new(), cc)
+    },
+  };
 
   // Three guest inputs, in order:
   //   1. 1-byte mode flag (0 = Anon / 1 = Meta).
-  //   2. Serialized Ixon env (whole env, or a closure sub-env under
-  //      `--constant`). Anon enumerates work in-guest via
-  //      `ix_kernel::anon_work::build_anon_work`; Meta walks `env.named`.
-  //   3. Check-list of packed primary addresses (`--constant`), or empty
-  //      to check every work item.
+  //   2. Serialized Ixon env (whole env, or a closure sub-env under --consts).
+  //   3. Check-list of packed primary addresses (--consts), or empty for all.
   let mut stdin = SP1Stdin::new();
   stdin.write::<u8>(&u8::from(args.meta));
   stdin.write_vec(env_bytes);
   stdin.write_vec(check_list);
 
-  let client = ProverClient::from_env().await;
-
   if args.execute {
     let exec_start = Instant::now();
     let (output, report) =
@@ -320,7 +350,8 @@ async fn main() -> Result<()> {
       let cycles = report.total_instruction_count();
       let secs = exec_duration.as_secs_f64();
       let tput = if secs > 0.0 { cycles as f64 / secs } else { 0.0 };
-      let key = args.constant.clone().unwrap_or_else(|| "env".to_string());
+      let key =
+        name.map(|s| s.to_string()).unwrap_or_else(|| "env".to_string());
       write_json_entry(
         path,
         &key,
@@ -364,7 +395,7 @@ async fn main() -> Result<()> {
   let verify_duration = verify_start.elapsed();
   println!("proof verified in {:.3}s", verify_duration.as_secs_f64());
   if let Some(path) = &args.json {
-    let key = args.constant.clone().unwrap_or_else(|| "env".to_string());
+    let key = name.map(|s| s.to_string()).unwrap_or_else(|| "env".to_string());
     write_json_entry(
       path,
       &key,
@@ -376,3 +407,68 @@ async fn main() -> Result<()> {
   }
   Ok(())
 }
+
+#[cfg(test)]
+mod cli_tests {
+  use clap::Parser;
+
+  use super::{Args, collect_consts};
+
+  fn parse(argv: &[&str]) -> Args {
+    Args::try_parse_from(
+      std::iter::once("sp1-host").chain(argv.iter().copied()),
+    )
+    .expect("parse ok")
+  }
+  fn parse_err(argv: &[&str]) -> String {
+    Args::try_parse_from(
+      std::iter::once("sp1-host").chain(argv.iter().copied()),
+    )
+    .unwrap_err()
+    .to_string()
+  }
+
+  #[test]
+  fn consts_splits_on_comma() {
+    let a = parse(&["--consts", "Nat.add_comm,Nat.succ"]);
+    assert_eq!(a.consts, vec!["Nat.add_comm", "Nat.succ"]);
+  }
+
+  #[test]
+  fn consts_repeatable_and_comma_lists_stack() {
+    let a = parse(&["--consts", "a", "--consts", "b,c"]);
+    assert_eq!(a.consts, vec!["a", "b", "c"]);
+  }
+
+  #[test]
+  fn skip_deps_requires_consts() {
+    assert!(parse_err(&["--skip-deps"]).contains("--consts"));
+  }
+
+  #[test]
+  fn json_alone_ok() {
+    // sp1-host's --json is not gated on --consts (keys by "env" when no name).
+    let a = parse(&["--json", "out.json"]);
+    assert_eq!(a.json.as_deref(), Some(std::path::Path::new("out.json")));
+  }
+
+  #[test]
+  fn consts_file_alone_ok() {
+    let a = parse(&["--consts-file", "names.txt"]);
+    assert_eq!(
+      a.consts_file.as_deref(),
+      Some(std::path::Path::new("names.txt"))
+    );
+  }
+
+  #[test]
+  fn collect_unions_and_dedups() {
+    let path = std::env::temp_dir().join("sp1_host_cli_test_consts.txt");
+    std::fs::write(&path, "a\nb\n# comment\n  c  \n\na\n").expect("write");
+    let a =
+      parse(&["--consts", "a,d", "--consts-file", path.to_str().unwrap()]);
+    let got = collect_consts(&a).expect("collect");
+    assert_eq!(got, vec!["a", "d", "b", "c"]);
+    let _ = std::fs::remove_file(&path);
+  }
+}
diff --git a/zisk/Cargo.toml b/zisk/Cargo.toml
index 308731d2..4f6c64ea 100644
--- a/zisk/Cargo.toml
+++ b/zisk/Cargo.toml
@@ -21,4 +21,4 @@ panic = "abort"
 # `blake3-precompile` branch above (now that v0.18.0 + the blake3 shim are
 # pushed). To iterate against a local checkout instead, re-add:
 #   [patch."https://github.com/argumentcomputer/zisk.git"]
-#   zisk-sdk = { path = "/home/ubuntu/zisk/sdk" }
+#   zisk-sdk = { path = "/path/to/zisk/sdk" }
diff --git a/zisk/host/src/main.rs b/zisk/host/src/main.rs
index b3c89ae5..fb68d981 100644
--- a/zisk/host/src/main.rs
+++ b/zisk/host/src/main.rs
@@ -186,44 +186,29 @@ struct Args {
   #[arg(long)]
   dump_input: Option<PathBuf>,
 
-  /// Check a single constant selected by its Lean NAME (e.g.
-  /// "ByteArray.utf8DecodeChar?_utf8EncodeChar_append"), with no manifest or
-  /// range plumbing: the name is resolved through the env's `named` metadata to
-  /// its ingress block, and the guest receives only its closure sub-env. By
-  /// default this is the **full-closure** typecheck — the constant *and* its
-  /// whole dependency closure are re-checked (matching `Ix.Claim.check addr
-  /// none`, the default of the Aiur `bench-typecheck --constant`). Pass
-  /// `--skip-deps` for a subject-only check (deps trusted). Composes with
-  /// `--execute` (cycles), plain prove (single leaf, subject-bound + verified),
-  /// and `--dump-input` (write the stdin for ziskemu profiling). Requires
-  /// exactly one `--ixe`. Note: a member of a mutual block selects the whole
-  /// block's work item (the kernel checks blocks atomically).
+  /// Comma-separated Lean names to check (each: closure sub-env → one leaf).
+  #[arg(
+    long,
+    value_delimiter = ',',
+    conflicts_with_all = ["shard_plan", "only_shard", "store_dir"]
+  )]
+  consts: Vec<String>,
+
+  /// Additional names from a file (one per line, `#` comments); unions with --consts.
   #[arg(long, conflicts_with_all = ["shard_plan", "only_shard", "store_dir"])]
-  constant: Option<String>,
-
-  /// Modifies `--constant`: check only the named constant itself, trusting its
-  /// dependencies (subject-only), instead of re-checking its whole transitive
-  /// closure. Reserved for constants too expensive to full-closure-check that
-  /// also can't be sharded. Same flag/semantics as the Aiur
-  /// `bench-typecheck --skip-deps`.
-  #[arg(long, requires = "constant")]
+  consts_file: Option<PathBuf>,
+
+  /// With --consts: check each subject only, trusting its deps.
+  #[arg(long, requires = "consts")]
   skip_deps: bool,
 
-  /// Write the neutral per-constant results JSON `{ "<name>": { … } }` to this
-  /// path (execute → cycles/execute-time/throughput/peak-rss; prove →
-  /// prove-time/steps/peak-rss). Written only on a clean run (zero failures),
-  /// so a present file always holds a valid measurement. Requires `--constant`.
-  /// This is the machine-readable source the CI bench driver merges; the
-  /// human-readable summary still prints regardless.
-  #[arg(long, requires = "constant")]
+  /// Write per-constant results JSON `{ "<name>": { … } }` (accumulated across --consts).
+  #[arg(long, requires = "consts")]
   json: Option<PathBuf>,
 
-  /// Write per-phase timings (`{"span","seconds"}` JSON Lines) to this path via
-  /// tracing-texray's sink, for the CI drill-down. The host records its
-  /// `execute` / `prove` phases here; any zisk-sdk tracing spans nested under
-  /// them are captured too.
+  /// Enable tracing-texray; with --json, per-phase spans are written to <json>.spans.
   #[arg(long)]
-  texray_json: Option<PathBuf>,
+  texray: bool,
 }
 
 /// Peak resident set size (bytes) across this process *and its children*, from
@@ -237,19 +222,57 @@ fn peak_rss_bytes() -> Option<u64> {
   }
 }
 
-/// Write the neutral per-constant entry `{ "<name>": <metrics> }` to `path`
-/// (the shape `run.sh` merges with `jq -s`). serde_json handles key escaping so
-/// arbitrary Lean names are safe.
+/// Merge the per-constant entry `{ "<name>": <metrics> }` into the neutral
+/// results JSON at `path`. An existing object is loaded and the new key is
+/// inserted (overwriting on collision; contents that fail to parse as a JSON
+/// object are silently dropped), so a multi-const run (`--consts a,b,c`)
+/// accumulates one map with an entry per name. Written after every constant, so
+/// an external `timeout` still leaves a complete file of the entries so far.
+/// serde_json handles key escaping so arbitrary Lean names are safe.
 fn write_json_entry(
   path: &PathBuf,
   name: &str,
   metrics: serde_json::Value,
 ) -> Result<()> {
-  let entry = serde_json::json!({ name: metrics });
-  std::fs::write(path, serde_json::to_string(&entry)?)
+  let mut map: serde_json::Map<String, serde_json::Value> =
+    match std::fs::read(path) {
+      Ok(bytes) => serde_json::from_slice(&bytes).unwrap_or_default(),
+      Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+        serde_json::Map::new()
+      },
+      Err(e) => return Err(anyhow::anyhow!("read {}: {e}", path.display())),
+    };
+  map.insert(name.to_string(), metrics);
+  std::fs::write(path, serde_json::to_string(&serde_json::Value::Object(map))?)
     .map_err(|e| anyhow::anyhow!("write {}: {e}", path.display()))
 }
 
+/// Union `--consts` (comma-list) with names read from `--consts-file` (one per
+/// line, `#` comments and blank lines dropped), de-duplicated in first-seen
+/// order so the same name is never re-proven.
+fn collect_consts(args: &Args) -> Result<Vec<String>> {
+  let mut seen: std::collections::HashSet<String> =
+    std::collections::HashSet::new();
+  let mut out: Vec<String> = Vec::new();
+  for name in &args.consts {
+    let trimmed = name.trim();
+    if !trimmed.is_empty() && seen.insert(trimmed.to_string()) {
+      out.push(trimmed.to_string());
+    }
+  }
+  if let Some(path) = &args.consts_file {
+    let contents = std::fs::read_to_string(path)
+      .map_err(|e| anyhow::anyhow!("read {}: {e}", path.display()))?;
+    for line in contents.lines() {
+      let name = line.split('#').next().unwrap_or("").trim();
+      if !name.is_empty() && seen.insert(name.to_string()) {
+        out.push(name.to_string());
+      }
+    }
+  }
+  Ok(out)
+}
+
 /// 112-byte public output of one shard-guest proof.
 ///
 /// First 44 bytes are the original prefix (env_hash, range_start/range_end —
@@ -789,10 +812,7 @@ fn check_input_coherence(
   Ok(failures)
 }
 
-fn build_client(
-  gpu: bool,
-  asm: bool,
-) -> Result<EmbeddedClient> {
+fn build_client(gpu: bool, asm: bool) -> Result<EmbeddedClient> {
   // Executor choice. The default is the Assembly executor (`asm = true`,
   // i.e. no `--emulator`): it is markedly faster at trace generation and is
   // the prerequisite for the hints stream. It historically broke under our
@@ -825,7 +845,7 @@ fn build_client(
   builder.build()
 }
 
-/// Check a single constant chosen by Lean NAME (the `--constant` path).
+/// Check a single constant chosen by Lean NAME (one iteration of `--consts`).
 /// Resolve name → constant address via the env's `named` metadata, map to its
 /// ingress block's work item, and ship its closure sub-env. By default the
 /// check-list is the ENTIRE closure (full-closure typecheck); with
@@ -955,7 +975,10 @@ async fn run_constant(
   result.get_public_values_slice(&mut buf);
   let publics = ShardPublics::decode(&buf);
   let leaf_ms = result.get_proving_time();
-  tracing_texray::json_sink::record_manual("zisk/prove", leaf_ms as f64 / 1000.0);
+  tracing_texray::json_sink::record_manual(
+    "zisk/prove",
+    leaf_ms as f64 / 1000.0,
+  );
   let expected = subject_of_cover(&cover);
   if *expected.as_bytes() != publics.subject_root {
     bail!(
@@ -1672,9 +1695,11 @@ async fn main() -> Result<()> {
   // own tracing spans — which requires composing it with the SDK's global logger
   // (`zisk_sdk::setup_logger`), currently the sole subscriber.
   tracing_texray::rss_sampler::start(std::time::Duration::from_millis(50));
-  if let Some(path) = &args.texray_json {
-    if let Some(p) = path.to_str() {
-      let _ = tracing_texray::json_sink::to_file(p);
+  // With --texray + --json, per-phase span timings land at `<json>.spans` as
+  // JSON Lines — the CI drill-down input.
+  if args.texray {
+    if let Some(json) = args.json.as_ref().and_then(|p| p.to_str()) {
+      let _ = tracing_texray::json_sink::to_file(format!("{json}.spans"));
     }
   }
 
@@ -1700,9 +1725,9 @@ async fn main() -> Result<()> {
       "--shard-plan requires exactly one --ixe input (the env the manifest was built for)"
     );
   }
-  // `--constant` selects a named constant from one env.
-  if args.constant.is_some() && inputs.len() > 1 {
-    bail!("--constant requires exactly one --ixe input");
+  // `--consts` selects named constants from one env.
+  if !args.consts.is_empty() && inputs.len() > 1 {
+    bail!("--consts requires exactly one --ixe input");
   }
 
   // ---- Plan every input up front (parse + shard). ----
@@ -1766,8 +1791,7 @@ async fn main() -> Result<()> {
   let grand_target_count: usize = plans.iter().map(|p| p.target_count).sum();
   let total_leaves: usize = plans.iter().map(|p| p.shards.len()).sum();
 
-  let client =
-    build_client(args.gpu, !args.emulator)?;
+  let client = build_client(args.gpu, !args.emulator)?;
   client.setup(&SHARD_PROGRAM).run()?.await?;
   // Skip agg-guest setup unless we'll produce more than one leaf proof.
   // The shard-plan path sets up the agg program itself, after its leaves.
@@ -1796,9 +1820,12 @@ async fn main() -> Result<()> {
     return Ok(());
   }
 
-  // ---- Single named constant (no manifest/range). ----
-  if let Some(name) = &args.constant {
-    run_constant(&client, &plans[0], name, &args).await?;
+  // ---- Named constants (no manifest/range). Loops one leaf per name. ----
+  let consts = collect_consts(&args)?;
+  if !consts.is_empty() {
+    for name in &consts {
+      run_constant(&client, &plans[0], name, &args).await?;
+    }
     return Ok(());
   }
 
@@ -2090,3 +2117,70 @@ mod closure_tests {
     );
   }
 }
+
+#[cfg(test)]
+mod cli_tests {
+  use clap::Parser;
+
+  use super::{Args, collect_consts};
+
+  fn parse(argv: &[&str]) -> Args {
+    Args::try_parse_from(
+      std::iter::once("zisk-host").chain(argv.iter().copied()),
+    )
+    .expect("parse ok")
+  }
+  fn parse_err(argv: &[&str]) -> String {
+    Args::try_parse_from(
+      std::iter::once("zisk-host").chain(argv.iter().copied()),
+    )
+    .unwrap_err()
+    .to_string()
+  }
+
+  #[test]
+  fn consts_splits_on_comma() {
+    let a = parse(&["--consts", "Nat.add_comm,Nat.succ,String.append"]);
+    assert_eq!(a.consts, vec!["Nat.add_comm", "Nat.succ", "String.append"]);
+  }
+
+  #[test]
+  fn consts_repeatable_and_comma_lists_stack() {
+    let a = parse(&["--consts", "a", "--consts", "b,c"]);
+    assert_eq!(a.consts, vec!["a", "b", "c"]);
+  }
+
+  #[test]
+  fn skip_deps_requires_consts() {
+    assert!(parse_err(&["--skip-deps"]).contains("--consts"));
+  }
+
+  #[test]
+  fn json_requires_consts() {
+    assert!(parse_err(&["--json", "out.json"]).contains("--consts"));
+  }
+
+  #[test]
+  fn consts_conflicts_with_shard_plan() {
+    let s = parse_err(&["--consts", "a", "--shard-plan", "p.ixes"]);
+    assert!(s.contains("shard-plan") || s.contains("shard_plan"));
+  }
+
+  #[test]
+  fn consts_conflicts_with_only_shard() {
+    let s = parse_err(&["--consts", "a", "--only-shard", "1"]);
+    assert!(s.contains("only-shard") || s.contains("only_shard"));
+  }
+
+  #[test]
+  fn collect_unions_and_dedups() {
+    let dir = std::env::temp_dir();
+    let path = dir.join("zisk_host_cli_test_consts.txt");
+    std::fs::write(&path, "a\nb\n# comment\n  c  \n\na\n").expect("write");
+    let a =
+      parse(&["--consts", "a,d", "--consts-file", path.to_str().unwrap()]);
+    let got = collect_consts(&a).expect("collect");
+    assert_eq!(got, vec!["a", "d", "b", "c"]);
+    let _ = std::fs::remove_file(&path);
+  }
+}

From 53f64713431e25e129a0c4000179ef2e355071b0 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 2 Jul 2026 15:57:45 -0400
Subject: [PATCH 10/27] feat(ci): compile as a !benchmark backend; full-closure
 zkVM parity; threshold + doc cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**`compile` as a !benchmark backend**
- bench.py: `DEFAULT_MODE["compile"]="compile"`,
  `MAIN_TESTBEDS[("compile","compile")]="ix-compile-x64-32x"`,
  `METRICS[("compile","compile")]=["compile-time","throughput","file-size",
  "constants"]`. `cmd_manifest` short-circuits on `--backend compile`: writes
  a one-line names.txt with the CamelCase env slug (`InitStd` / `Lean` /
  `Mathlib` / `FLT`) and skips the CSV. New `--backend` arg on manifest.
- run.sh: setup step teed to `$compile_log` for every backend; `compile`
  ignores REUSE_IXE (needs a fresh compile to measure) and parses
  `##benchmark## <elapsed_ms> <bytes> <constants>` into the neutral
  `{ "<CamelCase>": {"compile-time","file-size","constants","throughput"} }`
  shape. Compile job on main (`bench-main.yml` testbed
  `ix-compile-x64-32x`) is the same run bencher already stores.
- bench-pr.yml: `!benchmark ([aiur] [zisk] [sp1] [ooc] [compile] | all)`;
  manifest step passes `--backend "$BACKEND"`.

**zkVM parity with ooc (drop `--skip-deps`)**
- run.sh zisk/sp1 branch: `--consts <c>` without `--skip-deps` so the check
  is directly comparable to `ooc`'s `ix check-rs --consts <c>` (also
  full-closure). Comparing subject-only zkVM against full-closure ooc mixed
  in-circuit-vs-out-of-circuit overhead with scope; both are now
  full-closure, so the delta isolates just the overhead.

**Threshold semantics for deterministic-but-directional measures**
- bench-main.yml zkvm-execute (now zkvm-typecheck): `cycles`, `shards`,
  `max-shard-cycles` change from pinned `0/0` to `upper 0 / lower _`.
  These are deterministic (no noise) but a real guest / packer improvement
  legitimately drops them; the old pin flagged wins as regressions.
  `constants` (definitional count) stays `0/0`.

**docs/benchmarking.md — stale content swept**
- Removed the "design-level skips post a note explaining why" paragraph
  (that machinery was already removed).
- Replaced the `cheap → prove / heavy → execute-only` fallback description
  with the RAM-watchdog reality (attempts prove for every primary; OOMs go
  through as `{"oom": true}` sentinels rendered `OOM`).
- Backends table adds `compile`; drops metrics `aiur` doesn't actually emit.
- "Constant set": ~20 primaries in ~68 total (was ~11 in ~60); notes tier
  is informational-only now, rows may omit trailing zeros, and `compile`
  short-circuits the CSV.
- Per-measure threshold breakdown replaces the incorrect "Deterministic
  measures … are pinned exactly" claim.
- "Not yet covered" now lists three real TODOs (zkVM prove, per-constant
  phase drill-down, non-main base branches) with pointers to the in-code
  TODO markers.

**Code comment audit** — every touched comment/docstring re-read; stale
references to `--skip-deps` parity, tier-based fallback, sub-span
drill-down, and cache-hit rendering all updated to match implementation.
---
 .github/scripts/bench.py         |  58 +++++++++----
 .github/scripts/run.sh           |  65 +++++++++++----
 .github/workflows/bench-main.yml |  14 ++--
 .github/workflows/bench-pr.yml   |  22 ++---
 docs/benchmarking.md             | 134 +++++++++++++++++++++----------
 5 files changed, 203 insertions(+), 90 deletions(-)

diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index b6990d31..4a89cbf2 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -21,10 +21,21 @@
 
 # ───────────────────────── parse ─────────────────────────
 # Default mode per backend. Aiur is the only backend with a real choice:
-# `prove` (default) is the full pipeline; `execute` skips Phase 2 (--execute-only)
-# and reports the fft-cost / execute-time subset. Users opt in with the bare
-# `execute` token in `!benchmark`. The zkVMs and ooc are always execute.
-DEFAULT_MODE = {"aiur": "prove", "zisk": "execute", "sp1": "execute", "ooc": "execute"}
+# `prove` (default) is the full pipeline; `execute` skips Phase 2
+# (`--execute-only`) and reports the `fft-cost` / `execute-time` subset —
+# users opt in via the bare `execute` token in `!benchmark`. The zkVMs and
+# `ooc` always run execute; `compile` runs `ix compile`.
+DEFAULT_MODE = {
+    "aiur":    "prove",
+    "zisk":    "execute",
+    "sp1":     "execute",
+    "ooc":     "execute",
+    # `compile` benchmarks `ix compile <env>.lean → <env>.ixe` — the same job
+    # `bench-main.yml`'s `compile` matrix uploads under testbed `ix-compile-*`.
+    # Mode is `compile` (there's no execute/prove split); the "benchmark name"
+    # in bencher is the CamelCase env slug (`InitStd`, `Lean`, `Mathlib`, `FLT`).
+    "compile": "compile",
+}
 BACKENDS = tuple(DEFAULT_MODE)
 ENVS = ("initStd", "lean", "mathlib")
 CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_FULL"}
@@ -101,10 +112,18 @@ def mode_for(b):
 
 # ──────────────────────── manifest ────────────────────────
 def cmd_manifest(a):
+    # `compile` doesn't consume Vectors.csv — the "benchmark name" on bencher
+    # is the CamelCase env slug (`initStd` → `InitStd`), one per cell.
+    if a.backend == "compile":
+        name = a.env[:1].upper() + a.env[1:]
+        with open(a.out, "w") as f:
+            f.write(name + "\n")
+        print(f"count=1\ntier=n/a")
+        return
     # prove defaults to the cheap tier to keep the full set bounded; the curated
-    # primary subset is exempt — run.sh gates prove vs execute-only per-constant
-    # on the tier column, so all primaries are selected here (heavy ones fall
-    # back to execute-only in run.sh, not by being excluded up here).
+    # primary subset is exempt — run.sh's aiur prove path attempts prove for
+    # every primary (RAM watchdog catches OOMs), so all primaries are selected
+    # here regardless of tier.
     tier = a.tier or ("cheap" if (a.mode == "prove" and not a.primary) else "all")
     names = []
     with open(a.csv) as f:
@@ -141,11 +160,12 @@ def cmd_manifest(a):
 # prove-time / no throughput). Bencher stores only the prove set for main —
 # `execute` mode filters that same JSON down to the execute-side columns.
 METRICS = {
-    ("aiur", "prove"):    ["fft-cost", "execute-time", "prove-time", "peak-rss"],
-    ("aiur", "execute"):  ["fft-cost", "execute-time", "peak-rss"],
-    ("zisk", "execute"):  ["cycles", "execute-time", "throughput", "peak-rss"],
-    ("sp1",  "execute"):  ["cycles", "execute-time", "throughput", "peak-rss"],
-    ("ooc",  "execute"):  ["throughput", "check-time", "peak-rss"],
+    ("aiur",    "prove"):    ["fft-cost", "execute-time", "prove-time", "peak-rss"],
+    ("aiur",    "execute"):  ["fft-cost", "execute-time", "peak-rss"],
+    ("zisk",    "execute"):  ["cycles", "execute-time", "throughput", "peak-rss"],
+    ("sp1",     "execute"):  ["cycles", "execute-time", "throughput", "peak-rss"],
+    ("ooc",     "execute"):  ["throughput", "check-time", "peak-rss"],
+    ("compile", "compile"):  ["compile-time", "throughput", "file-size", "constants"],
 }
 
 
@@ -343,11 +363,12 @@ def cmd_comment(a):
 MAIN_TESTBEDS = {
     # `aiur execute` uses the same testbed as `aiur prove` — bencher stores
     # only prove and the execute columns are extracted from that JSON.
-    ("aiur", "prove"):    "aiur-check-x64-32x",
-    ("aiur", "execute"):  "aiur-check-x64-32x",
-    ("zisk", "execute"):  "zisk-check-x64-32x",
-    ("sp1",  "execute"):  "sp1-check-x64-32x",
-    ("ooc",  "execute"):  "ooc-check-x64-32x",
+    ("aiur",    "prove"):    "aiur-check-x64-32x",
+    ("aiur",    "execute"):  "aiur-check-x64-32x",
+    ("zisk",    "execute"):  "zisk-check-x64-32x",
+    ("sp1",     "execute"):  "sp1-check-x64-32x",
+    ("ooc",     "execute"):  "ooc-check-x64-32x",
+    ("compile", "compile"):  "ix-compile-x64-32x",
 }
 
 
@@ -423,6 +444,9 @@ def main():
     m.add_argument("--csv", required=True); m.add_argument("--env", required=True)
     m.add_argument("--mode", required=True); m.add_argument("--tier", default="")
     m.add_argument("--shard", default="0"); m.add_argument("--out", required=True)
+    m.add_argument("--backend", default="",
+                   help="Backend for this cell (used to special-case `compile`, "
+                        "which doesn't consume Vectors.csv).")
     m.add_argument("--primary", action="store_true",
                    help="Restrict to the primary subset (the primary=1 column).")
     m.set_defaults(fn=cmd_manifest)
diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
index 10992130..87f52e2c 100644
--- a/.github/scripts/run.sh
+++ b/.github/scripts/run.sh
@@ -7,14 +7,16 @@
 #   run.sh <repo_dir> <env> <backend> <mode> <names_file> <out_json>
 #     repo_dir : checked-out worktree (has .lake/build/bin/{ix,bench-typecheck})
 #     env      : initStd | lean | mathlib  (any case; used verbatim for <env>.ixe)
-#     backend  : aiur | zisk | sp1 | ooc
-#     mode     : execute | prove
+#     backend  : aiur | zisk | sp1 | ooc | compile
+#     mode     : execute | prove | compile
 #
 # `ix` / `bench-typecheck` come from <repo_dir> (so base measures base's code, PR
-# the PR's — the caller puts <repo_dir>/.lake/build/bin on PATH). Each constant
-# is its own subprocess (a failure/timeout drops only that row). Only JSON is
-# written to stdout — tool output and `::warning::`/`::notice::` go to logs /
-# stderr so they never corrupt the merged JSON stream.
+# the PR's — the caller puts <repo_dir>/.lake/build/bin on PATH). For the
+# per-constant backends (aiur, zisk, sp1, ooc), each name is its own subprocess
+# so a failure/timeout drops only that row. The `compile` backend is per-env
+# (the env slug is the benchmark name) and measures the compile step directly.
+# Only JSON is written to stdout — tool output and `::warning::`/`::notice::`
+# go to logs / stderr so they never corrupt the merged JSON stream.
 set -uo pipefail
 
 repo=${1:?repo_dir}; benv=${2:?env}; backend=${3:?backend}; mode=${4:?mode}
@@ -81,17 +83,20 @@ case "$(printf '%s' "$benv" | tr '[:upper:]' '[:lower:]')" in
   *) echo "unknown env: $benv" >&2; exit 2 ;;
 esac
 
+tmp=$(mktemp -d)
+compile_log="$tmp/compile.log"
+
+# `compile` backend needs a fresh compile to measure — never honor REUSE_IXE.
 ixe="$repo/$benv.ixe"
-if [ "${REUSE_IXE:-0}" = 1 ] && [ -f "$ixe" ]; then
+if [ "${REUSE_IXE:-0}" = 1 ] && [ "$backend" != compile ] && [ -f "$ixe" ]; then
   echo "reusing existing $ixe (REUSE_IXE)" >&2
 else
   echo "::group::ix compile $module → $benv.ixe ($backend/$mode)"
-  "$repo/.lake/build/bin/ix" compile "$repo/Benchmarks/Compile/$module.lean" --out "$ixe"
+  "$repo/.lake/build/bin/ix" compile "$repo/Benchmarks/Compile/$module.lean" \
+    --out "$ixe" 2>&1 | tee "$compile_log"
   echo "::endgroup::"
 fi
 
-tmp=$(mktemp -d)
-
 case "$backend" in
   aiur)
     # One bench-typecheck per constant (isolation + per-constant peak-rss).
@@ -162,8 +167,12 @@ case "$backend" in
       [ -z "$c" ] && continue
       slug=$(printf '%s' "$c" | tr '/ .:' '____')
       res="$tmp/$slug.json"; log="$tmp/$slug.log"; spans="$res.spans"
+      # Full-closure check (no --skip-deps) so this is directly comparable to
+      # the ooc `ix check-rs --consts` run — the delta then isolates the
+      # in-circuit-vs-out-of-circuit overhead rather than mixing in subject-
+      # only vs full-closure scope.
       ( cd "$work" && timeout 25m "$bin" --execute --ixe "$ixe" \
-          --consts "$c" --skip-deps --json "$res" --texray ) \
+          --consts "$c" --json "$res" --texray ) \
         > "$log" 2>&1 \
         || { echo "::warning::$backend execute '$c' failed/timed out; dropping" >&2; continue; }
       # The host writes $res only on a clean (zero-failure) run.
@@ -178,9 +187,10 @@ case "$backend" in
     # off the structured line
     #   `##check## <elapsed_ms> <passed> <failures> <total> <peak-rss-bytes>`
     # (peak-rss from ix check's tracing-texray tree sampler): the whole env in
-    # parallel (`--anon`, keyed by env), and a per-primary subject check
-    # (`--consts`, keyed by constant) for an apples-to-apples baseline next to
-    # the zkVM `--skip-deps` execute.
+    # parallel (`--anon`, keyed by env), and a per-primary full-closure check
+    # (`--consts`, keyed by constant) — apples-to-apples with the zkVM execute
+    # above (also full-closure now), so the delta isolates in-circuit vs
+    # out-of-circuit overhead.
     ooc_one() {  # <label> <ix-check-args…>  → prints one JSON object
       local label="$1"; shift
       local log="$tmp/n.out"
@@ -209,6 +219,33 @@ case "$backend" in
     emit_empty
     ;;
 
+  compile)
+    # `ix compile <env>.lean → <env>.ixe` is the benchmark; the compile step
+    # above always ran fresh for this backend (REUSE_IXE ignored) and teed to
+    # `$compile_log`. `ix compile` emits `##benchmark## <elapsed_ms> <bytes>
+    # <constants>` which we parse into the neutral results shape. The bencher
+    # benchmark name is the CamelCase env slug (matches bench-main.yml's
+    # matrix.bench keys: `InitStd`, `Lean`, `Mathlib`, `FLT`).
+    line=$(grep '^##benchmark##' "$compile_log" 2>/dev/null | tail -1)
+    if [ -z "$line" ]; then
+      echo "::warning::compile: no ##benchmark## line in $compile_log; dropping" >&2
+      emit_empty
+    else
+      elapsed_ms=$(echo "$line" | awk '{print $2}')
+      bytes=$(echo "$line" | awk '{print $3}')
+      constants=$(echo "$line" | awk '{print $4}')
+      benv_cc=$(printf '%s' "$benv" | awk '{print toupper(substr($0,1,1)) substr($0,2)}')
+      elapsed_s=$(awk -v e="$elapsed_ms" 'BEGIN{printf "%.3f", e/1000}')
+      throughput=$(awk -v c="$constants" -v e="$elapsed_ms" \
+        'BEGIN{ if (e>0) printf "%.2f", c*1000/e; else print 0 }')
+      jq -n --arg n "$benv_cc" \
+            --argjson t "$elapsed_s" --argjson b "$bytes" \
+            --argjson c "$constants" --argjson tp "$throughput" \
+        '{($n): {"compile-time":$t,"file-size":$b,"constants":$c,"throughput":$tp}}' \
+        > "$out"
+    fi
+    ;;
+
   *) echo "unknown backend: $backend" >&2; exit 2 ;;
 esac
 echo "rows in $out: $(jq 'length' "$out" 2>/dev/null || echo '?')" >&2
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index 228e60dd..284d81f8 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -1,4 +1,4 @@
-name: Benchmarks
+name: Benchmark main
 
 # Benchmarks tracked on Bencher on every push to main, all reusing the one
 # compiled `.ixe` so the compiler runs once:
@@ -320,8 +320,10 @@ jobs:
           # so span timings are tracked over time alongside the headline metrics.
           jq 'map_values((.phases // {}) as $p | del(.phases) | (. + ($p | with_entries(.key |= "phase:" + .))) | map_values({value: .}))' neutral.json > bench.json
           cat bench.json
-      # cycles / shards / max-shard-cycles are deterministic per guest ELF →
-      # pinned (0/0). execute-time / peak-rss / throughput are noisy wall-clock →
+      # cycles / shards / max-shard-cycles are deterministic per guest ELF, but
+      # a real guest / packer improvement legitimately drops them — upper-only
+      # 0% bound (flag regressions, let wins through), like `fft-cost` on the
+      # aiur job. execute-time / peak-rss / throughput are noisy wall-clock →
       # percentage bounds (throughput's regression is a drop).
       - uses: ./.github/actions/bencher-track
         with:
@@ -333,13 +335,13 @@ jobs:
           thresholds: |
             --threshold-measure cycles --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
-            --threshold-lower-boundary 0
+            --threshold-lower-boundary _
             --threshold-measure shards --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
-            --threshold-lower-boundary 0
+            --threshold-lower-boundary _
             --threshold-measure max-shard-cycles --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
-            --threshold-lower-boundary 0
+            --threshold-lower-boundary _
             --threshold-measure execute-time --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
             --threshold-lower-boundary _
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index c15ea302..07340d88 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -1,20 +1,21 @@
 # `!benchmark` PR command: run the curated constant set (Benchmarks/Vectors.csv)
 # through chosen prover backend(s) and post a main-vs-PR comparison table.
 #
-#   !benchmark ([aiur] [zisk] [sp1] [ooc] | all) [execute]
+#   !benchmark ([aiur] [zisk] [sp1] [ooc] [compile] | all) [execute]
 #   BENCH_ENVS=initStd,mathlib     # which compiled envs (default initStd)
-#   BENCH_FULL=1                   # run the full curated set, not the ~11 primary
-#   BENCH_TIER=cheap|heavy|all     # override the mode default (execute=all, prove=cheap)
+#   BENCH_FULL=1                   # run the full curated set, not just primary
+#   BENCH_TIER=cheap|heavy|all     # tier override (default: all; cheap for prove with BENCH_FULL=1)
 #   BENCH_SHARD=1                  # restrict to the multi-shard target constants
 #   RUST_LOG=info                  # passthrough env (allowlisted)
 #
 # Mode is fixed per backend: `aiur` runs `prove` by default (its report also
 # carries the execute-side columns `fft-cost` / `execute-time`); `zisk` / `sp1`
-# / `ooc` always run `execute`. The optional `execute` token flips aiur to
-# execute-only (skips Phase 2); it's a no-op on other backends. main's numbers
-# come from bencher.dev; the workflow falls back to re-running the base SHA
-# locally only when bencher hasn't ingested it yet (freshly-pushed main whose
-# CI is still running).
+# / `ooc` run `execute`; `compile` runs `ix compile <env>.lean → <env>.ixe`
+# (the same job bench-main.yml uploads under testbed `ix-compile-*`). The
+# optional bare `execute` token flips `aiur` to execute-only (skips Phase 2);
+# on the other backends it's a no-op. main's numbers come from bencher.dev;
+# the workflow falls back to re-running the base SHA locally only when bencher
+# hasn't ingested it yet (freshly-pushed main whose CI is still running).
 name: Benchmark pull requests
 
 on:
@@ -108,13 +109,16 @@ jobs:
           ${{ needs.setup.outputs.passthrough-env }}
           PTENV
       # Select the constants for this cell → names.txt. Defaults to the primary
-      # subset; BENCH_FULL=1 (→ full=1) runs the whole curated set.
+      # subset; BENCH_FULL=1 (→ full=1) runs the whole curated set. The `compile`
+      # backend short-circuits this in bench.py — its "benchmark name" is the
+      # CamelCase env slug, so names.txt gets one line and the CSV is ignored.
       - name: Select constants from Benchmarks/Vectors.csv
         id: man
         run: |
           PRIMARY=--primary; [ "$FULL" = 1 ] && PRIMARY=
           python3 .github/scripts/bench.py manifest \
             --csv Benchmarks/Vectors.csv --env "$BENV" --mode "$MODE" \
+            --backend "$BACKEND" \
             --tier "$TIER" --shard "$SHARD" $PRIMARY --out "$GITHUB_WORKSPACE/names.txt" \
             | tee -a "$GITHUB_OUTPUT"
 
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index 28b0d634..1b11f626 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -6,10 +6,7 @@ the same backend drivers:
 - **`!benchmark` PR comment** (`.github/workflows/bench-pr.yml`) — on demand,
   posts a **main-vs-PR** comparison table on the pull request. main's numbers
   are pulled from bencher.dev via its public reports API (`bench.py fetch-main`);
-  the PR side is measured fresh. Design-level skips (`aiur execute` — redundant
-  with `aiur prove`; `zkVM prove` — no GPU on main; `ooc prove` — no in-circuit
-  prove) are filtered from the matrix at parse time and post a note explaining
-  why. For supported combinations, if bencher hasn't ingested the base SHA yet
+  the PR side is measured fresh. If bencher hasn't ingested the base SHA yet
   (freshly-pushed main whose push CI is still running), the workflow falls
   back to re-running the base SHA locally.
 - **Bencher.dev** (`.github/workflows/bench-main.yml`) — on every push to `main`,
@@ -21,20 +18,28 @@ the same backend drivers:
 
 | backend | what it measures | metrics |
 |---|---|---|
-| `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove) | `fft-cost`, `execute-time`, `prove-time`, `peak-rss`, `constants`, `throughput` |
+| `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove) | `fft-cost`, `execute-time`, `prove-time`, `peak-rss` |
 | `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `peak-rss` |
-| `ooc` | the same kernel run **out-of-circuit and in parallel** (`ix check-rs`) — far faster | `throughput` (constants/s), `check-time`, `peak-rss`, `constants` |
-
-In **prove** mode, `run.sh` proves each `cheap`-tier primary and falls back to
-**execute-only** for the `heavy`-tier ones (a single-shard prove would exceed
-Aiur's ~128 GB RAM ceiling), so every primary still reports metrics. The `ooc` backend reports two views: the **whole env** (`ix check-rs
---anon`, keyed by env) and a **per-primary subject check** (`ix check-rs
---consts`, keyed by constant — apples-to-apples with the zkVM `--skip-deps`
-execute).
-
-All four are driven by `.github/scripts/run.sh` (compile the env `.ixe`, run the
+| `ooc` | the same kernel run **out-of-circuit and in parallel** (`ix check-rs`) — far faster | `throughput`, `check-time`, `peak-rss` |
+| `compile` | `ix compile <env>.lean → <env>.ixe` on the current PR — measures the compile step itself, keyed by CamelCase env slug (`InitStd`, `Lean`, `Mathlib`, `FLT`) | `compile-time`, `throughput`, `file-size`, `constants` |
+
+In **prove** mode, `run.sh` attempts a full prove for every primary — cheap
+*and* heavy. A RAM watchdog (`watch_ram_kill`) samples the process tree's RSS
+every ~3 s and `SIGKILL`s the tree if it exceeds `AIUR_PROVE_MAX_RSS_GB`
+(default 120 GB — 8 GB headroom under a 128 GB runner). When killed, the
+constant records the neutral `{"oom": true}` sentinel and `bench.py compare`
+renders `OOM` cells (with `n/a` Δ%) in the table for that row.
+
+The `ooc` backend reports two views: the **whole env** (`ix check-rs --anon`,
+keyed by env) and a **per-primary full-closure check** (`ix check-rs --consts`,
+keyed by constant). The latter is apples-to-apples with the zkVM execute, which
+is also full-closure, so the delta isolates in-circuit vs out-of-circuit
+overhead rather than mixing in subject-only vs full-closure scope.
+
+All are driven by `.github/scripts/run.sh` (compile the env `.ixe`, run the
 backend, emit a neutral `{ "<name>": { "<metric>": n } }` JSON). The PR workflow
-compares two such JSONs; the bencher workflow wraps one in Bencher Metric Format.
+compares two such JSONs; the bencher workflow wraps one in Bencher Metric
+Format.
 
 ### Peak RAM and the per-phase drill-down (tracing-texray)
 
@@ -48,60 +53,101 @@ sink, one `{"span","seconds"}` per closed span) to a side file, which `run.sh`
 aggregates into a `phases` object on the constant's entry. `aiur` yields a rich
 breakdown (`aiur/execute`, `aiur/witness`, `stark/fri_open`, …) since the prover
 instruments those spans; `zisk`/`sp1` record a single `execute`/`prove` phase;
-`ooc` records none. In a `!benchmark` comparison, `bench.py` renders any
-multi-phase constant as a collapsible **per-phase timing drill-down** (main vs
-PR seconds + Δ%), so a regression can be traced to the phase that moved.
+`ooc` records none. `bench-main.yml` flattens the `phases` object into
+`phase:<span>` measures on the way to bencher, so each span is tracked over
+time. (**TODO**: `bench.py compare` used to emit a per-constant collapsible
+drill-down in the PR comment; that renderer was removed while the primary
+table's flag / threshold semantics were being stabilised — see the TODO in
+`bench.py` at the previous `_phase_details` location. The `phases` data is
+still populated in the neutral JSON, ready to consume.)
 
 ## Constant set — `Benchmarks/Vectors.csv`
 
-One CSV is the single source of truth for *which* constants to run: `name,env,tier,shard_target,primary`. Measurements never live here — they're in each tool's neutral results JSON and in bencher.dev.
+One CSV is the single source of truth for *which* constants to run:
+`name,env,tier,shard_target,primary`. Rows may omit trailing zero fields — the
+parser tolerates 3, 4, or 5 columns, defaulting `shard_target` and `primary`
+to `0`. Measurements never live here; they live in each tool's neutral results
+JSON and in bencher.dev.
 
 - `env` — compile target the constant resolves in (`initStd` / `lean` / `mathlib`).
-- `tier` — `cheap` (prove-feasible on a CI runner) or `heavy` (execute-only; a
-  single-shard prove would OOM).
-- `primary` — the **~11-constant default subset**, spanning shape and the
-  cheap→heavy cost range (3 are shard targets). Everything defaults to this;
-  the full ~60-constant set is opt-in.
-
-`bench.py manifest` selects names by env + mode (`prove`→cheap, `execute`→all) +
-`--primary`. `bench.py compare` renders the PR table.
+- `tier` — `cheap` (prove-feasible on a CI runner under Aiur's ~128 GB RAM
+  ceiling) or `heavy` (a single-shard prove would exceed that ceiling and
+  be killed by the RAM watchdog). Informational-only now that the watchdog
+  does the gating — `run.sh` no longer branches on `tier`.
+- `primary` — the curated **primary subset** (currently ~20 constants across
+  initStd + mathlib), spanning shape and cost range. Default for the
+  `!benchmark` PR comment and the bencher jobs. Set `BENCH_FULL=1` to include
+  everything (~68 total).
+- `shard_target` — marks a heavy constant designated for the manifest-sharded
+  prove path (currently 4 rows).
+
+`bench.py manifest` selects names by env + `--primary` (plus optional
+`--tier`, `--shard`). The `compile` backend short-circuits this — its
+"benchmark" is the env slug itself, so `manifest` writes a one-line
+`names.txt` with the CamelCase env name (`InitStd`, etc.) and skips the CSV.
+`bench.py compare` renders the PR table from the two side JSONs.
 
 ## `!benchmark` grammar
 
 Maintainer comment on a PR:
 
 ```
-!benchmark ([aiur] [zisk] [sp1] [ooc] | all) [execute]
+!benchmark ([aiur] [zisk] [sp1] [ooc] [compile] | all) [execute]
 BENCH_ENVS=initStd,mathlib     # which compiled envs (default initStd)
-BENCH_FULL=1                   # run the full curated set, not the ~11 primary
-BENCH_TIER=cheap|heavy|all     # override the mode default (execute=all, prove=cheap)
+BENCH_FULL=1                   # run the full curated set, not just primary
+BENCH_TIER=cheap|heavy|all     # tier override (default: all; cheap for prove with BENCH_FULL=1)
 BENCH_SHARD=1                  # restrict to the multi-shard target constants
 RUST_LOG=info                  # passthrough env (allowlisted)
 ```
 
 Mode is fixed per backend: `aiur` runs `prove` by default (its report also
 carries the execute-side columns `fft-cost` / `execute-time` alongside
-`prove-time`), while `zisk` / `sp1` / `ooc` always run `execute`. The optional
-bare `execute` token flips `aiur` to execute-only (`bench-typecheck
---execute-only`; skips Phase 2); on the other three backends it's a no-op.
-Defaults: `aiur`, `initStd`, primary subset. Backends fan out as a matrix;
-`main` results are pulled from bencher.dev.
+`prove-time`); `zisk` / `sp1` / `ooc` run `execute`; `compile` runs `ix
+compile`. The optional bare `execute` token flips `aiur` to execute-only
+(`bench-typecheck --execute-only`, skips Phase 2); on other backends it's a
+no-op. Defaults: `aiur`, `initStd`, primary subset. Backends fan out as a
+matrix; each cell is one `(backend, env, mode)` job. main's numbers are
+pulled from bencher.dev.
 
 ## Bencher jobs (`bench-main.yml`)
 
 `build → compile → { prove, zkvm-execute, ooc-check }`, each reporting to its
-own testbed + **workload** (`aiur-check`, `zisk-check`, `sp1-check`, `ooc-check`, `ix-compile`).
-All four typecheck testbeds share the same slug shape:
-`<backend>-check-x64-32x`.
-Deterministic measures (cycles, fft-cost, constants, …) are pinned exactly;
-noisy wall-clock measures (time, RAM, throughput) ride percentage bounds, both
-windowed to the per-workload `bencher-thresholds-reset-<workload>` tag.
+own testbed + **workload** (`aiur-check`, `zisk-check`, `sp1-check`,
+`ooc-check`, `ix-compile`). The four typecheck testbeds share the shape
+`<backend>-check-x64-32x`; the compile job uses `ix-compile-x64-32x`. Every
+bench job runs on the same runner (`warp-ubuntu-latest-x64-32x`).
+
+Threshold semantics per measure kind:
+- **`constants`** — pinned exactly (0/0). A definitional count; either
+  direction is worth flagging (someone added/removed a def).
+- **`fft-cost`, `cycles`, `shards`, `max-shard-cycles`** — deterministic but
+  directional: `upper 0` (any increase is a real regression), `lower _`
+  (drops are legitimate wins — algorithmic improvements, better packing).
+- **`execute-time`, `prove-time`, `check-time`, `compile-time`, `peak-rss`,
+  `file-size`** — noisy wall-clock or size measures: `upper 0.05–0.10`,
+  `lower _`.
+- **`throughput`** — higher-is-better: `upper _`, `lower 0.05–0.10`.
+- **`phase:<span>`** — uploaded for trend visibility, intentionally left
+  un-thresholded (dynamic names + noise); phase-level attention belongs in
+  the PR-comment drill-down once that renderer is reinstated.
+
+All thresholds are windowed to the per-workload
+`bencher-thresholds-reset-<workload>` tag.
 
 To re-baseline a workload after an intended step change, comment
 `!bencher-thresholds-reset <workload|all>` on the merging PR, or run the
-`bencher-thresholds-reset` workflow (`.github/workflows/bencher-thresholds-reset.yml`).
+`bencher-thresholds-reset` workflow
+(`.github/workflows/bencher-thresholds-reset.yml`).
 
 ## Not yet covered
 
 - **zkVM proving** (Zisk/SP1 `prove`) is not wired up — needs a self-hosted
-  GPU runner. Currently zkVMs are execute-only on both surfaces.
+  GPU runner. `bench.py`'s parse layer treats `zisk`/`sp1` as `execute`-only.
+- **Per-constant phase drill-down** in the PR comment (was removed while the
+  primary table's semantics were stabilised; TODO in `bench.py` marks the
+  reinstatement point — the `phases` data is still populated in the
+  neutral JSON and flattened to `phase:<span>` on bencher).
+- **Non-`main` base branches** — `bench.py fetch-main` hardcodes
+  `branch=main`; a PR against a non-main base always falls through to the
+  local base-run path. TODO in `bench.py` lays out the three-step plan
+  (producer / consumer / fallback).

From ca8198c737d97e2662e749e1a9db29708d1b4f74 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 2 Jul 2026 16:05:57 -0400
Subject: [PATCH 11/27] feat(ci): annotate compare table cells with ratio (1.5x
 faster/slower)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Δ% is easier to interpret at scale when paired with a ratio — e.g.
`-33.3% (1.50× faster)` for a big prove-time drop; `+15.2% (1.15× slower)`
for a real execute-time regression. The ratio is only shown when the factor
is >= 1.05× (roughly ±5% in either direction) so sub-noise deltas don't get
`(1.01× slower)` clutter next to `+1.1%`. Cell format:
`{Δ%} [(N.NN× {slower|faster})] [emoji]`.
---
 .github/scripts/bench.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index 4a89cbf2..6eee8c59 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -252,6 +252,15 @@ def _delta(main, pr):
     return (pr - main) / main * 100.0
 
 
+def _ratio(main, pr):
+    """(factor, direction) with `factor` always ≥ 1.0. Metrics are lower-
+    is-better, so `pr > main` reads as slower / larger; `pr < main` as
+    faster / smaller. Returns None if either side is missing or non-positive."""
+    if main is None or pr is None or main <= 0 or pr <= 0:
+        return None
+    return (pr / main, "slower") if pr >= main else (main / pr, "faster")
+
+
 def _load(path):
     try:
         with open(path) as f:
@@ -321,6 +330,12 @@ def _oom(d, n):
             dp = _delta(mv, pv)
             cell = "n/a" if dp is None else f"{dp:+.1f}%"
             if dp is not None:
+                # Ratio only when the change is big enough that "1.18× slower"
+                # carries new signal beyond the percentage — sub-5% deltas would
+                # just add `(1.03× slower)` noise to the cell.
+                r = _ratio(mv, pv)
+                if r is not None and r[0] >= 1.05:
+                    cell += f" ({r[0]:.2f}× {r[1]})"
                 if dp > a.threshold:
                     cell += " ⚠️"; regressed.add(n)
                 elif dp < -a.threshold:

From d92a5c08ed59813be741cb64e3bd9650cc065a93 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 2 Jul 2026 19:51:52 -0400
Subject: [PATCH 12/27] =?UTF-8?q?fix(ci):=20review=20fixes=20=E2=80=94=20f?=
 =?UTF-8?q?ull-closure=20check-rs,=20OOM/polarity/caching,=20dedup?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes from the branch-wide review, plus build-once caching and a dedup pass.

**check-rs full-closure mode (new kernel FFI)**
- `rs_kernel_check_anon_consts`: resolve displayed names via the env's
  `named` map (the zkVM hosts' resolution), then anon-check the seeds'
  FULL dependency closures via the existing `closure_addrs` +
  `build_anon_work` primitives — no sub-env serialization. `skip_deps`
  restricts to subject-only. `index_anon_work` extracted so the whole-env
  and filtered paths share the result-slot indexing.
- `ix check-rs --anon --consts <names>` now runs that (full-closure default,
  `--skip-deps` opt-out — the hosts' CLI shape); meta-mode seeded checks are
  unchanged (subject-only, bisect flows intact). run.sh's ooc per-constant
  view uses it, so ooc-vs-zkVM deltas genuinely isolate in-circuit overhead
  (both sides now full-closure anon; previously ooc was subject-only meta).

**OOM sentinel end-to-end**
- run.sh merges `oom: true` INTO already-measured Phase-1 metrics (was:
  overwrite); compare renders OOM only for the missing metrics.
- `bench.py bmf` (one neutral→BMF converter replacing four hand-copied
  jq/awk pipelines in bench-main.yml) strips the boolean sentinel — one
  OOM row no longer fails the whole bencher upload. This also covers the
  zkVM execute path, which gained its own RAM watchdog + sentinel.
- Watchdog: setsid + process-group kill (reaches all descendants); a kernel
  OOM kill that beats the 3 s sampler (exit 137, no marker) is now labelled
  OOM instead of being silently dropped; per-constant wall-clock timeouts
  (AIUR_PROVE_TIMEOUT 50m / AIUR_EXECUTE_TIMEOUT 25m); $out is re-merged per
  constant so a job-level kill keeps completed rows.

**compare correctness**
- throughput is higher-is-better: flags/worst/ratio direction-aware
  (HIGHER_IS_BETTER + _badness).
- Ratio wording follows metric kind: times faster/slower, bytes
  larger/smaller, counts more/fewer.
- `execute-peak-rss`: Phase-1 RSS high-water sampled at the Phase 1/2
  boundary in bench-typecheck (both modes) and adopted by the zkVM hosts'
  execute JSON — one name for "execute-phase peak" on every backend;
  bare `peak-rss` reserved for prove-phase peaks. aiur-execute compares
  it apples-to-apples against prove-run baselines (raw peak-rss would
  dwarf it).
- One-side-empty tables get a loud note (e.g. CLI-incompatible base).
- Tidy JSON: bench-typecheck emits decimal `JsonNumber`s (`jsonRound`)
  instead of Float's full binary repr.

**fetch-main robustness**
- Retry with backoff + newest-first pagination (base SHA beyond page one
  no longer forces a permanent fallback).
- Exit codes honored by bench-pr.yml: 3 (transient) → local fallback;
  2 (BACKEND_TABLE / bench-main drift) fails the cell loudly.
- ooc whole-env row keyed CamelCase on both sides (benv_cc table in
  run.sh, ENV_CC in bench.py) and admitted past the --names filter.

**build-once + caching (bench-pr.yml)**
- New build job: ix + bench-typecheck built once per head SHA (they embed
  the IxVM kernel/prover), cached as `bench-bins-<sha>`; cells restore
  into .bins/pr instead of per-cell Lean builds. Re-running !benchmark on
  the same commit skips the build.
- Base fallback restores bench-main's own `bench-bins-`/`bench-ixe-`
  caches (toolchain cmp + mathlib-oleans guard) before paying for a
  from-scratch base build. PR-side .ixe cached across cells.
- Cache keys renamed `aiur-bench-bins-`/`aiur-ixe-` →
  `bench-bins-`/`bench-ixe-` (they serve every backend).
- run.sh resolves tools in-tree first, then PATH (`resolve_bin`).

**dedup**
- bench.py: DEFAULT_MODE + METRICS + MAIN_TESTBEDS collapsed into one
  BACKEND_TABLE (mode / testbed / per-mode metrics).
- bench-main's compile job routes through run.sh's compile backend + bmf —
  the `##benchmark##` line is parsed in exactly one place (run.sh gains
  `flt` in its env table for the FLT matrix cell).
- `Ix.Cli.ConstsFile`: one names-file/comma-list parser (inline-`#`
  comments, dedup) shared by check-rs meta+anon, bench-typecheck, and
  `ix compile --exclude-file` — closes the whole-line-vs-inline comment
  drift across the five previous copies.

**hosts**
- clap `requires = "consts"` dropped from --json/--skip-deps (rejected
  valid --consts-file-only runs); validated in main after collect_consts.
  zisk's multi-`--ixe` guard now covers --consts-file.

**misc**
- Vectors.csv / docs tier semantics corrected (manifest-only consumer);
  stale prove-fallback comments fixed; tracing-texray bumped to bd4faa08;
  push-to-main workflows drop their concurrency groups (every merged
  commit must be benchmarked; a later merge must never cancel one).

Verified: cargo check -p ix-ffi, lake build ix / bench-typecheck, live
runs of the new check-rs mode against a 480 MB Init/Std env (closure =
34 items / 42 targets for Nat.add_comm; --skip-deps = 1; missing names
error loudly), bmf/parse/compare dry-runs, YAML + bash -n + AST checks.
zisk/sp1 host crates compile-verified in CI only (local sandbox lacks
the pil2-proofman C++ toolchain).
---
 .github/scripts/bench.py          | 321 ++++++++++++++++++++++--------
 .github/scripts/run.sh            | 151 ++++++++++----
 .github/workflows/bench-main.yml  |  85 ++++----
 .github/workflows/bench-pr.yml    | 181 +++++++++++++++--
 .github/workflows/ignored.yml     |   6 +-
 .github/workflows/riscv-bench.yml |   6 +-
 Benchmarks/Typecheck.lean         |  75 +++----
 Benchmarks/Vectors.csv            |   7 +-
 Cargo.lock                        | 104 ++--------
 Cargo.toml                        |   2 +-
 Ix/Cli/CheckRsCmd.lean            | 104 +++++++---
 Ix/Cli/CompileCmd.lean            |  19 +-
 Ix/Cli/ConstsFile.lean            |  61 ++++++
 Ix/KernelCheck.lean               |  26 +++
 crates/ffi/src/kernel.rs          | 247 ++++++++++++++++++++++-
 docs/benchmarking.md              |  36 ++--
 sp1/Cargo.lock                    |  88 +-------
 sp1/host/Cargo.toml               |   2 +-
 sp1/host/src/main.rs              |  30 +--
 zisk/Cargo.lock                   |  82 +-------
 zisk/host/Cargo.toml              |   2 +-
 zisk/host/src/main.rs             |  51 +++--
 22 files changed, 1106 insertions(+), 580 deletions(-)
 create mode 100644 Ix/Cli/ConstsFile.lean

diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index 6eee8c59..c3aecdf6 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -3,41 +3,87 @@
 
   parse       COMMENT_BODY env → matrix + config (writes $GITHUB_OUTPUT)
   manifest    Benchmarks/Vectors.csv → the constant names for one cell
+  bmf         neutral results JSON → Bencher Metric Format (bench-main.yml)
   fetch-main  base SHA + cell → main.json pulled from bencher.dev
   compare     main.json + pr.json → a Markdown main-vs-PR table
   comment     per-cell table files → the final PR comment body
 
 The neutral results JSON every backend normalises to (see run.sh) is
-`{ "<name>": { "<metric>": <number>, ... }, ... }`. All metrics are
-lower-is-better, so a positive Δ% is a regression.
+`{ "<name>": { "<metric>": <number>, ... }, ... }`. Most metrics are
+lower-is-better (a positive Δ% is a regression); the exceptions live in
+HIGHER_IS_BETTER (throughput), where the polarity is flipped.
 """
 import argparse
 import glob
 import json
 import os
+import time
 import urllib.parse
 import urllib.request
 
 
-# ───────────────────────── parse ─────────────────────────
-# Default mode per backend. Aiur is the only backend with a real choice:
-# `prove` (default) is the full pipeline; `execute` skips Phase 2
-# (`--execute-only`) and reports the `fft-cost` / `execute-time` subset —
-# users opt in via the bare `execute` token in `!benchmark`. The zkVMs and
-# `ooc` always run execute; `compile` runs `ix compile`.
-DEFAULT_MODE = {
-    "aiur":    "prove",
-    "zisk":    "execute",
-    "sp1":     "execute",
-    "ooc":     "execute",
-    # `compile` benchmarks `ix compile <env>.lean → <env>.ixe` — the same job
-    # `bench-main.yml`'s `compile` matrix uploads under testbed `ix-compile-*`.
-    # Mode is `compile` (there's no execute/prove split); the "benchmark name"
-    # in bencher is the CamelCase env slug (`InitStd`, `Lean`, `Mathlib`, `FLT`).
-    "compile": "compile",
+# ─────────────────── backend identity table ────────────────────
+# Single source of truth for what each backend is:
+#   default_mode — what `!benchmark <backend>` runs. The bare `execute` token
+#     switches to the "execute" metrics entry when one exists (only aiur has
+#     a real choice: `prove` is the full pipeline; `execute` skips Phase 2 via
+#     `--execute-only`).
+#   testbed — the bencher testbed bench-main.yml uploads main's numbers to.
+#     MUST match that workflow's `testbed:` strings; fetch-main fails a cell
+#     loudly (exit 2) when a (backend, mode) has no entry here, so drift shows
+#     up as a red cell instead of a silent local-rebuild fallback.
+#   metrics — compare-table columns per supported mode. aiur's execute entry
+#     reads the SAME testbed as prove (bencher stores only prove runs; the
+#     execute-side columns — incl. execute-peak-rss, sampled at the Phase 1/2
+#     boundary — are extracted from that JSON, apples-to-apples).
+# `compile` benchmarks `ix compile <env>.lean → <env>.ixe`; its benchmark name
+# on bencher is the CamelCase env slug (ENV_CC below).
+BACKEND_TABLE = {
+    "aiur": {
+        "default_mode": "prove",
+        "testbed": "aiur-check-x64-32x",
+        "metrics": {
+            "prove":   ["fft-cost", "execute-time", "prove-time", "peak-rss"],
+            "execute": ["fft-cost", "execute-time", "execute-peak-rss"],
+        },
+    },
+    "zisk": {
+        "default_mode": "execute",
+        "testbed": "zisk-check-x64-32x",
+        "metrics": {
+            "execute": ["cycles", "execute-time", "throughput", "execute-peak-rss"],
+        },
+    },
+    "sp1": {
+        "default_mode": "execute",
+        "testbed": "sp1-check-x64-32x",
+        "metrics": {
+            "execute": ["cycles", "execute-time", "throughput", "execute-peak-rss"],
+        },
+    },
+    "ooc": {
+        "default_mode": "execute",
+        "testbed": "ooc-check-x64-32x",
+        "metrics": {
+            "execute": ["throughput", "check-time", "peak-rss"],
+        },
+    },
+    "compile": {
+        "default_mode": "compile",
+        "testbed": "ix-compile-x64-32x",
+        "metrics": {
+            "compile": ["compile-time", "throughput", "file-size", "constants"],
+        },
+    },
 }
-BACKENDS = tuple(DEFAULT_MODE)
+BACKENDS = tuple(BACKEND_TABLE)
 ENVS = ("initStd", "lean", "mathlib")
+# CamelCase benchmark key per env — must match bench-main.yml's matrix.bench
+# values (the names bencher stores env-keyed rows under: ooc whole-env,
+# compile). One explicit table, not a first-letter-upper derivation, so an
+# env whose CamelCase isn't mechanical (e.g. a future `flt` → `FLT`) can't
+# silently diverge from the workflow.
+ENV_CC = {"initStd": "InitStd", "lean": "Lean", "mathlib": "Mathlib"}
 CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_FULL"}
 PASSTHROUGH_KEYS = {"RUST_LOG", "WITHOUT_VK_VERIFICATION", "RUSTFLAGS"}
 
@@ -82,8 +128,12 @@ def cmd_parse(_a):
     full = "1" if cfg.get("BENCH_FULL") == "1" else "0"  # full set vs primary subset
 
     def mode_for(b):
-        # `execute` only affects aiur (the zkVMs and ooc are execute-only anyway).
-        return "execute" if (b == "aiur" and execute_flag) else DEFAULT_MODE[b]
+        # The bare `execute` token selects a backend's execute entry when it
+        # has one — a real switch only for aiur (everything else already
+        # defaults to execute, or has no execute mode at all: compile).
+        if execute_flag and "execute" in BACKEND_TABLE[b]["metrics"]:
+            return "execute"
+        return BACKEND_TABLE[b]["default_mode"]
 
     cells = []
     for b in backends:
@@ -115,9 +165,8 @@ def cmd_manifest(a):
     # `compile` doesn't consume Vectors.csv — the "benchmark name" on bencher
     # is the CamelCase env slug (`initStd` → `InitStd`), one per cell.
     if a.backend == "compile":
-        name = a.env[:1].upper() + a.env[1:]
         with open(a.out, "w") as f:
-            f.write(name + "\n")
+            f.write(ENV_CC[a.env] + "\n")
         print(f"count=1\ntier=n/a")
         return
     # prove defaults to the cheap tier to keep the full set bounded; the curated
@@ -155,30 +204,17 @@ def cmd_manifest(a):
 
 
 # ───────────────────────── compare ─────────────────────────
-# Compare-column set per (backend, mode). Aiur has both modes: `prove` shows
-# the full execute+prove metric set; `execute` is a subset (Phase 1 only, no
-# prove-time / no throughput). Bencher stores only the prove set for main —
-# `execute` mode filters that same JSON down to the execute-side columns.
-METRICS = {
-    ("aiur",    "prove"):    ["fft-cost", "execute-time", "prove-time", "peak-rss"],
-    ("aiur",    "execute"):  ["fft-cost", "execute-time", "peak-rss"],
-    ("zisk",    "execute"):  ["cycles", "execute-time", "throughput", "peak-rss"],
-    ("sp1",     "execute"):  ["cycles", "execute-time", "throughput", "peak-rss"],
-    ("ooc",     "execute"):  ["throughput", "check-time", "peak-rss"],
-    ("compile", "compile"):  ["compile-time", "throughput", "file-size", "constants"],
-}
-
-
 def _num(d, name, metric):
     v = d.get(name, {}).get(metric)
     return v if isinstance(v, (int, float)) else None
 
 
 # Per-metric formatting kind. Metric names are the neutral-JSON keys the tools
-# emit (see METRICS above). Unknown metrics fall through to `_human_auto`.
+# emit (see BACKEND_TABLE). Unknown metrics fall through to `_human_auto`.
 _METRIC_KIND = {
     # bytes
     "peak-rss": "bytes",
+    "execute-peak-rss": "bytes",
     "file-size": "bytes",
     # seconds
     "execute-time": "seconds",
@@ -246,19 +282,51 @@ def _human(v, metric=None):
     return _human_auto(v)
 
 
+# Metrics where a LARGER value is the improvement. Everything else is
+# lower-is-better (times, RAM, cycles, fft-cost, sizes).
+HIGHER_IS_BETTER = {"throughput"}
+
+
 def _delta(main, pr):
     if main is None or pr is None or main == 0:
         return None
     return (pr - main) / main * 100.0
 
 
-def _ratio(main, pr):
-    """(factor, direction) with `factor` always ≥ 1.0. Metrics are lower-
-    is-better, so `pr > main` reads as slower / larger; `pr < main` as
-    faster / smaller. Returns None if either side is missing or non-positive."""
+def _badness(dp, metric):
+    """Signed regression magnitude: positive ⇒ the PR got worse on `metric`.
+    For lower-is-better metrics that's a positive Δ%; for higher-is-better
+    (throughput) it's a negative Δ%."""
+    if dp is None:
+        return None
+    return -dp if metric in HIGHER_IS_BETTER else dp
+
+
+# Ratio direction words per metric kind (grew, shrank). Rates and times read
+# as faster/slower; sizes as larger/smaller; counts (cycles, fft-cost, …) as
+# more/fewer — "1.15× slower" is meaningless for a byte or count metric.
+_RATIO_WORDS = {
+    "seconds": ("slower", "faster"),
+    "bytes":   ("larger", "smaller"),
+    "count":   ("more", "fewer"),
+    "int":     ("more", "fewer"),
+}
+
+
+def _ratio(main, pr, metric):
+    """(factor, direction word) with `factor` always ≥ 1.0. Wording follows
+    the metric's kind and polarity: throughput (a rate) and the time metrics
+    read as faster/slower, sizes as larger/smaller, counts as more/fewer.
+    Returns None if either side is missing or non-positive."""
     if main is None or pr is None or main <= 0 or pr <= 0:
         return None
-    return (pr / main, "slower") if pr >= main else (main / pr, "faster")
+    grew = pr >= main
+    factor = pr / main if grew else main / pr
+    if metric in HIGHER_IS_BETTER:      # rate: more per second = faster
+        return (factor, "faster" if grew else "slower")
+    kind = _METRIC_KIND.get(metric, "auto")
+    words = _RATIO_WORDS.get(kind, ("larger", "smaller"))
+    return (factor, words[0] if grew else words[1])
 
 
 def _load(path):
@@ -280,7 +348,7 @@ def _load(path):
 
 
 def cmd_compare(a):
-    metrics = a.metric or METRICS.get((a.backend, a.mode))
+    metrics = a.metric or BACKEND_TABLE.get(a.backend, {}).get("metrics", {}).get(a.mode)
     if not metrics:
         raise SystemExit("compare: pass --metric or a known --backend/--mode")
     title = a.title
@@ -301,6 +369,18 @@ def emit(text):
         emit((title or "") + "\n\n_No results were produced (every constant failed, "
              "timed out, or was dropped). See the workflow logs._")
         return
+    # One side empty while the other measured is almost always a broken side
+    # (e.g. the base-run fallback hit a CLI-incompatible base), not a real
+    # all-regressed/all-new comparison — say so instead of a silent n/a column.
+    side_note = ""
+    if not main_d:
+        side_note = ("\n\n_⚠️ main produced no results — the base-side run "
+                     "failed entirely (often a CLI-incompatible base binary "
+                     "when bencher had no data). Deltas unavailable; see the "
+                     "workflow logs._")
+    elif not pr_d:
+        side_note = ("\n\n_⚠️ the PR side produced no results — every "
+                     "constant failed or was dropped. See the workflow logs._")
 
     primary = metrics[0]
     names.sort(key=lambda n: (0, -v) if (v := (_num(pr_d, n, primary)
@@ -316,32 +396,33 @@ def _oom(d, n):
         return isinstance(d.get(n), dict) and d[n].get("oom") is True
 
     regressed, improved = set(), set()
-    worst = None  # (dp, name, metric)
+    worst = None  # (badness, dp, name, metric)
     for n in names:
         cells = [f"`{n}`"]
         main_oom, pr_oom = _oom(main_d, n), _oom(pr_d, n)
         for m in metrics:
             mv, pv = _num(main_d, n, m), _num(pr_d, n, m)
-            mv_h = "OOM" if main_oom else _human(mv, m)
-            pv_h = "OOM" if pr_oom else _human(pv, m)
-            if main_oom or pr_oom:
-                cells += [mv_h, pv_h, "n/a"]
-                continue
+            # An OOM entry may still carry real Phase-1 measurements (run.sh
+            # merges the sentinel into whatever was recorded before the kill);
+            # render those, and OOM only for the metrics the kill prevented.
+            mv_h = "OOM" if (main_oom and mv is None) else _human(mv, m)
+            pv_h = "OOM" if (pr_oom and pv is None) else _human(pv, m)
             dp = _delta(mv, pv)
+            bad = _badness(dp, m)
             cell = "n/a" if dp is None else f"{dp:+.1f}%"
             if dp is not None:
                 # Ratio only when the change is big enough that "1.18× slower"
                 # carries new signal beyond the percentage — sub-5% deltas would
                 # just add `(1.03× slower)` noise to the cell.
-                r = _ratio(mv, pv)
+                r = _ratio(mv, pv, m)
                 if r is not None and r[0] >= 1.05:
                     cell += f" ({r[0]:.2f}× {r[1]})"
-                if dp > a.threshold:
+                if bad > a.threshold:
                     cell += " ⚠️"; regressed.add(n)
-                elif dp < -a.threshold:
+                elif bad < -a.threshold:
                     cell += " 🟢"; improved.add(n)
-                if worst is None or dp > worst[0]:
-                    worst = (dp, n, m)
+                if worst is None or bad > worst[0]:
+                    worst = (bad, dp, n, m)
             cells += [mv_h, pv_h, cell]
         rows.append("| " + " | ".join(cells) + " |")
 
@@ -349,8 +430,10 @@ def _oom(d, n):
     s = (f"_{len(names)} constants · {len(regressed)} regressed · "
          f"{len(improved)} improved (|Δ| > {a.threshold:g}% on any metric)._")
     if worst and worst[0] is not None and worst[0] > a.threshold:
-        s += f" Worst: `{worst[1]}` `{worst[2]}` {worst[0]:+.1f}%."
+        s += f" Worst: `{worst[2]}` `{worst[3]}` {worst[1]:+.1f}%."
     out.append(s)
+    if side_note:
+        out.append(side_note.strip())
     # TODO: emit per-constant phase drill-down (see the TODO by _phase_details).
     emit("\n".join(out))
 
@@ -370,36 +453,63 @@ def cmd_comment(a):
     print(open(a.out).read())
 
 
-# ─────────────────────── fetch-main ──────────────────────
-# Testbeds bench-main.yml uploads to, keyed by (backend, mode). Only the
-# pairs main actually runs land here — anything else (e.g. `aiur execute`,
-# `zisk prove`) has no bencher data; fetch-main exits non-zero for those and
-# bench-pr.yml falls back to running main locally.
-MAIN_TESTBEDS = {
-    # `aiur execute` uses the same testbed as `aiur prove` — bencher stores
-    # only prove and the execute columns are extracted from that JSON.
-    ("aiur",    "prove"):    "aiur-check-x64-32x",
-    ("aiur",    "execute"):  "aiur-check-x64-32x",
-    ("zisk",    "execute"):  "zisk-check-x64-32x",
-    ("sp1",     "execute"):  "sp1-check-x64-32x",
-    ("ooc",     "execute"):  "ooc-check-x64-32x",
-    ("compile", "compile"):  "ix-compile-x64-32x",
-}
+# ──────────────────────── bmf ─────────────────────────
+def cmd_bmf(a):
+    """Neutral results JSON → Bencher Metric Format.
+
+    One converter for every bench-main.yml upload site (previously four
+    hand-copied jq pipelines): flattens each entry's `phases` object into
+    `phase:<span>` measures, strips the boolean `oom` sentinel (BMF values
+    must be numeric — one boolean would fail the whole `bencher run` upload;
+    the sentinel is for the PR comparison table only), and drops entries left
+    with no measures.
+    """
+    with open(a.infile) as f:
+        neutral = json.load(f)
+    out = {}
+    for name, entry in (neutral or {}).items():
+        if not isinstance(entry, dict):
+            continue
+        phases = entry.get("phases")
+        phases = phases if isinstance(phases, dict) else {}
+        measures = {}
+        for k, v in entry.items():
+            if k in ("phases", "oom"):
+                continue
+            if isinstance(v, (int, float)) and not isinstance(v, bool):
+                measures[k] = {"value": v}
+        for span, secs in phases.items():
+            if isinstance(secs, (int, float)) and not isinstance(secs, bool):
+                measures[f"phase:{span}"] = {"value": secs}
+        if measures:
+            out[name] = measures
+    with open(a.out, "w") as f:
+        json.dump(out, f, indent=1)
+    print(f"bmf: {len(out)} benchmark(s) → {a.out}")
 
 
+# ─────────────────────── fetch-main ──────────────────────
 def cmd_fetch_main(a):
     """Pull the base SHA's neutral results JSON from bencher.dev.
 
-    Exits 2 if (backend, mode) isn't a combination main runs. Exits 3 if
-    bencher has no report at that hash yet (freshly-pushed main whose CI is
-    still ingesting) or the request failed. Callers fall back to running
-    main locally on any non-zero exit.
+    The testbed comes from BACKEND_TABLE — supported (backend, mode) pairs are
+    exactly the table's metrics keys. Exit codes are load-bearing for
+    bench-pr.yml: 3 = transient (bencher has no report at that hash yet, or
+    the API failed after retries) — the caller falls back to running main
+    locally; 2 = permanent config error ((backend, mode) not in BACKEND_TABLE,
+    i.e. table / bench-main.yml drift) — the caller fails the cell loudly
+    instead of paying the fallback forever.
     """
-    testbed = MAIN_TESTBEDS.get((a.backend, a.mode))
+    entry = BACKEND_TABLE.get(a.backend)
+    testbed = entry["testbed"] if entry and a.mode in entry["metrics"] else None
     if not testbed:
         print(f"fetch-main: no main testbed for {a.backend}/{a.mode}")
         raise SystemExit(2)
     wanted = set(open(a.names).read().split()) if a.names else None
+    # ooc's headline row is keyed by the CamelCase env slug (not a Vectors.csv
+    # constant), so names.txt alone would filter it out — admit it explicitly.
+    if wanted is not None and a.env:
+        wanted.add(ENV_CC.get(a.env, a.env))
     # TODO: support any base/PR branch, not just `main`. Today bench-main.yml
     # only runs on push to main and this query hardcodes `branch=main`, so a PR
     # against a non-main base branch (e.g. a long-running feature branch) always
@@ -407,19 +517,44 @@ def cmd_fetch_main(a):
     # bench-main.yml (or a sibling) upload reports for other tracked branches,
     # (2) plumb `--branch` here from `github.base_ref` in bench-pr.yml, (3) fall
     # back to `main` when the base branch has no bencher data.
-    params = {"branch": "main", "testbed": testbed, "per_page": 255}
-    url = "https://api.bencher.dev/v0/projects/ix/reports?" + urllib.parse.urlencode(params)
-    try:
-        with urllib.request.urlopen(url, timeout=15) as f:
-            reports = json.load(f)
-    except Exception as e:
-        print(f"fetch-main: bencher API error: {e}")
-        raise SystemExit(3)
     # Bencher stores the git hash at `branch.head.version.hash`.
-    at_sha = [
-        r for r in reports
-        if (((r.get("branch") or {}).get("head") or {}).get("version") or {}).get("hash") == a.sha
-    ]
+    def _report_hash(r):
+        return (((r.get("branch") or {}).get("head") or {}).get("version") or {}).get("hash")
+
+    def _get_json(url, attempts=3):
+        for i in range(attempts):
+            try:
+                with urllib.request.urlopen(url, timeout=15) as f:
+                    return json.load(f)
+            except Exception as e:
+                if i == attempts - 1:
+                    raise
+                print(f"fetch-main: attempt {i + 1} failed ({e}); retrying")
+                time.sleep(2 ** i)
+
+    # Page newest-first until the base SHA's reports are found (a matrix env
+    # uploads one report each, all within one push's CI window, so once we've
+    # matched and a later page yields nothing new we're past it). A transient
+    # API error is retried before the expensive local-base fallback fires.
+    per_page = 255
+    at_sha, page = [], 1
+    while page <= 8:  # 2040 newest reports — far beyond a realistic backlog
+        params = {"branch": "main", "testbed": testbed,
+                  "per_page": per_page, "page": page}
+        url = ("https://api.bencher.dev/v0/projects/ix/reports?"
+               + urllib.parse.urlencode(params))
+        try:
+            reports = _get_json(url)
+        except Exception as e:
+            print(f"fetch-main: bencher API error: {e}")
+            raise SystemExit(3)
+        matches = [r for r in reports if _report_hash(r) == a.sha]
+        if at_sha and not matches:
+            break            # past the SHA's window
+        at_sha += matches
+        if len(reports) < per_page:
+            break            # end of data
+        page += 1
     if not at_sha:
         print(f"fetch-main: no reports for {a.backend}/{a.mode} @ {a.sha[:8]}")
         raise SystemExit(3)
@@ -466,10 +601,20 @@ def main():
                    help="Restrict to the primary subset (the primary=1 column).")
     m.set_defaults(fn=cmd_manifest)
 
+    b = sub.add_parser("bmf")
+    b.add_argument("--in", dest="infile", required=True,
+                   help="Neutral results JSON (run.sh output).")
+    b.add_argument("--out", required=True,
+                   help="Bencher Metric Format JSON for `bencher run`.")
+    b.set_defaults(fn=cmd_bmf)
+
     fm = sub.add_parser("fetch-main")
     fm.add_argument("--sha", required=True)
     fm.add_argument("--backend", required=True)
     fm.add_argument("--mode", required=True)
+    fm.add_argument("--env", default="",
+                    help="Cell env; admits the env-keyed row (ooc whole-env) "
+                         "past the --names filter.")
     fm.add_argument("--names", help="Only fetch benchmarks whose names appear in this file.")
     fm.add_argument("--out", required=True)
     fm.set_defaults(fn=cmd_fetch_main)
diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
index 87f52e2c..0191128b 100644
--- a/.github/scripts/run.sh
+++ b/.github/scripts/run.sh
@@ -43,10 +43,11 @@ merge_phases() {  # <results.json> <spans.jsonl>
 
 # Background RAM watchdog. Every ~3 s, sum RSS across `root_pid` and every
 # descendant (via `ps -eo pid,ppid,rss` + a small BFS); when the total exceeds
-# `max_gb`, touch `marker` and SIGKILL the tree. Callers detect the kill by
-# testing `-f "$marker"` after wait. Idempotent-ish under EPERM: descendants
-# spawned after the last sample are only reaped on the next sweep, but their
-# parent dying takes them out anyway.
+# `max_gb`, touch `marker` and SIGKILL the whole process GROUP (the root is
+# started with `setsid`, so `kill -- -pid` reaches every descendant, not just
+# depth-1 children). Callers detect the kill by testing `-f "$marker"` after
+# wait. The 3 s cadence can lose a fast spike to the kernel OOM killer first —
+# callers treat exit 137 without a marker as OOM too.
 watch_ram_kill() {  # <root_pid> <max_gb> <marker>
   local root_pid=$1 max_gb=$2 marker=$3
   local max_kb=$((max_gb * 1024 * 1024)) total_kb
@@ -65,24 +66,53 @@ watch_ram_kill() {  # <root_pid> <max_gb> <marker>
     if [ -n "$total_kb" ] && [ "$total_kb" -gt "$max_kb" ]; then
       echo "::warning::RAM watchdog: killing pid=$root_pid tree-RSS=${total_kb}kB > ${max_kb}kB (~${max_gb} GB)" >&2
       : > "$marker"
-      kill -KILL "$root_pid" 2>/dev/null || true
-      pkill -KILL -P "$root_pid" 2>/dev/null || true
+      kill -KILL -- "-$root_pid" 2>/dev/null || kill -KILL "$root_pid" 2>/dev/null || true
       return
     fi
     sleep 3
   done
 }
 
+# Merge the OOM sentinel into a constant's results file, PRESERVING any
+# metrics measured before the kill (bench-typecheck persists Phase-1
+# fft-cost/execute-time before the prove starts). bench.py compare renders
+# `OOM` only for the metrics that are absent.
+mark_oom() {  # <results.json> <name>
+  local res="$1" c="$2" merged="$1.o"
+  # Fold `oom: true` into the existing row; any failure along the way (no
+  # results yet, unparseable JSON, failed rename) falls back to a bare sentinel.
+  if ! { [ -s "$res" ] \
+         && jq --arg n "$c" '.[$n] = ((.[$n] // {}) + {oom: true})' "$res" > "$merged" \
+         && mv "$merged" "$res"; }; then
+    jq -n --arg n "$c" '{($n): {oom: true}}' > "$res"
+  fi
+}
+
 # `$benv` is used verbatim for the `.ixe` filename (bench-pr compiles `initStd.ixe`;
 # the bencher jobs reuse the compile job's cached `InitStd.ixe`), and lowercased
-# only to pick the Compile module.
+# only to pick the Compile module. `$benv_cc` is the CamelCase form — the
+# canonical BENCHMARK KEY for env-keyed rows (ooc whole-env, compile), so the
+# PR side (`initStd`) and the bencher side (`InitStd`, from bench-main's
+# matrix.bench) agree on one name.
 case "$(printf '%s' "$benv" | tr '[:upper:]' '[:lower:]')" in
-  initstd) module=CompileInitStd ;;
-  lean)    module=CompileLean ;;
-  mathlib) module=CompileMathlib ;;
+  initstd) module=CompileInitStd; benv_cc=InitStd ;;
+  lean)    module=CompileLean;    benv_cc=Lean ;;
+  mathlib) module=CompileMathlib; benv_cc=Mathlib ;;
+  flt)     module=CompileFLT;     benv_cc=FLT ;;
   *) echo "unknown env: $benv" >&2; exit 2 ;;
 esac
 
+# Tool resolution: prefer the in-tree build (so base measures base's code, PR
+# the PR's), fall back to PATH — CI restores cached binaries onto PATH instead
+# of building in-tree.
+resolve_bin() {  # <name> → prints the path, or fails
+  local tool="$1" built="$repo/.lake/build/bin/$1"
+  # An executable in-tree build wins outright; otherwise defer to a PATH
+  # lookup, whose miss is reported as a workflow error with status 2.
+  [ -x "$built" ] && { printf '%s' "$built"; return; }
+  command -v "$tool" || { echo "::error::$tool not found (in-tree or PATH)" >&2; return 2; }
+}
+ix_bin=$(resolve_bin ix) || exit 2
+
 tmp=$(mktemp -d)
 compile_log="$tmp/compile.log"
 
@@ -92,7 +122,7 @@ if [ "${REUSE_IXE:-0}" = 1 ] && [ "$backend" != compile ] && [ -f "$ixe" ]; then
   echo "reusing existing $ixe (REUSE_IXE)" >&2
 else
   echo "::group::ix compile $module → $benv.ixe ($backend/$mode)"
-  "$repo/.lake/build/bin/ix" compile "$repo/Benchmarks/Compile/$module.lean" \
+  "$ix_bin" compile "$repo/Benchmarks/Compile/$module.lean" \
     --out "$ixe" 2>&1 | tee "$compile_log"
   echo "::endgroup::"
 fi
@@ -101,11 +131,16 @@ case "$backend" in
   aiur)
     # One bench-typecheck per constant (isolation + per-constant peak-rss).
     # Execute mode → Phase 1 only (--execute-only). Prove mode → always attempt
-    # a full prove (no tier gate). A RAM watchdog SIGKILLs the process tree if
-    # its tree-RSS approaches the runner's ceiling; the constant then records
-    # the neutral OOM sentinel `{ "<name>": {"oom": true} }` so bench.py compare
-    # renders `OOM` in that row instead of dropping it.
+    # a full prove (no tier gate), bounded two ways: a RAM watchdog SIGKILLs
+    # the process group when tree-RSS nears the runner's ceiling (the constant
+    # then records the `oom: true` sentinel — merged into any Phase-1 metrics
+    # already measured — so bench.py compare renders `OOM` instead of dropping
+    # the row), and a wall-clock `timeout` bounds a runaway prove. `$out` is
+    # re-merged after every constant so a job-level kill mid-loop still leaves
+    # the completed rows on disk.
     ceiling_gb=${AIUR_PROVE_MAX_RSS_GB:-120}
+    bt_bin=$(resolve_bin bench-typecheck) || exit 2
+    rows="$tmp/rows"; mkdir -p "$rows"
     while IFS= read -r c; do
       [ -z "$c" ] && continue
       slug=$(printf '%s' "$c" | tr '/ .:' '____')
@@ -115,11 +150,15 @@ case "$backend" in
       # with --texray + --json it also writes per-phase aiur/*, stark/* timings
       # to `<json>.spans` for the drill-down.
       if [ "$mode" = execute ]; then
-        bench-typecheck --ixe "$ixe" --consts "$c" --json "$res" --execute-only --texray \
+        timeout "${AIUR_EXECUTE_TIMEOUT:-25m}" \
+          "$bt_bin" --ixe "$ixe" --consts "$c" --json "$res" --execute-only --texray \
           > "$tmp/$slug.log" 2>&1 \
-          || { echo "::warning::aiur execute '$c' failed; dropping" >&2; continue; }
+          || { echo "::warning::aiur execute '$c' failed/timed out; dropping" >&2; continue; }
       else
-        ( bench-typecheck --ixe "$ixe" --consts "$c" --json "$res" --texray ) \
+        # setsid: bench-typecheck leads its own process group so the watchdog's
+        # group-kill reaches every descendant.
+        setsid timeout "${AIUR_PROVE_TIMEOUT:-50m}" \
+          "$bt_bin" --ixe "$ixe" --consts "$c" --json "$res" --texray \
           > "$tmp/$slug.log" 2>&1 &
         bt_pid=$!
         watch_ram_kill "$bt_pid" "$ceiling_gb" "$oom" &
@@ -127,17 +166,22 @@ case "$backend" in
         wait "$bt_pid" 2>/dev/null; bt_exit=$?
         kill "$w_pid" 2>/dev/null || true
         wait "$w_pid" 2>/dev/null || true
-        if [ -f "$oom" ]; then
-          echo "::warning::aiur prove '$c' OOM-killed at ${ceiling_gb} GB" >&2
-          jq -n --arg n "$c" '{($n): {oom: true}}' > "$res"
+        # Exit 137 (SIGKILL) without our marker = the kernel OOM killer beat
+        # the 3 s sampling window — still an OOM, label it as one.
+        if [ -f "$oom" ] || [ "$bt_exit" -eq 137 ]; then
+          echo "::warning::aiur prove '$c' OOM-killed (marker=$([ -f "$oom" ] && echo watchdog || echo kernel), ceiling ${ceiling_gb} GB)" >&2
+          mark_oom "$res" "$c"
         elif [ "$bt_exit" -ne 0 ]; then
           echo "::warning::aiur prove '$c' failed (exit $bt_exit); dropping" >&2
           continue
         fi
       fi
       merge_phases "$res" "$spans"
-      [ -s "$res" ] && cat "$res"
-    done < "$names" | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null
+      if [ -s "$res" ]; then
+        cp "$res" "$rows/$slug.json"  # merge via temp + rename: a kill or jq failure mid-write must not truncate $out
+        jq -s 'reduce .[] as $o ({}; . + $o)' "$rows"/*.json > "$out.part" 2>/dev/null && mv -f "$out.part" "$out" || true
+      fi
+    done < "$names"
     emit_empty
     ;;
 
@@ -163,22 +207,47 @@ case "$backend" in
     # in-session as root; the host children inherit it. Without this the ASM
     # services die with `mmap(rom) errno=11`. SP1 needs no such raise.
     [ "$backend" = zisk ] && sudo prlimit --pid $$ --memlock=unlimited:unlimited
+    ceiling_gb=${ZKVM_EXECUTE_MAX_RSS_GB:-120}
+    rows="$tmp/rows"; mkdir -p "$rows"
     while IFS= read -r c; do
       [ -z "$c" ] && continue
       slug=$(printf '%s' "$c" | tr '/ .:' '____')
-      res="$tmp/$slug.json"; log="$tmp/$slug.log"; spans="$res.spans"
+      res="$tmp/$slug.json"; log="$tmp/$slug.log"; spans="$res.spans"; oom="$tmp/$slug.oom"
+      rm -f "$oom"
       # Full-closure check (no --skip-deps) so this is directly comparable to
-      # the ooc `ix check-rs --consts` run — the delta then isolates the
+      # the ooc `ix check-rs --anon --consts` run — the delta then isolates the
       # in-circuit-vs-out-of-circuit overhead rather than mixing in subject-
-      # only vs full-closure scope.
-      ( cd "$work" && timeout 25m "$bin" --execute --ixe "$ixe" \
+      # only vs full-closure scope. Full closures are RAM-unbounded (the ASM
+      # microservices mmap multi-GB ROMs on top of the guest trace), so the
+      # same watchdog as the aiur prove path guards the runner.
+      # `exec setsid`: the subshell (whose pid is $!) replaces itself with the
+      # session leader, so the watchdog's group-kill (`kill -- -$!`) reaches
+      # the host and every descendant. A plain subshell wrapper would stay in
+      # run.sh's own process group, leaving no group `$!` to signal.
+      ( cd "$work" && exec setsid timeout 25m "$bin" --execute --ixe "$ixe" \
           --consts "$c" --json "$res" --texray ) \
-        > "$log" 2>&1 \
-        || { echo "::warning::$backend execute '$c' failed/timed out; dropping" >&2; continue; }
-      # The host writes $res only on a clean (zero-failure) run.
+        > "$log" 2>&1 &
+      zk_pid=$!
+      watch_ram_kill "$zk_pid" "$ceiling_gb" "$oom" &
+      w_pid=$!
+      wait "$zk_pid" 2>/dev/null; zk_exit=$?
+      kill "$w_pid" 2>/dev/null || true
+      wait "$w_pid" 2>/dev/null || true
+      if [ -f "$oom" ] || [ "$zk_exit" -eq 137 ]; then
+        echo "::warning::$backend execute '$c' OOM-killed (marker=$([ -f "$oom" ] && echo watchdog || echo kernel), ceiling ${ceiling_gb} GB)" >&2
+        mark_oom "$res" "$c"
+      elif [ "$zk_exit" -ne 0 ]; then
+        echo "::warning::$backend execute '$c' failed/timed out (exit $zk_exit); dropping" >&2
+        continue
+      fi
+      # The host writes $res only on a clean (zero-failure) run. `$out` is
+      # re-merged per constant so a job-level kill keeps completed rows.
       merge_phases "$res" "$spans"
-      [ -s "$res" ] && cat "$res"
-    done < "$names" | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null
+      if [ -s "$res" ]; then
+        cp "$res" "$rows/$slug.json"  # merge via temp + rename: a kill or jq failure mid-write must not truncate $out
+        jq -s 'reduce .[] as $o ({}; . + $o)' "$rows"/*.json > "$out.part" 2>/dev/null && mv -f "$out.part" "$out" || true
+      fi
+    done < "$names"
     emit_empty
     ;;
 
@@ -187,14 +256,15 @@ case "$backend" in
     # off the structured line
     #   `##check## <elapsed_ms> <passed> <failures> <total> <peak-rss-bytes>`
     # (peak-rss from ix check's tracing-texray tree sampler): the whole env in
-    # parallel (`--anon`, keyed by env), and a per-primary full-closure check
-    # (`--consts`, keyed by constant) — apples-to-apples with the zkVM execute
-    # above (also full-closure now), so the delta isolates in-circuit vs
-    # out-of-circuit overhead.
+    # parallel (`--anon`, keyed by env), and a per-primary check
+    # (`--anon --consts`, keyed by constant) that runs the constant's FULL
+    # dependency closure in anon mode — the same mode and scope as the zkVM
+    # execute above, so the delta isolates in-circuit vs out-of-circuit
+    # overhead rather than mixing in closure-size or metadata effects.
     ooc_one() {  # <label> <ix-check-args…>  → prints one JSON object
       local label="$1"; shift
       local log="$tmp/n.out"
-      ix check-rs "$ixe" "$@" > "$log" 2>>"$log" \
+      "$ix_bin" check-rs "$ixe" "$@" > "$log" 2>>"$log" \
         || { echo "::warning::ooc '$label' check failed; dropping" >&2; return; }
       local line ems fl tot rss
       line=$(grep '^##check##' "$log" | tail -1)
@@ -210,10 +280,12 @@ case "$backend" in
         '{($n): {constants:$c, "check-time":$s, throughput:$tp, "peak-rss":$rss}}'
     }
     {
-      ooc_one "$benv" --anon
+      # Whole-env row keyed by the CamelCase env slug so the PR side matches
+      # what bench-main.yml uploads to bencher (matrix.bench, e.g. `InitStd`).
+      ooc_one "$benv_cc" --anon
       while IFS= read -r c; do
         [ -z "$c" ] && continue
-        ooc_one "$c" --consts "$c"
+        ooc_one "$c" --anon --consts "$c"
       done < "$names"
     } | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null
     emit_empty
@@ -234,7 +306,6 @@ case "$backend" in
       elapsed_ms=$(echo "$line" | awk '{print $2}')
       bytes=$(echo "$line" | awk '{print $3}')
       constants=$(echo "$line" | awk '{print $4}')
-      benv_cc=$(printf '%s' "$benv" | awk '{print toupper(substr($0,1,1)) substr($0,2)}')
       elapsed_s=$(awk -v e="$elapsed_ms" 'BEGIN{printf "%.3f", e/1000}')
       throughput=$(awk -v c="$constants" -v e="$elapsed_ms" \
         'BEGIN{ if (e>0) printf "%.2f", c*1000/e; else print 0 }')
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index 284d81f8..29b15db3 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -24,6 +24,10 @@ permissions:
   contents: read
   checks: write
 
+# No concurrency group: push-to-main and manual dispatch only — every merged
+# commit gets benchmarked; a later merge must never cancel or queue behind an
+# in-flight run.
+
 env:
   COMPILE_DIR: Benchmarks/Compile
 
@@ -62,7 +66,7 @@ jobs:
       - uses: actions/cache/save@v5
         with:
           path: ~/.local/bin
-          key: aiur-bench-bins-${{ github.sha }}
+          key: bench-bins-${{ github.sha }}
 
   # Compile each library env to a `.ixe` and track compile throughput. Caches
   # the `.ixe` (keyed by sha + matrix job) for the prove job to consume.
@@ -90,7 +94,7 @@ jobs:
       - uses: actions/cache/restore@v5
         with:
           path: ~/.local/bin
-          key: aiur-bench-bins-${{ github.sha }}
+          key: bench-bins-${{ github.sha }}
       - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
       # FC's library env lives in a sibling `${COMPILE_DIR}FC` package dir, so
       # point COMPILE_DIR there for the FC matrix job.
@@ -115,35 +119,24 @@ jobs:
       # warning that must not fail the build.
       - run: lake build Compile${{ matrix.bench }}
         working-directory: ${{ env.COMPILE_DIR }}
-      # Serialize the env to a `.ixe` and emit the `##benchmark##` line.
-      - name: Run ix compile
+      # Serialize the env to a `.ixe` and measure the compile via run.sh's
+      # `compile` backend — the same driver + `##benchmark##` parser the
+      # !benchmark PR path uses, so the line format is parsed in exactly one
+      # place. run.sh writes `<Bench>.ixe` at the workspace root (the env arg
+      # is used verbatim for the filename) and neutral.json keyed by the
+      # CamelCase env slug; bmf wraps it for bencher.
+      - name: Run ix compile benchmark
         run: |
-          ix compile ${{ env.COMPILE_DIR }}/Compile${{ matrix.bench }}.lean \
-            --out ${{ matrix.bench }}.ixe 2>&1 | tee output.txt
+          bash .github/scripts/run.sh . ${{ matrix.bench }} compile compile /dev/null neutral.json
+          python3 .github/scripts/bench.py bmf --in neutral.json --out benchmark.json
+          cat benchmark.json
       # Cache the `.ixe` for the prove job (reused, never recompiled there).
       # Only the matrix jobs the prove job consumes, to stay under the repo cache limit.
       - if: matrix.bench == 'InitStd' || matrix.bench == 'Mathlib'
         uses: actions/cache/save@v5
         with:
           path: ${{ matrix.bench }}.ixe
-          key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
-      - name: Generate compile benchmark JSON
-        run: |
-          line=$(grep '^##benchmark##' output.txt)
-          elapsed_s=$(echo "$line" | awk '{printf "%.3f", $2 / 1000}')
-          bytes=$(echo "$line" | awk '{print $3}')
-          constants=$(echo "$line" | awk '{print $4}')
-          throughput=$(echo "$line" | awk '{if ($2 > 0) printf "%.2f", $4 * 1000 / $2; else print 0}')
-          cat > benchmark.json <<EOF
-          {
-            "${{ matrix.bench }}": {
-              "compile-time": {"value": ${elapsed_s}},
-              "file-size": {"value": ${bytes}},
-              "throughput": {"value": ${throughput}},
-              "constants": {"value": ${constants}}
-            }
-          }
-          EOF
+          key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
       # Upload compile metrics. Every measure shares the per-workload baseline
       # window (data points since the ix-compile reset tag): file-size/constants
       # are deterministic, pinned exactly (0/0); compile-time rides a 5% upper
@@ -187,7 +180,7 @@ jobs:
       - uses: actions/cache/restore@v5
         with:
           path: ~/.local/bin
-          key: aiur-bench-bins-${{ github.sha }}
+          key: bench-bins-${{ github.sha }}
       - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
       # Provision the toolchain so the bench-typecheck binary finds libleanshared
       # (no package build). use-github-cache off: nothing to cache here, and
@@ -201,7 +194,7 @@ jobs:
       - uses: actions/cache/restore@v5
         with:
           path: ${{ matrix.bench }}.ixe
-          key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
+          key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
           fail-on-cache-miss: true
       # Run each constant in its own process so a clean failure or timeout drops
       # only that constant from the report. NB: a constant heavy enough to OOM
@@ -217,18 +210,18 @@ jobs:
         env:
           REUSE_IXE: "1"
         run: |
-          # All primaries: run.sh proves each cheap-tier primary and execute-only's
-          # the heavy ones (so heavy primaries still report execute metrics).
+          # All primaries: run.sh attempts a full prove of every primary under
+          # the RAM watchdog; a too-large prove records the `oom: true` sentinel
+          # (alongside any Phase-1 execute metrics measured before the kill).
           # Per-constant peak-rss, same path as the !benchmark PR run.
           benv="${{ matrix.bench }}"; benv="${benv,}"   # InitStd→initStd, Mathlib→mathlib
           python3 .github/scripts/bench.py manifest \
             --csv Benchmarks/Vectors.csv --env "$benv" --mode prove --primary --out names.txt
           echo "proving $(wc -l < names.txt) primary constants:"; cat names.txt
           bash .github/scripts/run.sh . "${{ matrix.bench }}" aiur prove names.txt neutral.json
-          # Wrap neutral { name: { metric: v } } → Bencher Metric Format,
-          # flattening the per-phase `phases` object into `phase:<span>` measures
-          # so span timings are tracked over time alongside the headline metrics.
-          jq 'map_values((.phases // {}) as $p | del(.phases) | (. + ($p | with_entries(.key |= "phase:" + .))) | map_values({value: .}))' neutral.json > aiur.json
+          # neutral → Bencher Metric Format (phases flattened to phase:<span>
+          # measures; the boolean `oom` sentinel stripped — see bench.py bmf).
+          python3 .github/scripts/bench.py bmf --in neutral.json --out aiur.json
           cat aiur.json
       # Upload Aiur metrics. Every measure shares the per-workload baseline
       # window (data points since the aiur-check reset tag). constants is deterministic
@@ -264,12 +257,15 @@ jobs:
             --threshold-measure peak-rss --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
             --threshold-lower-boundary _
+            --threshold-measure execute-peak-rss --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
             --threshold-measure throughput --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
             --threshold-lower-boundary 0.10
 
   # Execute the same constants through the Zisk and SP1 zkVM hosts and track
-  # cycles / execute-time / throughput / peak-rss (and shards / max-shard-cycles
+  # cycles / execute-time / throughput / execute-peak-rss (and shards / max-shard-cycles
   # for any sharded run). Lean-free: reuses the compile job's cached `.ixe` and
   # only builds the Rust host. zkVM proving needs a GPU (absent here), so this is
   # execute-only. Toolchain + deps come from the shared install-{zisk,sp1} actions.
@@ -300,7 +296,7 @@ jobs:
       - uses: actions/cache/restore@v5
         with:
           path: ${{ matrix.bench }}.ixe
-          key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
+          key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
           fail-on-cache-miss: true
       - name: Run ${{ matrix.backend }} execute benchmark
         env:
@@ -315,10 +311,10 @@ jobs:
           echo "executing $(wc -l < names.txt) primary constants:"; cat names.txt
           bash .github/scripts/run.sh . ${{ matrix.bench }} ${{ matrix.backend }} execute \
             names.txt neutral.json
-          # Wrap neutral { name: { metric: v } } → Bencher Metric Format,
-          # flattening the per-phase `phases` object into `phase:<span>` measures
-          # so span timings are tracked over time alongside the headline metrics.
-          jq 'map_values((.phases // {}) as $p | del(.phases) | (. + ($p | with_entries(.key |= "phase:" + .))) | map_values({value: .}))' neutral.json > bench.json
+          # neutral → Bencher Metric Format (phases flattened; the boolean
+          # `oom` sentinel a watchdog-killed execute records is stripped —
+          # see bench.py bmf).
+          python3 .github/scripts/bench.py bmf --in neutral.json --out bench.json
           cat bench.json
       # cycles / shards / max-shard-cycles are deterministic per guest ELF, but
       # a real guest / packer improvement legitimately drops them — upper-only
@@ -345,7 +341,7 @@ jobs:
             --threshold-measure execute-time --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
             --threshold-lower-boundary _
-            --threshold-measure peak-rss --threshold-test percentage
+            --threshold-measure execute-peak-rss --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
             --threshold-lower-boundary _
             --threshold-measure throughput --threshold-test percentage
@@ -373,7 +369,7 @@ jobs:
       - uses: actions/cache/restore@v5
         with:
           path: ~/.local/bin
-          key: aiur-bench-bins-${{ github.sha }}
+          key: bench-bins-${{ github.sha }}
       - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
       # Provision the toolchain so `ix` finds libleanshared (no package build).
       - uses: leanprover/lean-action@v1
@@ -388,7 +384,7 @@ jobs:
       - uses: actions/cache/restore@v5
         with:
           path: ${{ matrix.bench }}.ixe
-          key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
+          key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
           fail-on-cache-miss: true
       # run.sh ooc runs `ix check --anon` (whole env, parallel) and emits the
       # neutral { <env>: { constants, check-time, throughput, peak-rss } } — same
@@ -397,15 +393,14 @@ jobs:
         env:
           REUSE_IXE: "1"
         run: |
-          # Whole env (keyed by env) + the primary constants subject-checked
+          # Whole env (keyed by env) + the primary constants checked full-closure
           # (keyed by constant) for an apples-to-apples baseline next to zisk/sp1.
           benv="${{ matrix.bench }}"; benv="${benv,}"
           python3 .github/scripts/bench.py manifest \
             --csv Benchmarks/Vectors.csv --env "$benv" --mode execute --primary --out names.txt
           bash .github/scripts/run.sh . "${{ matrix.bench }}" ooc execute names.txt neutral.json
-          # Wrap → Bencher Metric Format, flattening `phases` into `phase:<span>`
-          # measures (a no-op for ooc, which records no spans).
-          jq 'map_values((.phases // {}) as $p | del(.phases) | (. + ($p | with_entries(.key |= "phase:" + .))) | map_values({value: .}))' neutral.json > bench.json
+          # neutral → Bencher Metric Format (see bench.py bmf).
+          python3 .github/scripts/bench.py bmf --in neutral.json --out bench.json
           cat bench.json
       # constants is deterministic → pinned (0/0); check-time / throughput /
       # peak-rss are noisy parallel wall-clock → percentage bounds.
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index 07340d88..e8908086 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -76,8 +76,56 @@ jobs:
           COMMENT_BODY: ${{ github.event.comment.body }}
         run: python3 .github/scripts/bench.py parse
 
-  benchmark:
+  # Build the PR's `ix` + `bench-typecheck` once (they embed the IxVM kernel
+  # and the Aiur prover), stage under ~/.local/bin, and cache by head SHA —
+  # the matrix cells restore instead of re-running the full Lean build per
+  # cell, and re-running !benchmark on the same commit skips the build
+  # entirely. ubuntu-latest mirrors bench-main.yml's build job so PR binaries
+  # carry the same instruction-set provenance as the binaries behind bencher's
+  # main-side numbers.
+  build:
     needs: setup
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          ref: ${{ needs.setup.outputs.head-sha }}
+      - id: bins
+        uses: actions/cache/restore@v5
+        with:
+          path: ~/.local/bin
+          key: bench-bins-${{ needs.setup.outputs.head-sha }}
+      - if: steps.bins.outputs.cache-hit != 'true'
+        uses: actions-rust-lang/setup-rust-toolchain@v1
+      # `.cargo/config.toml` sets `-Ctarget-cpu=native`; log the build CPU so a
+      # benchmark shift can be traced to an instruction-set change.
+      - name: Log build CPU
+        if: steps.bins.outputs.cache-hit != 'true'
+        run: |
+          lscpu
+          grep -qw avx512f /proc/cpuinfo \
+            && echo "AVX-512F: present (compiled into the binary)" \
+            || echo "AVX-512F: absent"
+      - if: steps.bins.outputs.cache-hit != 'true'
+        uses: leanprover/lean-action@v1
+        with:
+          auto-config: false
+          build: true
+          build-args: "ix bench-typecheck"
+          use-github-cache: false
+      - if: steps.bins.outputs.cache-hit != 'true'
+        run: |
+          mkdir -p ~/.local/bin
+          cp .lake/build/bin/ix .lake/build/bin/bench-typecheck ~/.local/bin/
+          chmod +x ~/.local/bin/ix ~/.local/bin/bench-typecheck
+      - if: steps.bins.outputs.cache-hit != 'true'
+        uses: actions/cache/save@v5
+        with:
+          path: ~/.local/bin
+          key: bench-bins-${{ needs.setup.outputs.head-sha }}
+
+  benchmark:
+    needs: [setup, build]
     runs-on: ${{ matrix.cell.runner }}
     timeout-minutes: 120
     strategy:
@@ -122,18 +170,39 @@ jobs:
             --tier "$TIER" --shard "$SHARD" $PRIMARY --out "$GITHUB_WORKSPACE/names.txt" \
             | tee -a "$GITHUB_OUTPUT"
 
-      # Lean toolchain + build `ix` and `bench-typecheck` for the PR side (every
-      # backend needs `ix` to compile the env to a `.ixe`). Mathlib cache only
-      # pulled for the mathlib env.
-      - name: Build PR (ix, bench-typecheck)
+      # Restore the once-built PR binaries (see the build job) and stage them
+      # into .bins/pr — ~/.local/bin is reused below for the base side's cache
+      # restore, so each side keeps its own directory and PATH. run.sh resolves
+      # tools in-tree first, then PATH, so the staged dir Just Works.
+      - name: Restore PR binaries
+        uses: actions/cache/restore@v5
+        with:
+          path: ~/.local/bin
+          key: bench-bins-${{ env.HEAD_SHA }}
+          fail-on-cache-miss: true
+      - run: |
+          mkdir -p .bins/pr
+          mv ~/.local/bin/ix ~/.local/bin/bench-typecheck .bins/pr/
+      # Toolchain provisioning only (no package build): the restored binaries
+      # link against the toolchain's libleanshared. Mathlib olean cache only
+      # for the mathlib env (its `ix compile` needs the oleans).
+      - name: Provision Lean toolchain
         uses: leanprover/lean-action@v1
         with:
           lake-package-directory: .
           auto-config: false
-          build: true
-          build-args: "ix bench-typecheck"
+          build: false
           use-github-cache: false
           use-mathlib-cache: ${{ matrix.cell.env == 'mathlib' && 'true' || 'false' }}
+      # The compiled env is identical across every cell of the same PR commit —
+      # cache it so only the first cell pays the `ix compile`. (The `compile`
+      # backend ignores REUSE_IXE by design: it measures the compile itself.)
+      - name: Restore PR .ixe
+        id: pr-ixe
+        uses: actions/cache/restore@v5
+        with:
+          path: ${{ matrix.cell.env }}.ixe
+          key: bench-pr-ixe-${{ env.HEAD_SHA }}-${{ matrix.cell.env }}
       # zkVM cells additionally need the Rust toolchain + the backend's toolchain
       # and system deps (the shared composite install actions).
       - name: Set up zkVM Rust toolchain
@@ -150,25 +219,78 @@ jobs:
 
       # ---------- main side ----------
       # Try bencher.dev first (bench-main.yml has uploaded main's numbers).
-      # continue-on-error so bench.py's exit 2 (unsupported backend/mode) or 3
-      # (no data yet / API error) doesn't fail the job — we fall back to a
-      # local base run below.
+      # fetch-main's exit codes are load-bearing: 3 = transient (base SHA not
+      # ingested yet) → fall back to a local base run; anything else (2 =
+      # backend/mode has no main testbed — a BACKEND_TABLE / bench-main.yml
+      # drift) is a permanent misconfiguration that a local rebuild can never
+      # fix, so fail the cell loudly instead of silently paying the fallback
+      # on every future run.
       - name: Fetch main from bencher
         id: bencher
-        continue-on-error: true
         run: |
+          set +e
           python3 .github/scripts/bench.py fetch-main \
-            --sha "$BASE_SHA" --backend "$BACKEND" --mode "$MODE" \
+            --sha "$BASE_SHA" --backend "$BACKEND" --mode "$MODE" --env "$BENV" \
             --names "$GITHUB_WORKSPACE/names.txt" \
             --out "$GITHUB_WORKSPACE/main.json"
+          rc=$?
+          set -e
+          case $rc in
+            0) echo "source=bencher" >> "$GITHUB_OUTPUT" ;;
+            3) echo "source=ran" >> "$GITHUB_OUTPUT" ;;
+            *) echo "::error::fetch-main: permanent config error (exit $rc) — check BACKEND_TABLE in bench.py vs bench-main.yml testbeds"; exit "$rc" ;;
+          esac
       - name: Checkout base (bencher had no data)
-        if: steps.bencher.outcome != 'success'
+        if: steps.bencher.outputs.source == 'ran'
         uses: actions/checkout@v6
         with:
           ref: ${{ env.BASE_SHA }}
           path: base
+      # bench-main.yml's build/compile jobs already cached the base SHA's
+      # binaries and `.ixe` on the push to main — restore both before paying
+      # for a from-scratch base build. The `.ixe` cache key/path use the
+      # CamelCase env slug (bench-main's matrix.bench).
+      - name: Compute env slug
+        if: steps.bencher.outputs.source == 'ran'
+        id: envcc
+        run: |
+          case "${BENV,,}" in flt) cc=FLT ;; *) cc="${BENV^}" ;; esac  # run.sh's canonical slug for flt is FLT, not Flt
+          echo "cc=$cc" >> "$GITHUB_OUTPUT"
+      - name: Restore base binaries (bench-main build cache)
+        if: steps.bencher.outputs.source == 'ran'
+        id: base-bins
+        uses: actions/cache/restore@v5
+        with:
+          path: ~/.local/bin
+          key: bench-bins-${{ env.BASE_SHA }}
+      - name: Restore base .ixe (bench-main compile cache)
+        if: steps.bencher.outputs.source == 'ran'
+        id: base-ixe
+        uses: actions/cache/restore@v5
+        with:
+          path: ${{ steps.envcc.outputs.cc }}.ixe
+          key: bench-ixe-${{ env.BASE_SHA }}-${{ steps.envcc.outputs.cc }}
+      # Cached base binaries are usable only when the two `lean-toolchain`
+      # files are identical (plain `cmp`), and — for the mathlib env — only
+      # when the `.ixe` was also restored (otherwise base's `ix compile` needs
+      # mathlib oleans that only the full build fetches).
+      - name: Resolve base binaries
+        if: steps.bencher.outputs.source == 'ran'
+        id: base-src
+        run: |
+          cached=false
+          if [ "${{ steps.base-bins.outputs.cache-hit }}" = true ] \
+             && cmp -s lean-toolchain base/lean-toolchain; then
+            if [ "$BENV" != mathlib ] || [ "${{ steps.base-ixe.outputs.cache-hit }}" = true ]; then
+              mkdir -p .bins/base
+              mv ~/.local/bin/ix ~/.local/bin/bench-typecheck .bins/base/ 2>/dev/null || true
+              [ -x .bins/base/ix ] && [ -x .bins/base/bench-typecheck ] && cached=true
+            fi
+          fi
+          echo "cached=$cached" >> "$GITHUB_OUTPUT"
+          echo "base binaries: $([ "$cached" = true ] && echo restored from bench-main cache || echo building from source)"
       - name: Build base (ix, bench-typecheck)
-        if: steps.bencher.outcome != 'success'
+        if: steps.bencher.outputs.source == 'ran' && steps.base-src.outputs.cached != 'true'
         uses: leanprover/lean-action@v1
         with:
           lake-package-directory: base
@@ -177,30 +299,51 @@ jobs:
           build-args: "ix bench-typecheck"
           use-github-cache: false
           use-mathlib-cache: ${{ matrix.cell.env == 'mathlib' && 'true' || 'false' }}
+      # NOTE: this runs the PR's run.sh against the base binaries (built from
+      # source or restored from the bench-main cache). When a PR changes the
+      # benchmark CLIs themselves, the base binaries reject the new flags and every
+      # constant drops — compare then renders the loud "main produced no results" note instead of a silent all-n/a table.
       - name: Run backend on base → main.json
-        if: steps.bencher.outcome != 'success'
+        if: steps.bencher.outputs.source == 'ran'
         run: |
-          export PATH="$PWD/base/.lake/build/bin:$PATH"
+          if [ "${{ steps.base-ixe.outputs.cache-hit }}" = true ]; then
+            mv "${{ steps.envcc.outputs.cc }}.ixe" "base/$BENV.ixe"
+            export REUSE_IXE=1
+          fi
+          if [ "${{ steps.base-src.outputs.cached }}" = true ]; then
+            export PATH="$PWD/.bins/base:$PATH"
+          else
+            export PATH="$PWD/base/.lake/build/bin:$PATH"
+          fi
           bash .github/scripts/run.sh base "$BENV" "$BACKEND" "$MODE" \
             "$GITHUB_WORKSPACE/names.txt" "$GITHUB_WORKSPACE/main.json"
 
       # ---------- PR side ----------
       - name: Run backend on PR → pr.json
+        env:
+          REUSE_IXE: ${{ steps.pr-ixe.outputs.cache-hit == 'true' && '1' || '0' }}
         run: |
-          export PATH="$PWD/.lake/build/bin:$PATH"
+          export PATH="$PWD/.bins/pr:$PATH"
           bash .github/scripts/run.sh . "$BENV" "$BACKEND" "$MODE" \
             "$GITHUB_WORKSPACE/names.txt" "$GITHUB_WORKSPACE/pr.json"
+      # First cell to compile the PR env publishes it for the others (racing
+      # saves are fine — the first wins, the rest fail gracefully).
+      - name: Save PR .ixe
+        if: steps.pr-ixe.outputs.cache-hit != 'true'
+        uses: actions/cache/save@v5
+        with:
+          path: ${{ matrix.cell.env }}.ixe
+          key: bench-pr-ixe-${{ env.HEAD_SHA }}-${{ matrix.cell.env }}
 
       # ---------- compare ----------
       - name: Build comparison table
         run: |
-          src=ran; [ "${{ steps.bencher.outcome }}" = success ] && src=bencher
           mkdir -p out
           python3 .github/scripts/bench.py compare \
             --main main.json --pr pr.json --out "out/table-$LABEL.md" \
             --backend "$BACKEND" --env "$BENV" --mode "$MODE" \
             --count "${{ steps.man.outputs.count }}" \
-            --main-source "$src"
+            --main-source "${{ steps.bencher.outputs.source }}"
           cat "out/table-$LABEL.md"
 
       - name: Upload table
diff --git a/.github/workflows/ignored.yml b/.github/workflows/ignored.yml
index 5e92186a..99751746 100644
--- a/.github/workflows/ignored.yml
+++ b/.github/workflows/ignored.yml
@@ -8,9 +8,9 @@ on:
 permissions:
   contents: read
 
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
+# No concurrency group: push-to-main and manual dispatch only — every merged
+# commit runs the extended tests; a later merge must never cancel an in-flight
+# run.
 
 jobs:
   ignored-test:
diff --git a/.github/workflows/riscv-bench.yml b/.github/workflows/riscv-bench.yml
index 2ac4db24..fd2380a7 100644
--- a/.github/workflows/riscv-bench.yml
+++ b/.github/workflows/riscv-bench.yml
@@ -12,9 +12,9 @@ on:
 permissions:
   contents: read
 
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
+# No concurrency group: this workflow only runs on push-to-main and manual
+# dispatch, and every merged commit should be benchmarked — a later merge must
+# never cancel or queue behind an in-flight run.
 
 jobs:
   # Compile a tiny env once (ix is already built here) and hand it to the zkVM
diff --git a/Benchmarks/Typecheck.lean b/Benchmarks/Typecheck.lean
index d36ab53d..d9bd11ca 100644
--- a/Benchmarks/Typecheck.lean
+++ b/Benchmarks/Typecheck.lean
@@ -5,6 +5,7 @@ import Ix.Aiur.Compiler
 import Ix.Aiur.Statistics
 import Ix.TracingTexray
 import Ix.Benchmark.Bench
+import Ix.Cli.ConstsFile
 import Ix.Cli.NameResolve
 
 /-!
@@ -65,8 +66,11 @@ warning, so a single bad name never fails the run. The harness imposes no time
 limit; bound a run with an external `timeout` if needed.
 
 The JSON is a neutral, flat shape (`{ "<name>": { "constants": …, "fft-cost": …,
-"execute-time": …, "prove-time": …, "throughput": … } }`, where `prove-time` and
-`throughput` appear only for proven constants); any bencher-specific reshaping
+"execute-time": …, "execute-peak-rss": …, "prove-time": …, "peak-rss": …,
+"throughput": … } }`). `execute-peak-rss` is the Phase-1 RSS high-water,
+sampled before proving starts, so it is comparable across execute-only and
+prove runs; `prove-time`, `peak-rss` (the prover's high-water), and
+`throughput` appear only for proven constants. Any bencher-specific reshaping
 is the caller's job (see `.github/workflows/bench-main.yml`).
 -/
 
@@ -85,15 +89,6 @@ def friParameters : Aiur.FriParameters := {
   queryProofOfWorkBits := 0
 }
 
-/-- `--consts-file` lines as raw strings: one name per line. Everything from a
-    `#` to end of line is a comment (whole-line or inline); blank lines are
-    dropped. `#` never appears in a Lean name, so splitting on it is safe.
-    Resolution against the env happens later (so the `toString` fallback can
-    see the displayed form the user wrote). -/
-def parseConstsFile (contents : String) : Array String :=
-  (contents.splitOn "\n").filterMap (fun line =>
-    let s := ((line.splitOn "#").head?.getD "").trimAscii
-    if s.isEmpty then none else some s.toString) |>.toArray
 
 
 /-- Per-constant measurements. `proveSec` is `none` when the constant was
@@ -107,28 +102,43 @@ structure Result where
   /-- Peak resident-set size in bytes (tracing-texray tree sampler), captured
       after the constant's heaviest phase. -/
   peakRss : Option Nat := none
+  /-- Phase-1 (execute) RSS high-water mark, sampled before any proving
+      allocations. Present in BOTH modes so an execute-only run compares
+      apples-to-apples against a prove run's baseline — `peak-rss` in a prove
+      run is dominated by the prover and would dwarf an execute-only peak. -/
+  executePeakRss : Option Nat := none
   deriving Inhabited
 
-/-- Round a Float to `d` decimal places, to keep the emitted JSON readable. -/
-def roundTo (d : Nat) (f : Float) : Float :=
+/-- A `Json` number with at most `d` decimal places, rendered decimally.
+    `Float`'s own `ToJson` prints the full decimal expansion of the binary
+    value (`0.02602000000000000146…`), so build the `JsonNumber` (mantissa ·
+    10⁻ᵈ) directly from the rounded value instead. -/
+def jsonRound (d : Nat) (f : Float) : Json :=
   let scale := (10.0 : Float) ^ d.toFloat
-  (f * scale).round / scale
+  let scaled := f * scale
+  let m : Int :=
+    if scaled < 0 then -Int.ofNat (-scaled).round.toUInt64.toNat
+    else Int.ofNat scaled.round.toUInt64.toNat
+  Json.num ⟨m, d⟩
 
 /-- Neutral, flat results object: `name → { constants, fft-cost, execute-time,
-    prove-time?, throughput? }`. No bencher-specific shaping. -/
+    peak-rss?, execute-peak-rss?, prove-time?, throughput? }`. No bencher-specific shaping. -/
 def Result.toJsonEntry (r : Result) : String × Json :=
   let base : List (String × Json) :=
     [ ("constants", Lean.toJson r.constants)
-    , ("fft-cost", Lean.toJson (roundTo 0 r.fftCost))
-    , ("execute-time", Lean.toJson (roundTo 6 r.executeSec)) ]
+    , ("fft-cost", jsonRound 0 r.fftCost)
+    , ("execute-time", jsonRound 6 r.executeSec) ]
   let base := match r.peakRss with
     | some n => base ++ [ ("peak-rss", Lean.toJson n) ]
     | none => base
+  let base := match r.executePeakRss with
+    | some n => base ++ [ ("execute-peak-rss", Lean.toJson n) ]
+    | none => base
   -- prove-time and the derived proving throughput (constants/prove-time, the
   -- proving analog of compile's constants/sec) are present only once proven.
   let fields := match r.proveSec with
-    | some p => base ++ [ ("prove-time", Lean.toJson (roundTo 6 p))
-                        , ("throughput", Lean.toJson (roundTo 2 (r.constants.toFloat / p))) ]
+    | some p => base ++ [ ("prove-time", jsonRound 6 p)
+                        , ("throughput", jsonRound 2 (r.constants.toFloat / p)) ]
     | none => base
   (r.name, Json.mkObj fields)
 
@@ -144,22 +154,11 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
   let some ixeArg := p.flag? "ixe"
     | IO.eprintln "error: --ixe <path> is required"; return 1
   let ixePath := ixeArg.as! String
-  -- Names come from `--consts` (comma-list) and/or a `--consts-file` file.
-  let cliNames : Array String := match p.flag? "consts" with
-    | some f => ((f.as! String).splitOn ",").toArray.filterMap (fun s =>
-        let t := s.trim
-        if t.isEmpty then none else some t)
-    | none => #[]
-  let fileNames ← match p.flag? "consts-file" with
-    | some f => pure (parseConstsFile (← IO.FS.readFile (f.as! String)))
-    | none => pure #[]
-  -- Union, preserving first-seen order, so the same const isn't proven twice.
-  let nameArgs := Id.run do
-    let mut seen : Std.HashSet String := {}
-    let mut acc : Array String := #[]
-    for n in cliNames ++ fileNames do
-      if !seen.contains n then seen := seen.insert n; acc := acc.push n
-    return acc
+  -- `--consts` comma-list ∪ `--consts-file`, shared grammar + dedup
+  -- (Ix.Cli.ConstsFile — same parser as `ix check-rs`). Raw strings:
+  -- resolution against the env happens later (so the `toString` fallback can
+  -- see the displayed form the user wrote).
+  let nameArgs ← Ix.Cli.ConstsFile.gather p
   if nameArgs.isEmpty then
     IO.eprintln "error: provide at least one constant via --consts <n1,n2,…> and/or --consts-file <path>"
     return 1
@@ -250,11 +249,15 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
       IO.FS.writeFile path (Json.mkObj (results.map Result.toJsonEntry).toList).pretty
     | none => pure ()
 
+  -- Phase-1 RSS high-water, sampled BEFORE any proving allocations so the
+  -- measure is comparable between execute-only and prove runs (`peak-rss`
+  -- from a prove run is the prover's high-water and would dwarf it).
+  let executePeak ← TracingTexray.peakTreeRssBytes
+  execed := execed.map (fun (r, a) => ({ r with executePeakRss := some executePeak }, a))
+
   -- `--execute-only`: stop after Phase 1; the results JSON (if requested) is
   -- already complete with the execute metrics.
   if executeOnly then
-    let peak ← TracingTexray.peakTreeRssBytes
-    execed := execed.map (fun (r, a) => ({ r with peakRss := some peak }, a))
     writeJson (execed.map (·.1))
     match jsonOut with
     | some path => IO.println s!"wrote {execed.size} execute-only benchmarks to {path}"
diff --git a/Benchmarks/Vectors.csv b/Benchmarks/Vectors.csv
index 35057169..e1b2bb45 100644
--- a/Benchmarks/Vectors.csv
+++ b/Benchmarks/Vectors.csv
@@ -8,8 +8,11 @@
 # carry the first three):
 #   name         fully-qualified Lean name (resolves via NameResolve.resolveIxeAddr).
 #   env          compile target / .ixe it resolves in: initStd | lean | mathlib.
-#   tier         cheap = prove-feasible per-PR; heavy = execute-only / sharded.
-#                run.sh's prove-loop reads this to gate prove vs execute-only.
+#   tier         cheap = prove-feasible on a CI runner; heavy = a single-shard
+#                prove exceeds the RAM watchdog ceiling (expect an OOM row).
+#                Consumed by bench.py manifest only (BENCH_TIER filter; the
+#                non-primary prove set defaults to cheap) — run.sh attempts a
+#                full prove of every selected constant regardless of tier.
 #   shard_target 1 = heavy constant designated as a multi-shard prove target.
 #   primary      1 = part of the primary subset spanning shape + the cheap->heavy
 #                cost range. Default for the !benchmark PR comment and the
diff --git a/Cargo.lock b/Cargo.lock
index a8af9cc1..2c5bc0f2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -831,7 +831,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.61.2",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -2335,7 +2335,7 @@ version = "0.50.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
 dependencies = [
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -3085,7 +3085,7 @@ dependencies = [
  "once_cell",
  "socket2",
  "tracing",
- "windows-sys 0.60.2",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -3368,7 +3368,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b55fb86dfd3a2f5f76ea78310a88f96c4ea21a3031f8d212443d56123fd0521"
 dependencies = [
  "libc",
- "windows-sys 0.61.2",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -4022,7 +4022,7 @@ dependencies = [
 [[package]]
 name = "tracing-texray"
 version = "0.2.0"
-source = "git+https://github.com/argumentcomputer/tracing-texray?rev=15ae57cfbcb234ff5911a306898e7d93d396d648#15ae57cfbcb234ff5911a306898e7d93d396d648"
+source = "git+https://github.com/argumentcomputer/tracing-texray?rev=bd4faa08a4fa4edb46bde393b4d20c6bd49591d0#bd4faa08a4fa4edb46bde393b4d20c6bd49591d0"
 dependencies = [
  "loom",
  "parking_lot",
@@ -4499,7 +4499,7 @@ version = "0.52.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
 dependencies = [
- "windows-targets 0.52.6",
+ "windows-targets",
 ]
 
 [[package]]
@@ -4508,16 +4508,7 @@ version = "0.59.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
 dependencies = [
- "windows-targets 0.52.6",
-]
-
-[[package]]
-name = "windows-sys"
-version = "0.60.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
-dependencies = [
- "windows-targets 0.53.5",
+ "windows-targets",
 ]
 
 [[package]]
@@ -4535,31 +4526,14 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
 dependencies = [
- "windows_aarch64_gnullvm 0.52.6",
- "windows_aarch64_msvc 0.52.6",
- "windows_i686_gnu 0.52.6",
- "windows_i686_gnullvm 0.52.6",
- "windows_i686_msvc 0.52.6",
- "windows_x86_64_gnu 0.52.6",
- "windows_x86_64_gnullvm 0.52.6",
- "windows_x86_64_msvc 0.52.6",
-]
-
-[[package]]
-name = "windows-targets"
-version = "0.53.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3"
-dependencies = [
- "windows-link",
- "windows_aarch64_gnullvm 0.53.1",
- "windows_aarch64_msvc 0.53.1",
- "windows_i686_gnu 0.53.1",
- "windows_i686_gnullvm 0.53.1",
- "windows_i686_msvc 0.53.1",
- "windows_x86_64_gnu 0.53.1",
- "windows_x86_64_gnullvm 0.53.1",
- "windows_x86_64_msvc 0.53.1",
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
 ]
 
 [[package]]
@@ -4577,96 +4551,48 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
 
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
-
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
 
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
-
 [[package]]
 name = "windows_i686_gnu"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
 
-[[package]]
-name = "windows_i686_gnu"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3"
-
 [[package]]
 name = "windows_i686_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
 
-[[package]]
-name = "windows_i686_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
-
 [[package]]
 name = "windows_i686_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
 
-[[package]]
-name = "windows_i686_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
-
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
 
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
-
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
 
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
-
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
-
 [[package]]
 name = "winnow"
 version = "1.0.2"
diff --git a/Cargo.toml b/Cargo.toml
index 618aa969..b8afcfd6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -58,7 +58,7 @@ sha2 = "0.10"
 tiny-keccak = { version = "2", features = ["keccak"] }
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
-tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "15ae57cfbcb234ff5911a306898e7d93d396d648" }
+tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "bd4faa08a4fa4edb46bde393b4d20c6bd49591d0" }
 
 [workspace.lints.rust]
 invalid_reference_casting = "warn"
diff --git a/Ix/Cli/CheckRsCmd.lean b/Ix/Cli/CheckRsCmd.lean
index 596b3578..190f08ee 100644
--- a/Ix/Cli/CheckRsCmd.lean
+++ b/Ix/Cli/CheckRsCmd.lean
@@ -6,18 +6,22 @@
 
   - Default (Meta): kernel runs with metadata fields populated (Lean.Name,
     binder info, mdata). Supports `--ns` / `--consts` / `--consts-file`
-    for seed filtering and `--fail-out` for bisect-loop workflows.
+    for seed filtering and `--fail-out` for bisect-loop workflows. Seeded
+    meta checks are SUBJECT-ONLY: each seed is checked with its deps
+    lazily ingressed but trusted, not re-checked.
   - `--anon` (metadata-free): the env is loaded via `Env::get_anon` —
     `named`/`names`/`comms` sections are discarded at load time, never
-    reaching the kernel. Every kernel-checkable address (every constant
-    except Muts blocks and projections — projections are covered by
-    their parent block) is checked. The kernel's typechecking logic
-    structurally cannot read metadata (`M::MField<T>` is `()` in Anon
-    mode); progress labels are `@<hex>` addresses, not names.
-
-    `--anon` is incompatible with `--ns` / `--consts` / `--consts-file`:
-    the anon path checks everything in the env. Add `--addrs <hex,…>`
-    in the future if address-based filtering is needed.
+    reaching the kernel. The kernel's typechecking logic structurally
+    cannot read metadata (`M::MField<T>` is `()` in Anon mode); progress
+    labels are `@<hex>` addresses, not names.
+
+    Without a filter, every kernel-checkable address is checked (whole
+    env). With `--consts` / `--consts-file`, the named constants are
+    checked together with their FULL dependency closures — the same mode
+    and scope as the zkVM hosts' `--consts` execute path, so an
+    out-of-circuit run is directly comparable to the in-circuit one. Add
+    `--skip-deps` for a subject-only check (deps trusted), mirroring
+    `zisk-host --skip-deps`. `--ns` prefix filtering stays meta-only.
 
   Direct Lean → kernel typechecking (compile-and-check from source) is
   available via the `rsCheckConstsFFI` API for tests
@@ -30,6 +34,7 @@ public import Ix.Common
 public import Ix.KernelCheck
 public import Ix.Meta
 public import Ix.TracingTexray
+public import Ix.Cli.ConstsFile
 public import Ix.Cli.ValidateCmd
 public import Std.Internal.UV.System
 
@@ -49,18 +54,6 @@ private structure SeedSpec where
 private def SeedSpec.isEmpty (s : SeedSpec) : Bool :=
   s.prefixes.isEmpty && s.exacts.isEmpty
 
-/-- Read one constant name per line from `path`. Blank lines and lines
-    starting with `#` (after trimming) are ignored. -/
-private def readNamesFile (path : String) : IO (List Lean.Name) := do
-  let content ← IO.FS.readFile path
-  let lines := content.splitOn "\n"
-  let names : List Lean.Name := lines.filterMap fun raw =>
-    let cs := raw.toList.dropWhile Char.isWhitespace
-    let trimmed := String.ofList (cs.reverse.dropWhile Char.isWhitespace).reverse
-    if trimmed.isEmpty || trimmed.startsWith "#" then none
-    else some trimmed.toName
-  pure names
-
 /-- Build a `SeedSpec` from `--ns`, `--consts`, and `--consts-file`. -/
 private def resolveSeedSpec (p : Cli.Parsed) : IO (Option SeedSpec) := do
   let nsFlag     := p.flag? "ns"
@@ -83,7 +76,8 @@ private def resolveSeedSpec (p : Cli.Parsed) : IO (Option SeedSpec) := do
     exacts := exacts ++ parsed
   if let some flag := fileFlag then
     let path := flag.as! String
-    let parsed ← readNamesFile path
+    -- Shared grammar (Ix.Cli.ConstsFile); meta seeds resolve via `toName`.
+    let parsed := (← ConstsFile.read path).toList.map (·.toName)
     if parsed.isEmpty then
       IO.println s!"[check] warning: --consts-file '{path}' yielded zero names"
     else
@@ -173,6 +167,48 @@ private def runCheckAnon (envPath : String) (p : Cli.Parsed) : IO UInt32 := do
   IO.println s!"##check## {elapsed} {passed} {failures.size} {results.size} {peakRss}"
   return if failures.isEmpty then 0 else 1
 
+/-- Anon-mode per-constant runner: dispatch to `rsCheckAnonConstsFFI`. Checks
+    the named constants and (by default) their full dependency closures — the
+    zkVM hosts' semantics — or subject-only under `--skip-deps`. -/
+private def runCheckAnonConsts (envPath : String) (p : Cli.Parsed) : IO UInt32 := do
+  let verbose := p.flag? "verbose" |>.isSome
+  let skipDeps := p.hasFlag "skip-deps"
+  let failOutPath : String :=
+    match p.flag? "fail-out" with
+    | some flag => flag.as! String
+    | none      => ""
+  -- Raw strings (no toName round-trip): the FFI resolves displayed forms
+  -- against the env's `named` map, matching the zkVM hosts' resolution.
+  let names ← ConstsFile.gather p
+  if names.isEmpty then
+    IO.println "[check] error: --consts/--consts-file resolved to zero names"
+    return 1
+
+  let scope := if skipDeps then "subject-only" else "full-closure"
+  IO.println s!"Running Ix kernel check (anon mode, {scope}) on {envPath}"
+  IO.println s!"[check] {names.size} seed constant(s): {", ".intercalate names.toList}"
+  let start ← IO.monoMsNow
+  let results ← rsCheckAnonConstsFFI envPath names skipDeps (!verbose) failOutPath
+  let elapsed := (← IO.monoMsNow) - start
+
+  let mut passed := 0
+  let mut failures : Array (String × String) := #[]
+  for (hex, res) in results do
+    match res with
+    | none => passed := passed + 1
+    | some err => failures := failures.push (s!"#{hex}", err.message)
+
+  IO.println s!"[check] checked {results.size} constants in {elapsed.formatMs}"
+  IO.println s!"[check] {passed}/{results.size} passed"
+  reportFailures failures
+
+  if !failOutPath.isEmpty then
+    IO.println s!"[check] streamed {failures.size} failure(s) to {failOutPath}"
+
+  let peakRss ← TracingTexray.peakTreeRssBytes
+  IO.println s!"##check## {elapsed} {passed} {failures.size} {results.size} {peakRss}"
+  return if failures.isEmpty then 0 else 1
+
 /-- Meta-mode runner: dispatch to `rsCheckIxonFFI` with seed filtering. -/
 private def runCheckMeta (envPath : String) (p : Cli.Parsed) : IO UInt32 := do
   let verbose := p.flag? "verbose" |>.isSome
@@ -247,14 +283,19 @@ def runCheckRsCmd (p : Cli.Parsed) : IO UInt32 := do
     Std.Internal.UV.System.osSetenv "IX_KERNEL_CHECK_WORKERS" (toString n)
 
   let anon := p.flag? "anon" |>.isSome
+  let hasConsts := (p.flag? "consts").isSome || (p.flag? "consts-file").isSome
+  if p.hasFlag "skip-deps" && !(anon && hasConsts) then
+    p.printError "error: --skip-deps only applies to `--anon --consts/--consts-file` \
+      (meta-mode seeded checks are always subject-only)"
+    return 1
   if anon then
-    let hasConsts := p.flag? "consts" |>.isSome
-    let hasNs := p.flag? "ns" |>.isSome
-    let hasConstsFile := p.flag? "consts-file" |>.isSome
-    if hasConsts || hasNs || hasConstsFile then
-      p.printError "error: --anon checks the entire env; --consts/--ns/--consts-file are unsupported"
+    if p.flag? "ns" |>.isSome then
+      p.printError "error: --ns prefix filtering is meta-only; --anon supports --consts/--consts-file"
       return 1
-    runCheckAnon envPath p
+    if hasConsts then
+      runCheckAnonConsts envPath p
+    else
+      runCheckAnon envPath p
   else
     runCheckMeta envPath p
 
@@ -268,8 +309,9 @@ def checkRsCmd : Cli.Cmd := `[Cli|
   FLAGS:
     anon;                   "Run the kernel in anon mode (no metadata read from .ixe)"
     ns            : String; "Comma-separated Lean.Name prefixes to filter on (meta mode only)"
-    consts        : String; "Comma-separated EXACT constant names to seed (meta mode only)"
-    "consts-file" : String; "Path to a file with one constant name per line (meta mode only)"
+    consts        : String; "Comma-separated EXACT constant names. Meta mode: subject-only seed check. Anon mode: full-closure check of each name (the zkVM hosts' semantics; --skip-deps for subject-only)."
+    "consts-file" : String; "Path to a file with one constant name per line (`#` comments); unions with --consts."
+    "skip-deps";            "With --anon --consts: check each named constant subject-only, trusting its deps (same flag as zisk-host/sp1-host/bench-typecheck)."
     "fail-out"    : String; "Write failing constants to this path (consumable by --consts-file)"
     workers       : Nat;    "Number of parallel kernel-check workers; 1 disables parallelism (default: available_parallelism). Plumbs via IX_KERNEL_CHECK_WORKERS env var."
     verbose;                "Log every constant on its own line (default: quiet)"
diff --git a/Ix/Cli/CompileCmd.lean b/Ix/Cli/CompileCmd.lean
index a00f95e5..b9b3a38b 100644
--- a/Ix/Cli/CompileCmd.lean
+++ b/Ix/Cli/CompileCmd.lean
@@ -3,6 +3,7 @@ public import Cli
 public import Ix.Common
 public import Ix.CompileM
 public import Ix.Meta
+public import Ix.Cli.ConstsFile
 public import Ix.Cli.ValidateCmd
 
 public section
@@ -14,19 +15,6 @@ private def defaultOutPathFor (pathStr : String) : String :=
   let stem := path.fileStem.getD (path.fileName.getD pathStr)
   stem.toLower ++ ".ixe"
 
-/-- Read one constant name per line from `path`. Blank lines and lines
-    starting with `#` (after trimming) are ignored. Mirrors
-    `Ix.Cli.CheckCmd.readNamesFile`. -/
-private def readNamesFile (path : String) : IO (List Lean.Name) := do
-  let content ← IO.FS.readFile path
-  let lines := content.splitOn "\n"
-  let names : List Lean.Name := lines.filterMap fun raw =>
-    let cs := raw.toList.dropWhile Char.isWhitespace
-    let trimmed := String.ofList (cs.reverse.dropWhile Char.isWhitespace).reverse
-    if trimmed.isEmpty || trimmed.startsWith "#" then none
-    else some trimmed.toName
-  pure names
-
 def runCompileCmd (p : Cli.Parsed) : IO UInt32 := do
   let some path := p.positionalArg? "path"
     | p.printError "error: must specify <path> to a Lean source file"
@@ -51,7 +39,10 @@ def runCompileCmd (p : Cli.Parsed) : IO UInt32 := do
     if let some flag := p.flag? "exclude" then
       for n in parsePrefixes (flag.as! String) do s := s.insert n
     if let some flag := p.flag? "exclude-file" then
-      for n in ← readNamesFile (flag.as! String) do s := s.insert n
+      -- Shared names-file grammar (Ix.Cli.ConstsFile); names resolve here
+      -- via `toName` like the `--exclude` comma-list.
+      for n in ← Ix.Cli.ConstsFile.read (flag.as! String) do
+        s := s.insert n.toName
     pure s
   if !excludeSet.isEmpty then
     IO.println s!"[compile] exclude: {excludeSet.size} name(s) will be dropped from seed set"
diff --git a/Ix/Cli/ConstsFile.lean b/Ix/Cli/ConstsFile.lean
new file mode 100644
index 00000000..5083348b
--- /dev/null
+++ b/Ix/Cli/ConstsFile.lean
@@ -0,0 +1,61 @@
+/-
+  Shared parsing for constant-name inputs (`--consts` comma-lists and
+  `--consts-file` files) across every CLI that takes them: `ix check-rs`,
+  `ix compile --exclude-file`, and `bench-typecheck`.
+
+  One grammar everywhere: one name per line, everything from a `#` to end of
+  line is a comment (whole-line or inline), blank lines dropped. `#` never
+  appears in a Lean name, so splitting on it is safe. The zkVM hosts'
+  `--consts-file` (Rust `collect_consts`) parses the same grammar, so a single
+  names file drives all backends identically.
+
+  Names stay RAW strings here — resolution differs per caller (`toName` for
+  meta-mode seeds, string-match against the env's `named` map for the anon /
+  zkVM-style paths, where a `toName` round-trip could mangle numeric or
+  private components).
+-/
+module
+public import Cli
+
+public section
+
+namespace Ix.Cli.ConstsFile
+
+/-- Parse names-file contents: one name per line, `#`-to-EOL comments,
+    blank lines dropped. -/
+def parseLines (contents : String) : Array String :=
+  (contents.splitOn "\n").filterMap (fun line =>
+    let s := ((line.splitOn "#").head?.getD "").trimAscii
+    if s.isEmpty then none else some s.toString) |>.toArray
+
+/-- Read and parse a names file. -/
+def read (path : String) : IO (Array String) :=
+  parseLines <$> IO.FS.readFile path
+
+/-- Split a `--consts`-style comma-list into trimmed, non-empty names. -/
+def parseCommaList (arg : String) : Array String :=
+  (arg.splitOn ",").filterMap (fun s =>
+    let t := s.trimAscii
+    if t.isEmpty then none else some t.toString) |>.toArray
+
+/-- Union of a parsed `--consts` comma-list flag and a `--consts-file` file
+    (both optional), deduped in first-seen order. -/
+def gather (p : Cli.Parsed)
+    (constsFlag : String := "consts") (fileFlag : String := "consts-file") :
+    IO (Array String) := do
+  let fromFlag : Array String :=
+    match p.flag? constsFlag with
+    | some f => parseCommaList (f.as! String)
+    | none => #[]
+  let fromFile : Array String ←
+    match p.flag? fileFlag with
+    | some f => read (f.as! String)
+    | none => pure #[]
+  -- Linear-scan dedupe: name lists are tens of entries, not thousands.
+  let mut acc : Array String := #[]
+  for n in fromFlag ++ fromFile do
+    if !acc.contains n then
+      acc := acc.push n
+  return acc
+
+end Ix.Cli.ConstsFile
diff --git a/Ix/KernelCheck.lean b/Ix/KernelCheck.lean
index f32a6fab..2213c062 100644
--- a/Ix/KernelCheck.lean
+++ b/Ix/KernelCheck.lean
@@ -142,6 +142,32 @@ opaque rsCheckAnonFFI :
     @& String →                          -- fail-out path ("" = none)
     IO (Array (String × Option CheckError))
 
+/-- FFI: anon-mode type-check of named constants with (by default) their full
+    dependency closures — the same mode and scope as the zkVM hosts' `--consts`
+    execute path, so an out-of-circuit run is directly comparable to the
+    in-circuit one. The `Bool` after the names is `skip-deps`: `true` checks
+    only each name's own work item (subject-only; deps trusted), mirroring
+    `zisk-host --skip-deps`.
+
+    Names are the constants' displayed forms (e.g. `"Nat.add_comm"`,
+    `"_private.Init.….instRxcHasSize_eq"`), resolved through the env's `named`
+    metadata by string match — the same resolution the zkVM hosts use — after
+    which the check runs on the anon view (the kernel never sees names). A
+    member of a mutual block selects the whole block's work item. Multiple
+    names union their closures into one check set.
+
+    Returns `(hex_address, Option CheckError)` pairs, one per checked target,
+    exactly like `rsCheckAnonFFI`. Errors (rather than returning) when no names
+    are given or a name doesn't resolve, so a typo can't yield an empty check. -/
+@[extern "rs_kernel_check_anon_consts"]
+opaque rsCheckAnonConstsFFI :
+    @& String →                          -- .ixe path
+    @& Array String →                    -- constant names (displayed form)
+    @& Bool →                            -- skip-deps (subject-only)
+    @& Bool →                            -- quiet
+    @& String →                          -- fail-out path ("" = none)
+    IO (Array (String × Option CheckError))
+
 /-- FFI: profile a `.ixe` out of circuit, writing a `.ixprof` sidecar with
     per-block heartbeats + the delta-unfold graph (the sharding cost model,
     see `plans/sharding.md`). Runs the anon kernel over every checkable target.
diff --git a/crates/ffi/src/kernel.rs b/crates/ffi/src/kernel.rs
index a386d6a2..6438f8a6 100644
--- a/crates/ffi/src/kernel.rs
+++ b/crates/ffi/src/kernel.rs
@@ -1318,12 +1318,14 @@ enum AnonWorkItem {
 /// [`ix_kernel::anon_work::build_anon_work`] (shared with the
 /// SP1/Zisk guests) and layers the FFI's per-target result-slot
 /// bookkeeping on top.
-fn build_anon_work(
-  env: &IxonEnv,
-) -> Result<(Vec<AnonWorkItem>, Vec<Address>), String> {
+/// Assign result slots to a set of kernel work items — the indexing step
+/// shared by the whole-env check (every item) and the per-constant closure
+/// check (a filtered subset).
+fn index_anon_work(
+  kernel_work: Vec<ix_kernel::anon_work::AnonWorkItem>,
+) -> (Vec<AnonWorkItem>, Vec<Address>) {
   use ix_kernel::anon_work::AnonWorkItem as KItem;
 
-  let kernel_work = ix_kernel::anon_work::build_anon_work(env)?;
   let mut work: Vec<AnonWorkItem> = Vec::with_capacity(kernel_work.len());
   let mut addrs: Vec<Address> = Vec::new();
   for item in kernel_work {
@@ -1341,7 +1343,13 @@ fn build_anon_work(
       },
     }
   }
-  Ok((work, addrs))
+  (work, addrs)
+}
+
+fn build_anon_work(
+  env: &IxonEnv,
+) -> Result<(Vec<AnonWorkItem>, Vec<Address>), String> {
+  Ok(index_anon_work(ix_kernel::anon_work::build_anon_work(env)?))
 }
 
 #[allow(clippy::needless_pass_by_value)]
@@ -1623,6 +1631,235 @@ pub extern "C" fn rs_kernel_check_anon(
   build_anon_result_array(&addrs_for_return, &results)
 }
 
+/// FFI: anon-mode type-check of named constants with (by default) their full
+/// dependency closures — the same mode and scope as the zkVM hosts' `--consts`
+/// execute path, so an out-of-circuit run is directly comparable to the
+/// in-circuit one. `skip_deps` restricts the check to each name's own work
+/// item (subject-only; deps trusted), mirroring `zisk-host --skip-deps`.
+///
+/// Names resolve through the env's `named` metadata by displayed form (the
+/// same string match the zkVM hosts use), then the metadata is dropped and
+/// the check runs on the anon view — the kernel never sees names. A member
+/// of a mutual block selects the whole block's work item (blocks check
+/// atomically). Multiple names union their closures into one check set.
+///
+/// Returns `(hex_address, Option CheckError)` pairs, one per checked target,
+/// like `rs_kernel_check_anon`. Errors if `names` is empty or a name is unresolved.
+#[unsafe(no_mangle)]
+pub extern "C" fn rs_kernel_check_anon_consts(
+  env_path: LeanString<LeanBorrowed<'_>>,
+  names: LeanArray<LeanBorrowed<'_>>,
+  skip_deps: LeanBool<LeanBorrowed<'_>>,
+  quiet: LeanBool<LeanBorrowed<'_>>,
+  fail_out: LeanString<LeanBorrowed<'_>>,
+) -> LeanIOResult<LeanOwned> {
+  use ix_kernel::anon_work::{
+    AnonWorkItem as KItem, block_of_addr, closure_addrs, work_block_addr,
+  };
+
+  let total_start = Instant::now();
+  let quiet = quiet.to_bool();
+  let skip_deps = skip_deps.to_bool();
+  let path = env_path.to_string();
+  let fail_out_path = fail_out.to_string();
+  let fail_out_path =
+    if fail_out_path.is_empty() { None } else { Some(fail_out_path) };
+  let names_vec: Vec<String> = names.map(|obj| obj.as_string().to_string());
+  if names_vec.is_empty() {
+    return LeanIOResult::error_string(
+      "rs_kernel_check_anon_consts: no constant names given",
+    );
+  }
+
+  let t0 = Instant::now();
+  let bytes = match std::fs::read(&path) {
+    Ok(bytes) => bytes,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_kernel_check_anon_consts: failed to read {path}: {e}"
+      ));
+    },
+  };
+
+  // Resolve displayed names → addresses through the full env's `named`
+  // metadata (the anon view discards it); the full env drops right after.
+  let resolved: Vec<Address> = {
+    let mut slice: &[u8] = &bytes;
+    let full = match IxonEnv::get(&mut slice) {
+      Ok(env) => env,
+      Err(e) => {
+        return LeanIOResult::error_string(&format!(
+          "rs_kernel_check_anon_consts: failed to deserialize {path}: {e}"
+        ));
+      },
+    };
+    let by_name: FxHashMap<String, Address> = full
+      .named
+      .iter()
+      .map(|e| (e.key().to_string(), e.value().addr.clone()))
+      .collect();
+    let mut addrs = Vec::with_capacity(names_vec.len());
+    let mut missing: Vec<&str> = Vec::new();
+    for n in &names_vec {
+      match by_name.get(n) {
+        Some(a) => addrs.push(a.clone()),
+        None => missing.push(n),
+      }
+    }
+    if !missing.is_empty() {
+      return LeanIOResult::error_string(&format!(
+        "rs_kernel_check_anon_consts: no constant(s) named [{}] in {path}",
+        missing.join(", ")
+      ));
+    }
+    addrs
+  };
+  eprintln!(
+    "[rs_kernel_check_anon_consts] resolve:    {:>8.1?} ({} name(s))",
+    t0.elapsed(),
+    resolved.len()
+  );
+
+  let t1 = Instant::now();
+  let mut slice: &[u8] = &bytes;
+  let ixon_env = match IxonEnv::get_anon(&mut slice) {
+    Ok(env) => env,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_kernel_check_anon_consts: failed to deserialize (anon) {path}: {e}"
+      ));
+    },
+  };
+  drop(bytes);
+  eprintln!(
+    "[rs_kernel_check_anon_consts] anon parse: {:>8.1?} ({} consts)",
+    t1.elapsed(),
+    ixon_env.const_count(),
+  );
+
+  // Map each seed to the work item whose ingress block owns it (standalone →
+  // itself; a mutual-block member → the whole block, checked atomically).
+  let t2 = Instant::now();
+  let kernel_work = match ix_kernel::anon_work::build_anon_work(&ixon_env) {
+    Ok(work) => work,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_kernel_check_anon_consts: build_anon_work: {e}"
+      ));
+    },
+  };
+  let by_block: FxHashMap<Address, usize> = kernel_work
+    .iter()
+    .enumerate()
+    .map(|(i, w)| (work_block_addr(&ixon_env, w), i))
+    .collect();
+  let mut seed_items: Vec<usize> = Vec::new();
+  for addr in &resolved {
+    let block = block_of_addr(&ixon_env, addr);
+    match by_block.get(&block) {
+      Some(i) => {
+        if !seed_items.contains(i) {
+          seed_items.push(*i);
+        }
+      },
+      None => {
+        return LeanIOResult::error_string(&format!(
+          "rs_kernel_check_anon_consts: no work item covers {} (block {})",
+          addr.hex(),
+          block.hex()
+        ));
+      },
+    }
+  }
+
+  // The check set. Subject-only: the seeds' own items. Full-closure: every
+  // work item inside the seeds' dependency closure — the same set a zkVM
+  // guest enumerates from its closure sub-env (`build_sub_env` +
+  // `build_anon_work`), computed here directly from `closure_addrs` without
+  // serializing a sub-env.
+  let selected: Vec<KItem> = if skip_deps {
+    seed_items.iter().map(|&i| kernel_work[i].clone()).collect()
+  } else {
+    let roots: Vec<Address> = seed_items
+      .iter()
+      .flat_map(|&i| kernel_work[i].proven_targets())
+      .collect();
+    let closure = closure_addrs(&ixon_env, &roots);
+    kernel_work
+      .into_iter()
+      .filter(|w| match w {
+        KItem::Standalone { addr } => closure.contains(addr),
+        KItem::Block { block_addr, .. } => closure.contains(block_addr),
+      })
+      .collect()
+  };
+  let (work, addrs) = index_anon_work(selected);
+  eprintln!(
+    "[rs_kernel_check_anon_consts] build work: {:>8.1?} ({} items, {} targets, {})",
+    t2.elapsed(),
+    work.len(),
+    addrs.len(),
+    if skip_deps { "subject-only" } else { "full-closure" },
+  );
+
+  let failure_log: Option<Arc<FailureLog>> = match fail_out_path.as_deref() {
+    None => None,
+    Some(out_path) => match FailureLog::open(out_path, &path, addrs.len()) {
+      Ok(log) => {
+        eprintln!(
+          "[rs_kernel_check_anon_consts] streaming failures to {out_path}"
+        );
+        Some(Arc::new(log))
+      },
+      Err(e) => {
+        return LeanIOResult::error_string(&format!(
+          "rs_kernel_check_anon_consts: failed to open fail-out file {out_path}: {e}"
+        ));
+      },
+    },
+  };
+
+  let total = addrs.len();
+  let addrs_for_return = addrs.clone();
+  let t3 = Instant::now();
+  let ixon_env_arc = Arc::new(ixon_env);
+  let results = match run_anon_checks_parallel(
+    ixon_env_arc,
+    work,
+    addrs,
+    quiet,
+    failure_log.clone(),
+  ) {
+    Ok(r) => r,
+    Err(msg) => {
+      if let Some(log) = failure_log.as_ref() {
+        log.finalize();
+      }
+      return build_uniform_error(total, &format!("[thread] {msg}"));
+    },
+  };
+
+  let passed = results.iter().filter(|r| r.is_ok()).count();
+  let failed = results.iter().filter(|r| r.is_err()).count();
+  eprintln!(
+    "[rs_kernel_check_anon_consts] {passed}/{total} passed, {failed} failed ({:.1?})",
+    t3.elapsed()
+  );
+  eprintln!(
+    "[rs_kernel_check_anon_consts] total:      {:>8.1?}",
+    total_start.elapsed()
+  );
+  if let Some(log) = failure_log.as_ref() {
+    log.finalize();
+    eprintln!(
+      "[rs_kernel_check_anon_consts] streamed {} failure(s) to fail-out",
+      log.count()
+    );
+  }
+
+  build_anon_result_array(&addrs_for_return, &results)
+}
+
 // ===========================================================================
 // Sharding profiler: run the anon kernel out of circuit over a `.ixe`,
 // recording per-block heartbeats + the delta-unfold graph into a `.ixprof`.
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index 1b11f626..b0ee7d52 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -18,8 +18,8 @@ the same backend drivers:
 
 | backend | what it measures | metrics |
 |---|---|---|
-| `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove) | `fft-cost`, `execute-time`, `prove-time`, `peak-rss` |
-| `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `peak-rss` |
+| `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove) | `fft-cost`, `execute-time`, `execute-peak-rss`, `prove-time`, `peak-rss` |
+| `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `execute-peak-rss` |
 | `ooc` | the same kernel run **out-of-circuit and in parallel** (`ix check-rs`) — far faster | `throughput`, `check-time`, `peak-rss` |
 | `compile` | `ix compile <env>.lean → <env>.ixe` on the current PR — measures the compile step itself, keyed by CamelCase env slug (`InitStd`, `Lean`, `Mathlib`, `FLT`) | `compile-time`, `throughput`, `file-size`, `constants` |
 
@@ -31,10 +31,12 @@ constant records the neutral `{"oom": true}` sentinel and `bench.py compare`
 renders `OOM` cells (with `n/a` Δ%) in the table for that row.
 
 The `ooc` backend reports two views: the **whole env** (`ix check-rs --anon`,
-keyed by env) and a **per-primary full-closure check** (`ix check-rs --consts`,
-keyed by constant — apples-to-apples with the zkVM execute (also full-closure),
-so the delta isolates in-circuit vs out-of-circuit overhead rather than mixing
-in subject-only vs full-closure scope).
+keyed by env) and a **per-primary full-closure check** (`ix check-rs --anon
+--consts <name>`, keyed by constant). The per-primary view runs the constant's
+full dependency closure in anon mode — the same mode and scope as the zkVM
+execute — so the delta isolates in-circuit vs out-of-circuit overhead rather
+than mixing in closure-size or metadata effects. (`--skip-deps` exists on both
+sides for a subject-only variant, but the benchmarks use full-closure.)
 
 All are driven by `.github/scripts/run.sh` (compile the env `.ixe`, run the
 backend, emit a neutral `{ "<name>": { "<metric>": n } }` JSON). The PR workflow
@@ -71,9 +73,11 @@ JSON and in bencher.dev.
 
 - `env` — compile target the constant resolves in (`initStd` / `lean` / `mathlib`).
 - `tier` — `cheap` (prove-feasible on a CI runner under Aiur's ~128 GB RAM
-  ceiling) or `heavy` (a single-shard prove would OOM without the runner-
-  installed RAM watchdog killing it). Informational-only after the RAM
-  watchdog took over gating — `run.sh` no longer branches on `tier`.
+  ceiling) or `heavy` (a single-shard prove exceeds the RAM watchdog ceiling
+  and records an OOM row). Consumed only by `bench.py manifest` for selection
+  (the `BENCH_TIER` filter, and the non-primary prove set defaults to cheap);
+  `run.sh` itself never branches on tier — it attempts a full prove of every
+  selected constant under the watchdog.
 - `primary` — the curated **primary subset** (currently ~20 constants across
   initStd + mathlib), spanning shape and cost range. Default for the
   `!benchmark` PR comment and the bencher jobs. Set `BENCH_FULL=1` to include
@@ -101,8 +105,11 @@ RUST_LOG=info                  # passthrough env (allowlisted)
 ```
 
 Mode is fixed per backend: `aiur` runs `prove` by default (its report also
-carries the execute-side columns `fft-cost` / `execute-time` alongside
-`prove-time`); `zisk` / `sp1` / `ooc` run `execute`; `compile` runs `ix
+carries the execute-side columns `fft-cost` / `execute-time` /
+`execute-peak-rss` alongside `prove-time` / `peak-rss` — `execute-peak-rss`
+is sampled at the Phase 1/2 boundary, before proving allocations, precisely
+so execute-mode comparisons stay apples-to-apples against prove-run
+baselines); `zisk` / `sp1` / `ooc` run `execute`; `compile` runs `ix
 compile`. The optional bare `execute` token flips `aiur` to execute-only
 (`bench-typecheck --execute-only`, skips Phase 2); on other backends it's a
 no-op. Defaults: `aiur`, `initStd`, primary subset. Backends fan out as a
@@ -124,8 +131,11 @@ Threshold semantics per measure kind:
   directional: `upper 0` (any increase is a real regression), `lower _`
   (drops are legitimate wins — algorithmic improvements, better packing).
 - **`execute-time`, `prove-time`, `check-time`, `compile-time`, `peak-rss`,
-  `file-size`** — noisy wall-clock or size measures: `upper 0.05–0.10`,
-  `lower _`.
+  `execute-peak-rss`, `file-size`** — noisy wall-clock or size measures:
+  `upper 0.05–0.10`, `lower _`. `execute-peak-rss` is the execute phase's RSS
+  high-water on every backend that has one (bench-typecheck samples it at the
+  Phase 1/2 boundary; the zkVM hosts' execute peak carries the same name);
+  bare `peak-rss` is a prove-phase (or, for ooc, whole-check) peak.
 - **`throughput`** — higher-is-better: `upper _`, `lower 0.05–0.10`.
 - **`phase:<span>`** — uploaded for trend visibility, intentionally left
   un-thresholded (dynamic names + noise; the PR-comment drill-down is where
diff --git a/sp1/Cargo.lock b/sp1/Cargo.lock
index eb4c6af9..aa2a68a4 100644
--- a/sp1/Cargo.lock
+++ b/sp1/Cargo.lock
@@ -1029,7 +1029,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -2069,7 +2069,7 @@ version = "0.50.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
 dependencies = [
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -2859,7 +2859,7 @@ dependencies = [
  "once_cell",
  "socket2 0.6.3",
  "tracing",
- "windows-sys 0.60.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -3151,7 +3151,7 @@ dependencies = [
  "errno",
  "libc",
  "linux-raw-sys",
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -4669,7 +4669,7 @@ dependencies = [
  "getrandom 0.4.2",
  "once_cell",
  "rustix",
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -5111,7 +5111,7 @@ dependencies = [
 [[package]]
 name = "tracing-texray"
 version = "0.2.0"
-source = "git+https://github.com/argumentcomputer/tracing-texray?rev=15ae57cfbcb234ff5911a306898e7d93d396d648#15ae57cfbcb234ff5911a306898e7d93d396d648"
+source = "git+https://github.com/argumentcomputer/tracing-texray?rev=bd4faa08a4fa4edb46bde393b4d20c6bd49591d0#bd4faa08a4fa4edb46bde393b4d20c6bd49591d0"
 dependencies = [
  "loom",
  "parking_lot",
@@ -5537,15 +5537,6 @@ dependencies = [
  "windows-targets 0.52.6",
 ]
 
-[[package]]
-name = "windows-sys"
-version = "0.60.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
-dependencies = [
- "windows-targets 0.53.5",
-]
-
 [[package]]
 name = "windows-sys"
 version = "0.61.2"
@@ -5579,30 +5570,13 @@ dependencies = [
  "windows_aarch64_gnullvm 0.52.6",
  "windows_aarch64_msvc 0.52.6",
  "windows_i686_gnu 0.52.6",
- "windows_i686_gnullvm 0.52.6",
+ "windows_i686_gnullvm",
  "windows_i686_msvc 0.52.6",
  "windows_x86_64_gnu 0.52.6",
  "windows_x86_64_gnullvm 0.52.6",
  "windows_x86_64_msvc 0.52.6",
 ]
 
-[[package]]
-name = "windows-targets"
-version = "0.53.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3"
-dependencies = [
- "windows-link",
- "windows_aarch64_gnullvm 0.53.1",
- "windows_aarch64_msvc 0.53.1",
- "windows_i686_gnu 0.53.1",
- "windows_i686_gnullvm 0.53.1",
- "windows_i686_msvc 0.53.1",
- "windows_x86_64_gnu 0.53.1",
- "windows_x86_64_gnullvm 0.53.1",
- "windows_x86_64_msvc 0.53.1",
-]
-
 [[package]]
 name = "windows_aarch64_gnullvm"
 version = "0.48.5"
@@ -5615,12 +5589,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
 
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
-
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.48.5"
@@ -5633,12 +5601,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
 
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
-
 [[package]]
 name = "windows_i686_gnu"
 version = "0.48.5"
@@ -5651,24 +5613,12 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
 
-[[package]]
-name = "windows_i686_gnu"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3"
-
 [[package]]
 name = "windows_i686_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
 
-[[package]]
-name = "windows_i686_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
-
 [[package]]
 name = "windows_i686_msvc"
 version = "0.48.5"
@@ -5681,12 +5631,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
 
-[[package]]
-name = "windows_i686_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
-
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.48.5"
@@ -5699,12 +5643,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
 
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
-
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.48.5"
@@ -5717,12 +5655,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
 
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
-
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.48.5"
@@ -5735,12 +5667,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
-
 [[package]]
 name = "winnow"
 version = "0.5.40"
diff --git a/sp1/host/Cargo.toml b/sp1/host/Cargo.toml
index eed2beb0..d03b0724 100644
--- a/sp1/host/Cargo.toml
+++ b/sp1/host/Cargo.toml
@@ -34,7 +34,7 @@ anyhow = "1"
 serde_json = "1"
 # Process-tree RSS sampler (accurate peak RAM) + per-phase timing sink for the
 # CI drill-down.
-tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "15ae57cfbcb234ff5911a306898e7d93d396d648" }
+tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "bd4faa08a4fa4edb46bde393b4d20c6bd49591d0" }
 # Throughput formatting (e.g. `42.0 consts/s`).
 human-repr = "1"
 # Proof-size measurement: SP1's `SP1ProofWithPublicValues::bytes()` returns
diff --git a/sp1/host/src/main.rs b/sp1/host/src/main.rs
index 3d050ff4..6b548514 100644
--- a/sp1/host/src/main.rs
+++ b/sp1/host/src/main.rs
@@ -57,8 +57,10 @@ struct Args {
   #[arg(long)]
   consts_file: Option<PathBuf>,
 
-  /// With --consts: check each subject only, trusting its deps.
-  #[arg(long, requires = "consts")]
+  /// With --consts/--consts-file: check each subject only, trusting its deps.
+  // Validated in main (not clap `requires = "consts"`): names may come from
+  // --consts-file alone, which a clap-level `requires` would wrongly reject.
+  #[arg(long)]
   skip_deps: bool,
 
   /// Write per-constant results JSON `{ "<name>": { … } }` (accumulated across --consts).
@@ -252,7 +254,7 @@ async fn main() -> Result<()> {
   // JSON Lines — the CI drill-down input.
   if args.texray {
     if let Some(json) = args.json.as_ref().and_then(|p| p.to_str()) {
-      let _ = tracing_texray::json_sink::to_file(format!("{json}.spans"));
+      let _ = tracing_texray::json_sink::to_file(&format!("{json}.spans"));
     }
   }
 
@@ -262,6 +264,9 @@ async fn main() -> Result<()> {
   if !consts.is_empty() && args.meta {
     bail!("--consts is Anon-only and cannot be combined with --meta");
   }
+  if consts.is_empty() && args.skip_deps {
+    bail!("--skip-deps requires constants via --consts or --consts-file");
+  }
 
   if consts.is_empty() {
     run_one(&client, &args, &whole_env_bytes, None).await?;
@@ -359,7 +364,10 @@ async fn run_one<C: Prover + Sync>(
           "cycles": cycles,
           "execute-time": (secs * 1e6).round() / 1e6,
           "throughput": tput.round(),
-          "peak-rss": peak_rss_bytes(),
+          // Named for what it measures (the execute phase's RSS high-water),
+          // matching bench-typecheck's execute-peak-rss; bare `peak-rss` is
+          // reserved for prove-phase peaks.
+          "execute-peak-rss": peak_rss_bytes(),
         }),
       )?;
     }
@@ -420,13 +428,6 @@ mod cli_tests {
     )
     .expect("parse ok")
   }
-  fn parse_err(argv: &[&str]) -> String {
-    Args::try_parse_from(
-      std::iter::once("sp1-host").chain(argv.iter().copied()),
-    )
-    .unwrap_err()
-    .to_string()
-  }
 
   #[test]
   fn consts_splits_on_comma() {
@@ -441,8 +442,11 @@ mod cli_tests {
   }
 
   #[test]
-  fn skip_deps_requires_consts() {
-    assert!(parse_err(&["--skip-deps"]).contains("--consts"));
+  fn skip_deps_parses_with_consts_file_only() {
+    // Names may come from --consts-file alone; clap must accept the parse
+    // (main validates after collect_consts).
+    let a = parse(&["--consts-file", "names.txt", "--skip-deps"]);
+    assert!(a.skip_deps);
   }
 
   #[test]
diff --git a/zisk/Cargo.lock b/zisk/Cargo.lock
index dfcb2928..18ac716b 100644
--- a/zisk/Cargo.lock
+++ b/zisk/Cargo.lock
@@ -4188,7 +4188,7 @@ dependencies = [
  "once_cell",
  "socket2",
  "tracing",
- "windows-sys 0.60.2",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -5417,7 +5417,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874"
 dependencies = [
  "rustix",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -5917,7 +5917,7 @@ dependencies = [
 [[package]]
 name = "tracing-texray"
 version = "0.2.0"
-source = "git+https://github.com/argumentcomputer/tracing-texray?rev=15ae57cfbcb234ff5911a306898e7d93d396d648#15ae57cfbcb234ff5911a306898e7d93d396d648"
+source = "git+https://github.com/argumentcomputer/tracing-texray?rev=bd4faa08a4fa4edb46bde393b4d20c6bd49591d0#bd4faa08a4fa4edb46bde393b4d20c6bd49591d0"
 dependencies = [
  "loom",
  "parking_lot",
@@ -6526,15 +6526,6 @@ dependencies = [
  "windows-targets 0.52.6",
 ]
 
-[[package]]
-name = "windows-sys"
-version = "0.60.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
-dependencies = [
- "windows-targets 0.53.5",
-]
-
 [[package]]
 name = "windows-sys"
 version = "0.61.2"
@@ -6568,30 +6559,13 @@ dependencies = [
  "windows_aarch64_gnullvm 0.52.6",
  "windows_aarch64_msvc 0.52.6",
  "windows_i686_gnu 0.52.6",
- "windows_i686_gnullvm 0.52.6",
+ "windows_i686_gnullvm",
  "windows_i686_msvc 0.52.6",
  "windows_x86_64_gnu 0.52.6",
  "windows_x86_64_gnullvm 0.52.6",
  "windows_x86_64_msvc 0.52.6",
 ]
 
-[[package]]
-name = "windows-targets"
-version = "0.53.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3"
-dependencies = [
- "windows-link 0.2.1",
- "windows_aarch64_gnullvm 0.53.1",
- "windows_aarch64_msvc 0.53.1",
- "windows_i686_gnu 0.53.1",
- "windows_i686_gnullvm 0.53.1",
- "windows_i686_msvc 0.53.1",
- "windows_x86_64_gnu 0.53.1",
- "windows_x86_64_gnullvm 0.53.1",
- "windows_x86_64_msvc 0.53.1",
-]
-
 [[package]]
 name = "windows-threading"
 version = "0.1.0"
@@ -6622,12 +6596,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
 
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
-
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.42.2"
@@ -6640,12 +6608,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
 
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
-
 [[package]]
 name = "windows_i686_gnu"
 version = "0.42.2"
@@ -6658,24 +6620,12 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
 
-[[package]]
-name = "windows_i686_gnu"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3"
-
 [[package]]
 name = "windows_i686_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
 
-[[package]]
-name = "windows_i686_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
-
 [[package]]
 name = "windows_i686_msvc"
 version = "0.42.2"
@@ -6688,12 +6638,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
 
-[[package]]
-name = "windows_i686_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
-
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.42.2"
@@ -6706,12 +6650,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
 
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
-
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.42.2"
@@ -6724,12 +6662,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
 
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
-
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.42.2"
@@ -6742,12 +6674,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
-
 [[package]]
 name = "winnow"
 version = "0.7.15"
diff --git a/zisk/host/Cargo.toml b/zisk/host/Cargo.toml
index f9026384..ecc874b0 100644
--- a/zisk/host/Cargo.toml
+++ b/zisk/host/Cargo.toml
@@ -22,7 +22,7 @@ serde_json = "1"
 # Accurate peak RAM via the process-tree sampler (captures the ASM
 # microservices' child-process memory that `/proc/self/status` misses) and the
 # per-phase timing sink feeding the CI drill-down.
-tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "15ae57cfbcb234ff5911a306898e7d93d396d648" }
+tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "bd4faa08a4fa4edb46bde393b4d20c6bd49591d0" }
 tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
 # Throughput formatting (e.g. `42.0 consts/s`).
 human-repr = "1"
diff --git a/zisk/host/src/main.rs b/zisk/host/src/main.rs
index fb68d981..f2d813bc 100644
--- a/zisk/host/src/main.rs
+++ b/zisk/host/src/main.rs
@@ -198,12 +198,14 @@ struct Args {
   #[arg(long, conflicts_with_all = ["shard_plan", "only_shard", "store_dir"])]
   consts_file: Option<PathBuf>,
 
-  /// With --consts: check each subject only, trusting its deps.
-  #[arg(long, requires = "consts")]
+  /// With --consts/--consts-file: check each subject only, trusting its deps.
+  // Validated in main (not clap `requires = "consts"`): names may come from
+  // --consts-file alone, which a clap-level `requires` would wrongly reject.
+  #[arg(long)]
   skip_deps: bool,
 
-  /// Write per-constant results JSON `{ "<name>": { … } }` (accumulated across --consts).
-  #[arg(long, requires = "consts")]
+  /// Write per-constant results JSON `{ "<name>": { … } }` (accumulated across names).
+  #[arg(long)]
   json: Option<PathBuf>,
 
   /// Enable tracing-texray; with --json, per-phase spans are written to <json>.spans.
@@ -962,7 +964,10 @@ async fn run_constant(
           "cycles": cycles,
           "execute-time": (execute_secs * 1e6).round() / 1e6,
           "throughput": tput.round(),
-          "peak-rss": peak_rss_bytes(),
+          // Named for what it measures (the execute phase's RSS high-water),
+          // matching bench-typecheck's execute-peak-rss; bare `peak-rss` is
+          // reserved for prove-phase peaks.
+          "execute-peak-rss": peak_rss_bytes(),
         }),
       )?;
     }
@@ -1699,7 +1704,7 @@ async fn main() -> Result<()> {
   // JSON Lines — the CI drill-down input.
   if args.texray {
     if let Some(json) = args.json.as_ref().and_then(|p| p.to_str()) {
-      let _ = tracing_texray::json_sink::to_file(format!("{json}.spans"));
+      let _ = tracing_texray::json_sink::to_file(&format!("{json}.spans"));
     }
   }
 
@@ -1725,9 +1730,16 @@ async fn main() -> Result<()> {
       "--shard-plan requires exactly one --ixe input (the env the manifest was built for)"
     );
   }
-  // `--consts` selects named constants from one env.
-  if !args.consts.is_empty() && inputs.len() > 1 {
-    bail!("--consts requires exactly one --ixe input");
+  // Named constants (from --consts and/or --consts-file) select from one env.
+  let consts = collect_consts(&args)?;
+  if !consts.is_empty() && inputs.len() > 1 {
+    bail!("--consts/--consts-file requires exactly one --ixe input");
+  }
+  if consts.is_empty() && args.skip_deps {
+    bail!("--skip-deps requires constants via --consts or --consts-file");
+  }
+  if consts.is_empty() && args.json.is_some() {
+    bail!("--json requires constants via --consts or --consts-file");
   }
 
   // ---- Plan every input up front (parse + shard). ----
@@ -1821,7 +1833,6 @@ async fn main() -> Result<()> {
   }
 
   // ---- Named constants (no manifest/range). Loops one leaf per name. ----
-  let consts = collect_consts(&args)?;
   if !consts.is_empty() {
     for name in &consts {
       run_constant(&client, &plans[0], name, &args).await?;
@@ -2151,13 +2162,19 @@ mod cli_tests {
   }
 
   #[test]
-  fn skip_deps_requires_consts() {
-    assert!(parse_err(&["--skip-deps"]).contains("--consts"));
-  }
-
-  #[test]
-  fn json_requires_consts() {
-    assert!(parse_err(&["--json", "out.json"]).contains("--consts"));
+  fn skip_deps_and_json_parse_with_consts_file_only() {
+    // --skip-deps/--json need names, but names may come from --consts-file
+    // alone — clap must accept the parse (main validates after
+    // collect_consts).
+    let a = parse(&[
+      "--consts-file",
+      "names.txt",
+      "--skip-deps",
+      "--json",
+      "out.json",
+    ]);
+    assert!(a.skip_deps);
+    assert_eq!(a.json.as_deref(), Some(std::path::Path::new("out.json")));
   }
 
   #[test]

From 6fc7fd64e27a32afc988b0f85342cc3126e73653 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 2 Jul 2026 21:20:08 -0400
Subject: [PATCH 13/27] ci: convert riscv-bench into a zkVM host build+test
 gate (no proving key)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

bench-main.yml's zkvm-execute job now runs real executions of both hosts
on every main push, but tolerates per-constant failures by design
(dropped rows, OOM sentinels) — it never turns red on a breakage. This
workflow becomes the red-X signal instead, kept cheap:

- Build-only + unit tests: `cargo build --release --bin <host>` plus
  `cargo test --release --bin <host>`, which covers the clap surface
  run.sh drives (--consts comma-splitting, --consts-file union/dedup,
  shard-plan conflicts). No execution, so no fixture compile job, no
  minimal.ixe artifact, no memlock prlimit.
- install-zisk gains a `proving-key` input (default true): the key is
  loaded at runtime by `client.setup()`, never at build time, so the
  build gate skips the ~3 GB download + const-tree regeneration.
- Renamed "RISC-V bench" → "zkVM host build" (it no longer benchmarks).
- The TEMPORARY sb/ci-benchmarks push trigger (from the mid-branch test
  commit) reverts to main-only here; the tip commit is its sole carrier.

The runtime/typecheck smoke signal (executing myReflEq and asserting
failures == 0) is retired: that coverage now lives in zkvm-execute's
real runs, at the cost of being warnings-not-red-X.
---
 .github/actions/install-zisk/action.yml |  19 ++++-
 .github/workflows/riscv-bench.yml       | 104 ++++++++----------------
 2 files changed, 50 insertions(+), 73 deletions(-)

diff --git a/.github/actions/install-zisk/action.yml b/.github/actions/install-zisk/action.yml
index e3b1162b..e0b7fb2e 100644
--- a/.github/actions/install-zisk/action.yml
+++ b/.github/actions/install-zisk/action.yml
@@ -1,10 +1,20 @@
 name: Install Zisk
 description: >-
   Install the system build deps, the ZisK zkVM toolchain (ziskup, CPU build),
-  and the fork-matching proving key needed to build and run the Zisk host.
-  Execute needs the key too — zisk-host's `client.setup()` loads the circuit's
-  const-tree files before either the execute or the prove branch. Assumes a
-  Rust toolchain is already set up.
+  and — unless `proving-key: false` — the fork-matching proving key needed to
+  RUN the Zisk host. Execute needs the key too (zisk-host's `client.setup()`
+  loads the circuit's const-tree files before either the execute or the prove
+  branch), but BUILDING the host does not, so build-only callers skip the
+  ~3 GB download + const-tree regeneration. Assumes a Rust toolchain is
+  already set up.
+
+inputs:
+  proving-key:
+    description: >-
+      Install the fork-matching proving key (required to execute or prove;
+      not needed to build). Set false for build-only jobs.
+    required: false
+    default: "true"
 
 runs:
   using: composite
@@ -55,6 +65,7 @@ runs:
     # ~48 GB. The object name carries the fork rev so a circuit change can't
     # silently reuse a stale key. Public bucket → plain curl, no AWS creds.
     - name: Restore Zisk proving key (fork circuit) from S3
+      if: inputs.proving-key == 'true'
       shell: bash
       run: |
         mkdir -p "$HOME/.zisk"
diff --git a/.github/workflows/riscv-bench.yml b/.github/workflows/riscv-bench.yml
index fd2380a7..862afaa0 100644
--- a/.github/workflows/riscv-bench.yml
+++ b/.github/workflows/riscv-bench.yml
@@ -1,59 +1,33 @@
-name: RISC-V bench
+name: zkVM host build
 
-# zkVM execute is ~10+ min (toolchain installs + host builds + emulation), so it
-# is kept off the per-PR path: this workflow runs only on pushes to main (and on
-# manual dispatch). It compiles the `minimal.ixe` fixture, then executes the
-# kernel typecheck of one constant in the SP1 and Zisk VMs — in parallel jobs.
+# Fast hard gate: do the Zisk/SP1 hosts (and their guest ELFs, via each
+# workspace's build.rs) still compile? Regular CI doesn't build these
+# workspaces (special toolchains), and bench-main.yml's zkvm-execute job —
+# which runs real executions — tolerates per-constant failures by design
+# (dropped rows, OOM sentinels), so it never turns red on a breakage. This
+# workflow is the red-X signal, kept cheap: build-only, no execution, and
+# therefore no Zisk proving key (the key is loaded at runtime by
+# `client.setup()`, never at build time — skipping it saves a ~3 GB download
+# + const-tree regeneration per run).
 on:
   push:
-    branches: [main, sb/ci-benchmarks]   # TEMPORARY: test on this branch
+    branches: main
   workflow_dispatch:
 
 permissions:
   contents: read
 
-# No concurrency group: this workflow only runs on push-to-main and manual
-# dispatch, and every merged commit should be benchmarked — a later merge must
-# never cancel or queue behind an in-flight run.
+# No concurrency group: push-to-main and manual dispatch only — every merged
+# commit gets gated; a later merge must never cancel an in-flight run.
 
 jobs:
-  # Compile a tiny env once (ix is already built here) and hand it to the zkVM
-  # execute jobs via artifact, so those jobs stay Lean-free.
-  compile-fixture:
-    name: Compile zkVM fixture (minimal.ixe)
-    runs-on: warp-ubuntu-latest-x64-16x
-    steps:
-      - uses: actions/checkout@v6
-      - uses: actions-rust-lang/setup-rust-toolchain@v1
-      - uses: leanprover/lean-action@v1
-        with:
-          build-args: "--wfail -v"
-      - name: Compile zkVM test fixture (minimal.ixe)
-        run: lake exe ix compile Tests/MinimalDefs.lean --out minimal.ixe
-      - uses: actions/upload-artifact@v4
-        with:
-          name: minimal-ixe
-          path: minimal.ixe
-          if-no-files-found: error
-
-  # Execute the kernel typecheck of the `minimal.ixe` fixture natively (no Nix,
-  # no proof, no GPU). SP1 and Zisk run as independent jobs so they parallelize;
-  # each installs only its own toolchain via sp1up / ziskup (prebuilt binaries)
-  # and downloads the shared fixture. minimal.ixe carries the full Init closure,
-  # so we scope execution with `--consts myReflEq --skip-deps`: that
-  # subject-only-typechecks just the named constant, trusting its Init
-  # dependencies as Claim assumptions, instead of typechecking all of Init (which
-  # never finishes in the emulator). Each host bails non-zero on any typecheck
-  # failure; we also assert the `failures: 0` line.
-  #
-  # The apt list is the shared superset both backends need: the ZisK book's full
-  # Ubuntu list (its prebuilt cargo-zisk and proofman's C++ link OpenMPI, OpenMP,
-  # GMP, nlohmann-json, nasm, secp256k1, …) plus pkg-config + libssl-dev for
-  # SP1's host crates (openssl/bindgen). The Nix shells provided all this; a bare
-  # runner doesn't. Must precede the toolchain install (it runs cargo-zisk).
-  sp1-execute:
-    name: SP1 zkVM Execute
-    needs: compile-fixture
+  # SP1 and Zisk build as independent jobs so they parallelize; each installs
+  # only its own toolchain via sp1up / ziskup (prebuilt binaries). The apt
+  # list inside the install actions is the shared superset both backends need
+  # (proofman's C++ links OpenMPI/OpenMP/GMP/…; SP1's host crates need
+  # pkg-config + libssl-dev).
+  sp1-build:
+    name: SP1 host build
     runs-on: warp-ubuntu-latest-x64-16x
     steps:
       - uses: actions/checkout@v6
@@ -61,20 +35,19 @@ jobs:
         with:
           cache-workspaces: sp1
       - uses: ./.github/actions/install-sp1
-      - uses: actions/download-artifact@v4
-        with:
-          name: minimal-ixe
       # The precompile-aware SP1 runner-binary is auto-built from the fork git
       # dep by `sp1-core-executor-runner`'s build script — no manual override.
-      - name: SP1 — execute minimal.ixe (assert failures == 0)
+      # `cargo test` reuses the build's dep graph and runs the host's unit
+      # tests — the clap surface run.sh drives (`--consts` comma-splitting,
+      # `--consts-file` union/dedup), so a CLI regression reds this gate too.
+      - name: Build + test sp1-host (guest ELF via build.rs)
         run: |
           cd sp1
-          cargo run --bin sp1-host -- --execute --ixe ../minimal.ixe --consts myReflEq --skip-deps | tee only.txt
-          grep -qE "failures: 0\b" only.txt
+          cargo build --release --bin sp1-host
+          cargo test --release --bin sp1-host
 
-  zisk-execute:
-    name: Zisk zkVM Execute
-    needs: compile-fixture
+  zisk-build:
+    name: Zisk host build
     runs-on: warp-ubuntu-latest-x64-16x
     steps:
       - uses: actions/checkout@v6
@@ -82,20 +55,13 @@ jobs:
         with:
           cache-workspaces: zisk
       - uses: ./.github/actions/install-zisk
-      - uses: actions/download-artifact@v4
         with:
-          name: minimal-ixe
-      - name: Zisk — execute minimal.ixe (assert failures == 0)
+          proving-key: false
+      # Unit tests: the clap surface run.sh drives, plus the closure auditor
+      # (closure_detects_missing_dep self-skips without an IX_TEST_IXE
+      # fixture — this gate has no Lean build to produce one).
+      - name: Build + test zisk-host (guest ELF via build.rs)
         run: |
           cd zisk
-          # ZisK's ASM microservices mmap the ROM with MAP_LOCKED, which needs
-          # unlimited locked memory — the Zisk book's "Critical Memory
-          # Configuration" prescribes DefaultLimitMEMLOCK=infinity. The runner
-          # caps the memlock hard limit (so a bare `ulimit -l unlimited` can't
-          # raise it) and we can't reboot it, so raise the limit in-session as
-          # root via prlimit; the cargo child (and the ASM services it spawns)
-          # inherit it. Without this the services die with
-          # `mmap(rom) errno=11` / "shmem creation ... failed".
-          sudo prlimit --pid $$ --memlock=unlimited:unlimited
-          cargo run --bin zisk-host -- --execute --ixe ../minimal.ixe --consts myReflEq --skip-deps | tee only.txt
-          grep -qE "failures: 0\b" only.txt
+          cargo build --release --bin zisk-host
+          cargo test --release --bin zisk-host

From 12ff29d7eeed0cfade0f91a5349b08e797b5b33e Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 2 Jul 2026 22:04:36 -0400
Subject: [PATCH 14/27] fix(ci): pre-build proofman-starks-lib-c to serialize
 the shared make
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

zisk-host pulls zisk-sdk as both a dependency and a build-dependency, so
cargo compiles proofman-starks-lib-c as two units whose build scripts can
run concurrently — and both run `make` inside the SHARED
~/.cargo/git/checkouts/pil2-proofman-* source tree (not OUT_DIR). On a
cold runner the Makefile stamp is absent, so both units take the
`make clean` + `make -j` path and race: one unit's clean deletes build/
while the other's g++ is mid-compile, dying with "opening dependency
file ….d: No such file or directory" (g++ writes .d files at the END of
compilation, so the dir vanished underneath it). Warm runners never hit
it — the stamp short-circuits the clean — which is why the race only
surfaced on this branch's cold cache (lockfile change → new rust-cache
key).

install-zisk now builds the sys crate solo first: its build script runs
once, `make` completes, the stamp lands; both units of the subsequent
parallel build then skip the clean and their `make -j` is a no-op. Same
total work, just ordered. The proper fix is an flock around the make in
pil2-proofman's build.rs — worth an upstream PR.
---
 .github/actions/install-zisk/action.yml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/.github/actions/install-zisk/action.yml b/.github/actions/install-zisk/action.yml
index e0b7fb2e..604fcec8 100644
--- a/.github/actions/install-zisk/action.yml
+++ b/.github/actions/install-zisk/action.yml
@@ -55,6 +55,22 @@ runs:
         curl -L https://raw.githubusercontent.com/0xPolygonHermez/zisk/main/ziskup/install.sh \
           | bash -s -- --cpu --nokey -y --version 0.18.0 --prefix "$HOME/.zisk"
         echo "$HOME/.zisk/bin" >> "$GITHUB_PATH"
+    # Pre-build the proofman C++ sys crate ALONE so its build script runs
+    # exactly once before any parallel zisk-host build. zisk-host pulls
+    # zisk-sdk as both a dependency and a build-dependency, so cargo compiles
+    # proofman-starks-lib-c as two units whose build scripts can run
+    # CONCURRENTLY — and both run `make` inside the SHARED
+    # ~/.cargo/git/checkouts/pil2-proofman-* source dir. On a cold runner the
+    # Makefile stamp is absent, so both units take the `make clean` +
+    # `make -j` path and race: one unit's clean deletes build/ while the
+    # other's g++ is mid-compile ("opening dependency file ….d: No such file
+    # or directory"). Building the crate solo writes the stamp; the second
+    # unit then skips the clean and its `make -j` is a no-op.
+    # (Proper fix is an flock in pil2-proofman's build.rs — upstream.)
+    - name: Pre-build proofman-starks-lib-c (serialize the shared make)
+      shell: bash
+      run: cargo build --release -p proofman-starks-lib-c
+      working-directory: zisk
     # Execute still needs a proving key present: zisk-host calls `client.setup()`
     # (which the SDK runs before the execute branch), and that loads the circuit's
     # const-tree files. We host the fork-matching key in a public S3 bucket

From c298bc7fd6ef43b92c97e2ea08a1e9a9d39645be Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 2 Jul 2026 22:09:10 -0400
Subject: [PATCH 15/27] fix(ci): resolve run.sh tool binaries lazily at their
 use sites
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`resolve_bin ix` ran unconditionally at startup, but bench-main's
zkvm-execute job legitimately has no `ix` anywhere: it restores only the
`.ixe` cache (REUSE_IXE=1 skips the compile step) and builds its zkVM
host via cargo — before the resolve_bin refactor that job never touched
`ix` at all, so the eager check regressed it with "ix not found
(in-tree or PATH)". Resolve per use site instead: the compile step and
the ooc branch resolve `ix`, the aiur branch resolves `bench-typecheck`,
and the zkVM branch needs neither.
---
 .github/scripts/run.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
index 0191128b..e4016a39 100644
--- a/.github/scripts/run.sh
+++ b/.github/scripts/run.sh
@@ -104,14 +104,15 @@ esac
 
 # Tool resolution: prefer the in-tree build (so base measures base's code, PR
 # the PR's), fall back to PATH — CI restores cached binaries onto PATH instead
-# of building in-tree.
+# of building in-tree. Resolved LAZILY at each use site: the zkVM branch needs
+# neither `ix` nor `bench-typecheck` when REUSE_IXE short-circuits the compile
+# (bench-main's zkvm-execute job restores only the `.ixe` cache, no binaries).
 resolve_bin() {  # <name> → prints the path, or fails
   local name="$1" in_tree="$repo/.lake/build/bin/$1"
   if [ -x "$in_tree" ]; then printf '%s' "$in_tree"
   else command -v "$name" || { echo "::error::$name not found (in-tree or PATH)" >&2; return 2; }
   fi
 }
-ix_bin=$(resolve_bin ix) || exit 2
 
 tmp=$(mktemp -d)
 compile_log="$tmp/compile.log"
@@ -121,6 +122,7 @@ ixe="$repo/$benv.ixe"
 if [ "${REUSE_IXE:-0}" = 1 ] && [ "$backend" != compile ] && [ -f "$ixe" ]; then
   echo "reusing existing $ixe (REUSE_IXE)" >&2
 else
+  ix_bin=$(resolve_bin ix) || exit 2
   echo "::group::ix compile $module → $benv.ixe ($backend/$mode)"
   "$ix_bin" compile "$repo/Benchmarks/Compile/$module.lean" \
     --out "$ixe" 2>&1 | tee "$compile_log"
@@ -261,6 +263,7 @@ case "$backend" in
     # dependency closure in anon mode — the same mode and scope as the zkVM
     # execute above, so the delta isolates in-circuit vs out-of-circuit
     # overhead rather than mixing in closure-size or metadata effects.
+    ix_bin=$(resolve_bin ix) || exit 2
     ooc_one() {  # <label> <ix-check-args…>  → prints one JSON object
       local label="$1"; shift
       local log="$tmp/n.out"

From 1b999bb1f094b5c436244af4abbbf4032a5922fb Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 2 Jul 2026 23:00:48 -0400
Subject: [PATCH 16/27] fix(ci): classify alloc-abort proves as OOM, sweep Zisk
 shm, fix vector names
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three fixes for the failing bench-main runs:

- run.sh: a heavy prove can die of memory without tripping the watchdog or
  the kernel OOM killer — one huge trace allocation fails and the runtime
  aborts (SIGABRT, exit 134) with an allocator message. looks_like_oom()
  folds that third case into the OOM classification so those constants get
  an OOM row instead of being dropped. Non-OOM failures now log the first
  lines of the tool output so drops are diagnosable from the job log.

- run.sh: after a watchdog group-kill, the Zisk host never runs its
  Drop-time cleanup of /dev/shm/ZISK_* segments (multi-GB; the MT output
  segment alone starts at 6 GB), so the next host launch fails creating its
  own segments before Zisk's startup stale-pid sweep runs — exactly one
  dropped constant after every OOM kill (the alternating OOM/exit-1 pattern
  in the zisk-execute logs). Sweep the debris between constants.

- Vectors.csv: IxVMPrim.nat_pow_big removed (kernel primitive, never
  present in compiled envs); Vector.extract_append._proof_1 →
  ._proof_2 and Array.extract_append._proof_1_1 → Array.extract_append
  (proof-term names are toolchain-dependent; both replacements verified
  against a freshly compiled initStd.ixe and measured heavy:
  fft-cost 23.4e9 / 40.1e9).
---
 .github/scripts/run.sh | 41 +++++++++++++++++++++++++++++++++++------
 Benchmarks/Vectors.csv |  5 ++---
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
index e4016a39..5b0ca7cd 100644
--- a/.github/scripts/run.sh
+++ b/.github/scripts/run.sh
@@ -88,6 +88,22 @@ mark_oom() {  # <results.json> <name>
   fi
 }
 
+# A prove can die of memory three ways: the watchdog's group-kill (marker
+# file), the kernel OOM killer (SIGKILL → exit 137, no marker), or an
+# ALLOCATION-FAILURE ABORT — one huge trace allocation fails while total RSS
+# is still under the watchdog ceiling, and the Rust/Lean runtime aborts
+# (SIGABRT → exit 134) with an allocator message in the log. All three are
+# OOM for the benchmark table.
+looks_like_oom() {  # <exit> <marker> <log>
+  local code="$1" marker="$2" log="$3"
+  [ -f "$marker" ] && return 0
+  [ "$code" -eq 137 ] && return 0
+  [ "$code" -eq 134 ] && grep -qiE \
+    'memory allocation of .* failed|std::bad_alloc|out of memory|(unable|failed) to allocate' \
+    "$log" 2>/dev/null && return 0
+  return 1
+}
+
 # `$benv` is used verbatim for the `.ixe` filename (bench-pr compiles `initStd.ixe`;
 # the bencher jobs reuse the compile job's cached `InitStd.ixe`), and lowercased
 # only to pick the Compile module. `$benv_cc` is the CamelCase form — the
@@ -168,13 +184,12 @@ case "$backend" in
         wait "$bt_pid" 2>/dev/null; bt_exit=$?
         kill "$w_pid" 2>/dev/null || true
         wait "$w_pid" 2>/dev/null || true
-        # Exit 137 (SIGKILL) without our marker = the kernel OOM killer beat
-        # the 3 s sampling window — still an OOM, label it as one.
-        if [ -f "$oom" ] || [ "$bt_exit" -eq 137 ]; then
-          echo "::warning::aiur prove '$c' OOM-killed (marker=$([ -f "$oom" ] && echo watchdog || echo kernel), ceiling ${ceiling_gb} GB)" >&2
+        if looks_like_oom "$bt_exit" "$oom" "$tmp/$slug.log"; then
+          echo "::warning::aiur prove '$c' OOM (exit $bt_exit, marker=$([ -f "$oom" ] && echo watchdog || echo runtime), ceiling ${ceiling_gb} GB)" >&2
           mark_oom "$res" "$c"
         elif [ "$bt_exit" -ne 0 ]; then
           echo "::warning::aiur prove '$c' failed (exit $bt_exit); dropping" >&2
+          sed -n '1,5p' "$tmp/$slug.log" >&2 || true
           continue
         fi
       fi
@@ -209,6 +224,18 @@ case "$backend" in
     # in-session as root; the host children inherit it. Without this the ASM
     # services die with `mmap(rom) errno=11`. SP1 needs no such raise.
     [ "$backend" = zisk ] && sudo prlimit --pid $$ --memlock=unlimited:unlimited
+    # A group-killed Zisk run skips the host's Drop-time cleanup of its
+    # /dev/shm/ZISK_* segments and semaphores (multi-GB — the MT output segment
+    # alone starts at 6 GB), so the NEXT host launch fails creating its own
+    # segments (tmpfs / MAP_LOCKED exhaustion) before Zisk's startup stale-pid
+    # sweep can save it — one dropped constant per watchdog kill. Sweep the
+    # dead run's debris ourselves; nothing zisk-related is alive at call time.
+    zisk_shm_sweep() {
+      [ "$backend" = zisk ] || return 0
+      pkill -KILL -f -- '--shm_prefix ZISK_' 2>/dev/null
+      rm -f /dev/shm/ZISK_* /dev/shm/sem.ZISK_* 2>/dev/null
+      return 0
+    }
     ceiling_gb=${ZKVM_EXECUTE_MAX_RSS_GB:-120}
     rows="$tmp/rows"; mkdir -p "$rows"
     while IFS= read -r c; do
@@ -235,11 +262,13 @@ case "$backend" in
       wait "$zk_pid" 2>/dev/null; zk_exit=$?
       kill "$w_pid" 2>/dev/null || true
       wait "$w_pid" 2>/dev/null || true
-      if [ -f "$oom" ] || [ "$zk_exit" -eq 137 ]; then
-        echo "::warning::$backend execute '$c' OOM-killed (marker=$([ -f "$oom" ] && echo watchdog || echo kernel), ceiling ${ceiling_gb} GB)" >&2
+      [ "$zk_exit" -ne 0 ] && zisk_shm_sweep
+      if looks_like_oom "$zk_exit" "$oom" "$log"; then
+        echo "::warning::$backend execute '$c' OOM (exit $zk_exit, marker=$([ -f "$oom" ] && echo watchdog || echo runtime), ceiling ${ceiling_gb} GB)" >&2
         mark_oom "$res" "$c"
       elif [ "$zk_exit" -ne 0 ]; then
         echo "::warning::$backend execute '$c' failed/timed out (exit $zk_exit); dropping" >&2
+        sed -n '1,5p' "$log" >&2 || true
         continue
       fi
       # The host writes $res only on a clean (zero-failure) run. `$out` is
diff --git a/Benchmarks/Vectors.csv b/Benchmarks/Vectors.csv
index e1b2bb45..7a5d772b 100644
--- a/Benchmarks/Vectors.csv
+++ b/Benchmarks/Vectors.csv
@@ -69,7 +69,7 @@ Array.binSearch,initStd,heavy,1
 Array.qsortOrd,initStd,heavy
 String.split,initStd,heavy,0,1
 Std.Time.Week.Offset.ofMilliseconds,initStd,heavy
-Vector.extract_append._proof_1,initStd,heavy,1,1
+Vector.extract_append._proof_2,initStd,heavy,1,1
 ByteArray.utf8DecodeChar?_utf8EncodeChar_append,initStd,heavy,0,1
 String.append,initStd,cheap,0,1
 _private.Init.Data.Range.Polymorphic.SInt.0.Int8.instRxcHasSize_eq,initStd,heavy,0,1
@@ -77,8 +77,7 @@ _private.Init.Data.Range.Polymorphic.SInt.0.Int16.instRxcHasSize_eq,initStd,heav
 _private.Init.Data.Range.Polymorphic.SInt.0.Int32.instRxcHasSize_eq,initStd,heavy,0,1
 _private.Init.Data.Range.Polymorphic.SInt.0.Int64.instRxcHasSize_eq,initStd,heavy,0,1
 Char.ofOrdinal_le_of_le,initStd,heavy,0,1
-Array.extract_append._proof_1_1,initStd,heavy,0,1
-IxVMPrim.nat_pow_big,initStd,heavy,0,1
+Array.extract_append,initStd,heavy,0,1
 Std.Tactic.BVDecide.BVExpr.bitblast.goCache_Inv_of_Inv._mutual,initStd,heavy,0,1
 Lean.Expr.replace,lean,cheap
 List.Sorted,mathlib,cheap

From 79e7bb66c252ea15f065bcd20b7a0901c87dc9c0 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Thu, 2 Jul 2026 23:08:08 -0400
Subject: [PATCH 17/27] ci: move the zkVM host build gate into ci.yml; comment
 out the sp1 benchmarks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

riscv-bench.yml's two jobs (build + unit-test the Zisk/SP1 hosts, no
execution, no proving key) are cheap enough (~5 min with warm caches) to
gate every PR commit, and ci.yml already has exactly the triggers and
per-ref cancellation they need — so they move there and the standalone
workflow goes away.

sp1 benchmarks are commented out for now — the execute run is too slow:
the sp1 cell of bench-main's zkvm-execute matrix and bench-pr's Install
SP1 step. Uncomment both to re-enable. The !benchmark backend-subset
selection is unchanged.
---
 .github/workflows/bench-main.yml  |  4 +-
 .github/workflows/bench-pr.yml    | 10 +++--
 .github/workflows/ci.yml          | 53 ++++++++++++++++++++++++
 .github/workflows/riscv-bench.yml | 67 -------------------------------
 docs/benchmarking.md              |  5 +++
 5 files changed, 68 insertions(+), 71 deletions(-)
 delete mode 100644 .github/workflows/riscv-bench.yml

diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index 29b15db3..d93a2f82 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -276,7 +276,9 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        backend: [zisk, sp1]
+        # backend: [zisk, sp1] — sp1 disabled for now: its execute run is too
+        # slow for a per-push job. Re-add to the list to re-enable.
+        backend: [zisk]
         bench: [InitStd, Mathlib]
     steps:
       - uses: actions/checkout@v6
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index e8908086..e34830cf 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -2,6 +2,8 @@
 # through chosen prover backend(s) and post a main-vs-PR comparison table.
 #
 #   !benchmark ([aiur] [zisk] [sp1] [ooc] [compile] | all) [execute]
+#     (sp1 is temporarily disabled: its install step below is commented out,
+#      so an sp1 cell fails at the install-less execute step)
 #   BENCH_ENVS=initStd,mathlib     # which compiled envs (default initStd)
 #   BENCH_FULL=1                   # run the full curated set, not just primary
 #   BENCH_TIER=cheap|heavy|all     # tier override (default: all)
@@ -210,9 +212,11 @@ jobs:
         uses: actions-rust-lang/setup-rust-toolchain@v1
         with:
           cache-workspaces: ${{ matrix.cell.backend }}
-      - name: Install SP1
-        if: matrix.cell.backend == 'sp1'
-        uses: ./.github/actions/install-sp1
+      # sp1 is temporarily disabled (execute too slow for CI); uncomment to
+      # re-enable, along with bench-main.yml's sp1 matrix cell.
+      # - name: Install SP1
+      #   if: matrix.cell.backend == 'sp1'
+      #   uses: ./.github/actions/install-sp1
       - name: Install Zisk
         if: matrix.cell.backend == 'zisk'
         uses: ./.github/actions/install-zisk
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 68f5d451..4d368c39 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -62,3 +62,56 @@ jobs:
         uses: EmbarkStudios/cargo-deny-action@v2
         with:
           rust-version: ${{ env.RUST_VERSION }}
+
+  # zkVM host build gate: do the Zisk/SP1 hosts (and their guest ELFs, via
+  # each workspace's build.rs) still compile? rust-test doesn't build these
+  # workspaces (special toolchains), and bench-main.yml's zkvm-execute job —
+  # which runs real executions — tolerates per-constant failures by design
+  # (dropped rows, OOM sentinels), so it never turns red on a breakage. These
+  # jobs are the red-X signal, kept cheap: build-only, no execution, and
+  # therefore no Zisk proving key (the key is loaded at runtime by
+  # `client.setup()`, never at build time — skipping it saves a ~3 GB download
+  # + const-tree regeneration per run). SP1 and Zisk build as independent jobs
+  # so they parallelize; each installs only its own toolchain via sp1up /
+  # ziskup (prebuilt binaries). The apt list inside the install actions is the
+  # shared superset both backends need (proofman's C++ links
+  # OpenMPI/OpenMP/GMP/…; SP1's host crates need pkg-config + libssl-dev).
+  sp1-build:
+    name: SP1 host build
+    runs-on: warp-ubuntu-latest-x64-16x
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          cache-workspaces: sp1
+      - uses: ./.github/actions/install-sp1
+      # The precompile-aware SP1 runner-binary is auto-built from the fork git
+      # dep by `sp1-core-executor-runner`'s build script — no manual override.
+      # `cargo test` reuses the build's dep graph and runs the host's unit
+      # tests — the clap surface run.sh drives (`--consts` comma-splitting,
+      # `--consts-file` union/dedup), so a CLI regression reds this gate too.
+      - name: Build + test sp1-host (guest ELF via build.rs)
+        run: |
+          cd sp1
+          cargo build --release --bin sp1-host
+          cargo test --release --bin sp1-host
+
+  zisk-build:
+    name: Zisk host build
+    runs-on: warp-ubuntu-latest-x64-16x
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          cache-workspaces: zisk
+      - uses: ./.github/actions/install-zisk
+        with:
+          proving-key: false
+      # Unit tests: the clap surface run.sh drives, plus the closure auditor
+      # (closure_detects_missing_dep self-skips without an IX_TEST_IXE
+      # fixture — this gate has no Lean build to produce one).
+      - name: Build + test zisk-host (guest ELF via build.rs)
+        run: |
+          cd zisk
+          cargo build --release --bin zisk-host
+          cargo test --release --bin zisk-host
diff --git a/.github/workflows/riscv-bench.yml b/.github/workflows/riscv-bench.yml
deleted file mode 100644
index 862afaa0..00000000
--- a/.github/workflows/riscv-bench.yml
+++ /dev/null
@@ -1,67 +0,0 @@
-name: zkVM host build
-
-# Fast hard gate: do the Zisk/SP1 hosts (and their guest ELFs, via each
-# workspace's build.rs) still compile? Regular CI doesn't build these
-# workspaces (special toolchains), and bench-main.yml's zkvm-execute job —
-# which runs real executions — tolerates per-constant failures by design
-# (dropped rows, OOM sentinels), so it never turns red on a breakage. This
-# workflow is the red-X signal, kept cheap: build-only, no execution, and
-# therefore no Zisk proving key (the key is loaded at runtime by
-# `client.setup()`, never at build time — skipping it saves a ~3 GB download
-# + const-tree regeneration per run).
-on:
-  push:
-    branches: main
-  workflow_dispatch:
-
-permissions:
-  contents: read
-
-# No concurrency group: push-to-main and manual dispatch only — every merged
-# commit gets gated; a later merge must never cancel an in-flight run.
-
-jobs:
-  # SP1 and Zisk build as independent jobs so they parallelize; each installs
-  # only its own toolchain via sp1up / ziskup (prebuilt binaries). The apt
-  # list inside the install actions is the shared superset both backends need
-  # (proofman's C++ links OpenMPI/OpenMP/GMP/…; SP1's host crates need
-  # pkg-config + libssl-dev).
-  sp1-build:
-    name: SP1 host build
-    runs-on: warp-ubuntu-latest-x64-16x
-    steps:
-      - uses: actions/checkout@v6
-      - uses: actions-rust-lang/setup-rust-toolchain@v1
-        with:
-          cache-workspaces: sp1
-      - uses: ./.github/actions/install-sp1
-      # The precompile-aware SP1 runner-binary is auto-built from the fork git
-      # dep by `sp1-core-executor-runner`'s build script — no manual override.
-      # `cargo test` reuses the build's dep graph and runs the host's unit
-      # tests — the clap surface run.sh drives (`--consts` comma-splitting,
-      # `--consts-file` union/dedup), so a CLI regression reds this gate too.
-      - name: Build + test sp1-host (guest ELF via build.rs)
-        run: |
-          cd sp1
-          cargo build --release --bin sp1-host
-          cargo test --release --bin sp1-host
-
-  zisk-build:
-    name: Zisk host build
-    runs-on: warp-ubuntu-latest-x64-16x
-    steps:
-      - uses: actions/checkout@v6
-      - uses: actions-rust-lang/setup-rust-toolchain@v1
-        with:
-          cache-workspaces: zisk
-      - uses: ./.github/actions/install-zisk
-        with:
-          proving-key: false
-      # Unit tests: the clap surface run.sh drives, plus the closure auditor
-      # (closure_detects_missing_dep self-skips without an IX_TEST_IXE
-      # fixture — this gate has no Lean build to produce one).
-      - name: Build + test zisk-host (guest ELF via build.rs)
-        run: |
-          cd zisk
-          cargo build --release --bin zisk-host
-          cargo test --release --bin zisk-host
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index b0ee7d52..ccb567cc 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -124,6 +124,11 @@ own testbed + **workload** (`aiur-check`, `zisk-check`, `sp1-check`,
 `<backend>-check-x64-32x`; the compile job uses `ix-compile-x64-32x`. Every
 bench job runs on the same runner (`warp-ubuntu-latest-x64-32x`).
 
+sp1 benchmarks are temporarily disabled (its execute run is too slow for
+CI); the host still builds + unit-tests on every PR via ci.yml. To
+re-enable, uncomment sp1 in two places: the zkvm-execute matrix cell in
+`bench-main.yml` and the Install SP1 step in `bench-pr.yml`.
+
 Threshold semantics per measure kind:
 - **`constants`** — pinned exactly (0/0). A definitional count; either
   direction is worth flagging (someone added/removed a def).

From 95a8c254b8cc0167e6aebc78e06f6bf8024f6e68 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Fri, 3 Jul 2026 10:13:36 -0400
Subject: [PATCH 18/27] fix(aiur): populate trace rows for
 UnconstrainedBigUintDivMod
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The op (added with the native execution pipeline, #463) pushes two values
onto the value map during execution — the quotient/remainder list-head
pointers — and the constraint generator allocates two auxiliary columns
for them, but the trace populator treated it as a no-op. Every ValIdx and
witness column after the first big-Nat division in a block was therefore
off by two, and trace population panicked with "index out of bounds" at
trace.rs:308 — the exit-134 failures on every big-Nat-heavy prove in the
bench-main aiur job (List.mergeSort, String.split, the SInt
instRxcHasSize_eq family, Multiset.sort, …). Reproducible in seconds
with any Nat-division user, e.g. `bench-typecheck --consts Nat.repr`.

The trace arm now mirrors the execute arm: recompute (q, r) with
num_bigint and resolve the head pointers that execution already recorded
in memory[10] via a read-only twin of build_klimbs_u64, pushing both as
map entries + auxiliary columns.

Verified: Nat.repr proves end-to-end (panicked before);
`lake test -- --ignored aiur ixvm` passes; clippy clean.
---
 crates/aiur/src/execute.rs | 55 ++++++++++++++++++++++++++++++++++++++
 crates/aiur/src/trace.rs   | 22 +++++++++++++--
 2 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/crates/aiur/src/execute.rs b/crates/aiur/src/execute.rs
index 21f24f23..561f2ee3 100644
--- a/crates/aiur/src/execute.rs
+++ b/crates/aiur/src/execute.rs
@@ -846,6 +846,61 @@ pub fn unconstrained_big_uint_div_mod_helper(
   Ok((q_ptr, r_ptr))
 }
 
+/// Read-only twin of `unconstrained_big_uint_div_mod_helper` for trace
+/// population: recompute `(q, r)` and resolve the list-head pointers the
+/// execution already recorded in `memory[10]` — every node was built there
+/// during execution, so each key must be present.
+pub fn find_unconstrained_big_uint_div_mod(
+  a_ptr: G,
+  b_ptr: G,
+  memory: &FxIndexMap<usize, QueryMap>,
+) -> Result<(G, G), String> {
+  let a_limbs = read_klimbs_u64(memory, a_ptr)?;
+  let b_limbs = read_klimbs_u64(memory, b_ptr)?;
+  let a_big = klimbs_u64_to_biguint(&a_limbs);
+  let b_big = klimbs_u64_to_biguint(&b_limbs);
+  let (q_big, r_big) = if b_big == num_bigint::BigUint::ZERO {
+    (num_bigint::BigUint::ZERO, a_big.clone())
+  } else {
+    (&a_big / &b_big, &a_big % &b_big)
+  };
+  let q_ptr = find_klimbs_u64(memory, &biguint_to_klimbs_u64(&q_big))?;
+  let r_ptr = find_klimbs_u64(memory, &biguint_to_klimbs_u64(&r_big))?;
+  Ok((q_ptr, r_ptr))
+}
+
+/// Read-only twin of `build_klimbs_u64`: resolve the pointer of each
+/// (already-recorded) list node without inserting.
+fn find_klimbs_u64(
+  memory: &FxIndexMap<usize, QueryMap>,
+  limbs: &[u64],
+) -> Result<G, String> {
+  let queries = memory.get(&10).ok_or_else(|| {
+    "memory[10] channel not registered (no List<U64> in program?)".to_string()
+  })?;
+  let nil_key: Vec<G> =
+    std::iter::once(G::ONE).chain((0..9).map(|_| G::ZERO)).collect();
+  let mut tail_ptr = queries
+    .get(&nil_key)
+    .ok_or_else(|| "List<U64> Nil node not recorded".to_string())?
+    .output[0];
+  for limb in limbs.iter().rev() {
+    let mut key: Vec<G> = Vec::with_capacity(10);
+    key.push(G::ZERO); // Cons tag (first variant of ListNode‹U64›)
+    for b in &limb.to_le_bytes() {
+      key.push(G::from_u8(*b));
+    }
+    key.push(tail_ptr);
+    tail_ptr = queries
+      .get(&key)
+      .ok_or_else(|| {
+        format!("List<U64> Cons node for limb {limb} not recorded")
+      })?
+      .output[0];
+  }
+  Ok(tail_ptr)
+}
+
 /// Walk a `List<U64>` chain from `head_ptr` in `memory[10]`, returning the
 /// u64 limbs in head-first order. Each memory[10] entry is the standard Aiur
 /// tagged-enum layout: `[tag, byte0..byte7, next_ptr]`. `tag == 0` = Nil
diff --git a/crates/aiur/src/trace.rs b/crates/aiur/src/trace.rs
index e696e688..5b45d06c 100644
--- a/crates/aiur/src/trace.rs
+++ b/crates/aiur/src/trace.rs
@@ -13,7 +13,9 @@ use rayon::{
 use crate::{
   FxIndexMap, G,
   bytecode::{Block, Ctrl, Function, Op, Toplevel},
-  execute::{IOBuffer, IOKeyInfo, QueryRecord},
+  execute::{
+    IOBuffer, IOKeyInfo, QueryRecord, find_unconstrained_big_uint_div_mod,
+  },
   function_channel,
   gadgets::{bytes1::Bytes1, bytes2::Bytes2},
   memory::Memory,
@@ -558,10 +560,26 @@ impl Op {
           ),
         );
       },
+      Op::UnconstrainedBigUintDivMod(a, b) => {
+        // Mirrors the execute arm and the two auxiliary columns the
+        // constraints allocate: recompute `(q, r)` and resolve the head
+        // pointers execution recorded in memory[10]. Skipping the two map
+        // pushes would shift every later `ValIdx` (and witness column) in
+        // the block.
+        let (q_ptr, r_ptr) = find_unconstrained_big_uint_div_mod(
+          map[*a].0,
+          map[*b].0,
+          &context.query_record.memory_queries,
+        )
+        .expect("BigUint div-mod result not recorded");
+        for f in [q_ptr, r_ptr] {
+          map.push((f, 1));
+          slice.push_auxiliary(index, f);
+        }
+      },
       Op::AssertEq(..)
       | Op::IOSetInfo(..)
       | Op::IOWrite(..)
-      | Op::UnconstrainedBigUintDivMod(..)
       | Op::Debug(..) => {},
     }
   }

From 3880e137970210c93aa16cd6303946bebe335de7 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Fri, 3 Jul 2026 12:16:37 -0400
Subject: [PATCH 19/27] feat(bench): track Aiur proof-size and verify-time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prover changes can trade prove speed against proof size or verification
cost, so Phase 2 now serializes each fresh proof (proof-size, bytes) and
verifies it (verify-time; a verification failure is reported loudly and
drops only that measure). The full-closure path rebuilds the Array-G
claim from proveAddrWithEnv's serialized claim bytes — verify_claim's
input is the 32-G blake3 digest of those bytes, the same recipe as
`ix verify`.

Both measures flow through the neutral JSON into the PR compare table
(bench.py aiur prove metrics + byte/second unit mapping) and onto
bencher (verify-time 10% upper, proof-size 5% upper).

Measured: Nat.add_comm proves in 1.63s, verifies in 0.19s, 33.4 MB proof.
---
 .github/scripts/bench.py         |  5 ++-
 .github/workflows/bench-main.yml |  6 +++
 Benchmarks/Typecheck.lean        | 71 ++++++++++++++++++++++----------
 docs/benchmarking.md             |  6 +--
 4 files changed, 63 insertions(+), 25 deletions(-)

diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index c3aecdf6..895e8925 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -43,7 +43,8 @@
         "default_mode": "prove",
         "testbed": "aiur-check-x64-32x",
         "metrics": {
-            "prove":   ["fft-cost", "execute-time", "prove-time", "peak-rss"],
+            "prove":   ["fft-cost", "execute-time", "prove-time", "verify-time",
+                        "proof-size", "peak-rss"],
             "execute": ["fft-cost", "execute-time", "execute-peak-rss"],
         },
     },
@@ -216,9 +217,11 @@ def _num(d, name, metric):
     "peak-rss": "bytes",
     "execute-peak-rss": "bytes",
     "file-size": "bytes",
+    "proof-size": "bytes",
     # seconds
     "execute-time": "seconds",
     "prove-time": "seconds",
+    "verify-time": "seconds",
     "check-time": "seconds",
     "compile-time": "seconds",
     # large counts (10^6+ typical)
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index d93a2f82..39846bd0 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -251,6 +251,12 @@ jobs:
             --threshold-measure prove-time --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
             --threshold-lower-boundary _
+            --threshold-measure verify-time --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
+            --threshold-measure proof-size --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.05
+            --threshold-lower-boundary _
             --threshold-measure execute-time --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
             --threshold-lower-boundary _
diff --git a/Benchmarks/Typecheck.lean b/Benchmarks/Typecheck.lean
index d9bd11ca..d9e16fa9 100644
--- a/Benchmarks/Typecheck.lean
+++ b/Benchmarks/Typecheck.lean
@@ -56,8 +56,11 @@ transitive typecheck) in two phases:
    (Σ width·height·log2(height) over circuits — the proving-cost proxy), and
    `execute-time`.
 2. **Prove** (cheap→expensive by measured fft-cost): the end-to-end STARK prove,
-   recording `prove-time`. With texray on, each prove emits a per-span timeline
-   (`aiur/execute`, `aiur/witness`, `stark/...`) with RAM Δ/peak to stderr.
+   recording `prove-time`, the serialized `proof-size` (bytes), and
+   `verify-time` (verifying the fresh proof) — prover changes can trade speed
+   against proof size or verification cost, so all three are tracked. With
+   texray on, each prove emits a per-span timeline (`aiur/execute`,
+   `aiur/witness`, `stark/...`) with RAM Δ/peak to stderr.
 
 When `--json` is set the file is rewritten after every prove, so an external
 `timeout` still leaves a complete file of the results collected so far (cheapest
@@ -66,12 +69,13 @@ warning, so a single bad name never fails the run. The harness imposes no time
 limit; bound a run with an external `timeout` if needed.
 
 The JSON is a neutral, flat shape (`{ "<name>": { "constants": …, "fft-cost": …,
-"execute-time": …, "execute-peak-rss": …, "prove-time": …, "peak-rss": …,
-"throughput": … } }`). `execute-peak-rss` is the Phase-1 RSS high-water,
-sampled before proving starts, so it is comparable across execute-only and
-prove runs; `prove-time`, `peak-rss` (the prover's high-water), and
-`throughput` appear only for proven constants. Any bencher-specific reshaping
-is the caller's job (see `.github/workflows/bench-main.yml`).
+"execute-time": …, "execute-peak-rss": …, "prove-time": …, "proof-size": …,
+"verify-time": …, "peak-rss": …, "throughput": … } }`). `execute-peak-rss` is
+the Phase-1 RSS high-water, sampled before proving starts, so it is comparable
+across execute-only and prove runs; `prove-time`, `proof-size`, `verify-time`,
+`peak-rss` (the prover's high-water), and `throughput` appear only for proven
+constants. Any bencher-specific reshaping is the caller's job (see
+`.github/workflows/bench-main.yml`).
 -/
 
 open Lean (Json Name)
@@ -99,6 +103,12 @@ structure Result where
   fftCost : Float
   executeSec : Float
   proveSec : Option Float := none
+  /-- Serialized proof size in bytes (`Aiur.Proof.toBytes`). Tracked because
+      prover changes can trade speed against proof size. -/
+  proofSize : Option Nat := none
+  /-- Wall time of `AiurSystem.verify` over the fresh proof — the other side
+      of the same trade-off. `none` if verification failed (reported loudly). -/
+  verifySec : Option Float := none
   /-- Peak resident-set size in bytes (tracing-texray tree sampler), captured
       after the constant's heaviest phase. -/
   peakRss : Option Nat := none
@@ -140,6 +150,12 @@ def Result.toJsonEntry (r : Result) : String × Json :=
     | some p => base ++ [ ("prove-time", jsonRound 6 p)
                         , ("throughput", jsonRound 2 (r.constants.toFloat / p)) ]
     | none => base
+  let fields := match r.proofSize with
+    | some n => fields ++ [ ("proof-size", Lean.toJson n) ]
+    | none => fields
+  let fields := match r.verifySec with
+    | some v => fields ++ [ ("verify-time", jsonRound 6 v) ]
+    | none => fields
   (r.name, Json.mkObj fields)
 
 /-- Time a thunk, returning its value and the elapsed seconds. The result is
@@ -284,21 +300,34 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
         else
           match aiurSystem.proveAddrWithEnv friParameters funIdx envHandle addr.hash with
           | .error e => .error e
-          | .ok (_claimBytes, proof, ioBuf) =>
-            -- The shared envHandle path doesn't return an `Array G`
-            -- claim — adapt to the existing benchmark return shape
-            -- by recomputing the claim digest from the witness's
-            -- input (Phase 2 doesn't read it).
-            .ok (#[], proof, ioBuf)
+          | .ok (claimBytes, proof, ioBuf) =>
+            -- The envHandle path returns the SERIALIZED `Ix.Claim`; rebuild
+            -- the Array-G claim `verify` takes — `verify_claim`'s input is
+            -- the 32-G blake3 digest of those bytes (same recipe as
+            -- `ix verify`).
+            let digest := Address.blake3 claimBytes
+            let claim :=
+              Aiur.buildClaim funIdx (digest.hash.data.map .ofUInt8) #[]
+            .ok (claim, proof, ioBuf)
       match (proveRes : Except String (Array Aiur.G × Aiur.Proof × Aiur.IOBuffer)) with
       | .error e => IO.eprintln s!"  prove {r.name} failed: {e}"; continue
-      | .ok _ => pure ()
-      spent := spent + proveSec
-      let peak ← TracingTexray.peakTreeRssBytes
-      IO.println s!"  {r.name}: prove={proveSec}s (cumulative {spent}s)"
-      ordered := ordered.set! i
-        ({ r with proveSec := some proveSec, peakRss := some peak }, addr)
-      writeJson (ordered.map (·.1))
+      | .ok (claim, proof, _ioBuf) =>
+        spent := spent + proveSec
+        let peak ← TracingTexray.peakTreeRssBytes
+        let proofSize := (Aiur.Proof.toBytes proof).size
+        let (verifyRes, verifySec) ← timed fun _ =>
+          aiurSystem.verify friParameters claim proof
+        let verifySec? ← match verifyRes with
+          | .ok () => pure (some verifySec)
+          | .error e =>
+            IO.eprintln s!"  verify {r.name} FAILED: {e}"
+            pure none
+        IO.println s!"  {r.name}: prove={proveSec}s verify={verifySec}s \
+          proof={proofSize} bytes (cumulative {spent}s)"
+        ordered := ordered.set! i
+          ({ r with proveSec := some proveSec, peakRss := some peak
+                  , proofSize := some proofSize, verifySec := verifySec? }, addr)
+        writeJson (ordered.map (·.1))
     catch e =>
       IO.eprintln s!"  prove {r.name} threw: {e}"
 
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index ccb567cc..08382324 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -18,7 +18,7 @@ the same backend drivers:
 
 | backend | what it measures | metrics |
 |---|---|---|
-| `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove) | `fft-cost`, `execute-time`, `execute-peak-rss`, `prove-time`, `peak-rss` |
+| `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove; each fresh proof is also verified) | `fft-cost`, `execute-time`, `execute-peak-rss`, `prove-time`, `verify-time`, `proof-size`, `peak-rss` |
 | `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `execute-peak-rss` |
 | `ooc` | the same kernel run **out-of-circuit and in parallel** (`ix check-rs`) — far faster | `throughput`, `check-time`, `peak-rss` |
 | `compile` | `ix compile <env>.lean → <env>.ixe` on the current PR — measures the compile step itself, keyed by CamelCase env slug (`InitStd`, `Lean`, `Mathlib`, `FLT`) | `compile-time`, `throughput`, `file-size`, `constants` |
@@ -135,8 +135,8 @@ Threshold semantics per measure kind:
 - **`fft-cost`, `cycles`, `shards`, `max-shard-cycles`** — deterministic but
   directional: `upper 0` (any increase is a real regression), `lower _`
   (drops are legitimate wins — algorithmic improvements, better packing).
-- **`execute-time`, `prove-time`, `check-time`, `compile-time`, `peak-rss`,
-  `execute-peak-rss`, `file-size`** — noisy wall-clock or size measures:
+- **`execute-time`, `prove-time`, `verify-time`, `check-time`, `compile-time`, `peak-rss`,
+  `execute-peak-rss`, `file-size`, `proof-size`** — noisy wall-clock or size measures:
   `upper 0.05–0.10`, `lower _`. `execute-peak-rss` is the execute phase's RSS
   high-water on every backend that has one (bench-typecheck samples it at the
   Phase 1/2 boundary; the zkVM hosts' execute peak carries the same name);

From 3fb8bf7f84fdd0240eb5a3c9afb88689f731829e Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Fri, 3 Jul 2026 12:33:23 -0400
Subject: [PATCH 20/27] =?UTF-8?q?feat(ci):=20env-sharded=20zisk=20execute?=
 =?UTF-8?q?=20via=20ix=20profile=20=E2=86=92=20ix=20shard=20manifests?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The constants that OOM as single full-closure leaves (extract_append
proofs, the instRxcHasSize_eq family, …) are only measurable under env
sharding — their checks fit in one manifest shard each, with deps checked
in other shards (docs/zisk-cycle-cost-model.md, finding 4). Wire the
existing offline partitioner into CI instead of any per-constant
sharding:

- compile job: after the compile benchmark, `ix profile <Env>.ixe` →
  `ix shard --max-ram 120` for InitStd/Mathlib; the `.ixes` manifest is
  cached next to the `.ixe` (every restore of that key must list the same
  paths — actions/cache versions entries by path list).
- zisk-host: `--shard-plan --execute --json` now writes one env-level
  row — total cycles, shards, max-shard-cycles, execute-time,
  throughput, execute-peak-rss, and a per-shard `shard-cycles`
  breakdown — keyed by the new `--json-name` (default: manifest stem).
- run.sh: the zkVM guest-run dance (setsid + RAM watchdog + OOM
  classification + shm sweep + merge) is factored into one `zkvm_run`
  helper; when a `.ixes` sits next to the `.ixe` (bench-main only — the
  `!benchmark` PR path has none) the zisk branch appends the env-sharded
  run after the per-constant loop.
- bench.py bmf: nested-object flattening generalized — `phases` →
  `phase:<span>` as before, any other dict (e.g. `shard-cycles`) →
  `<key>:<sub>` — so per-shard cycles land on bencher as un-thresholded
  measures while the thresholded aggregates (cycles, shards,
  max-shard-cycles) do the alerting.

zkvm job timeout 60 → 150 min: the whole-env run (own 60m cap inside
run.sh) rides on top of the per-constant loop.
---
 .github/scripts/bench.py         | 18 ++++++----
 .github/scripts/run.sh           | 60 ++++++++++++++++++++------------
 .github/workflows/bench-main.yml | 45 +++++++++++++++++++-----
 .github/workflows/bench-pr.yml   |  6 +++-
 docs/benchmarking.md             | 21 ++++++++---
 zisk/host/src/main.rs            | 54 ++++++++++++++++++++++++++--
 6 files changed, 159 insertions(+), 45 deletions(-)

diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index 895e8925..31c66dd0 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -473,17 +473,21 @@ def cmd_bmf(a):
     for name, entry in (neutral or {}).items():
         if not isinstance(entry, dict):
             continue
-        phases = entry.get("phases")
-        phases = phases if isinstance(phases, dict) else {}
         measures = {}
         for k, v in entry.items():
-            if k in ("phases", "oom"):
+            if k == "oom":
                 continue
-            if isinstance(v, (int, float)) and not isinstance(v, bool):
+            # Nested objects are per-sub-measure breakdowns: `phases` (span →
+            # seconds) flattens to `phase:<span>`; anything else (e.g. the
+            # zisk env row's `shard-cycles`) to `<key>:<sub>`. Both stay
+            # un-thresholded on bencher (dynamic names).
+            if isinstance(v, dict):
+                prefix = "phase" if k == "phases" else k
+                for sub, sv in v.items():
+                    if isinstance(sv, (int, float)) and not isinstance(sv, bool):
+                        measures[f"{prefix}:{sub}"] = {"value": sv}
+            elif isinstance(v, (int, float)) and not isinstance(v, bool):
                 measures[k] = {"value": v}
-        for span, secs in phases.items():
-            if isinstance(secs, (int, float)) and not isinstance(secs, bool):
-                measures[f"phase:{span}"] = {"value": secs}
         if measures:
             out[name] = measures
     with open(a.out, "w") as f:
diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
index 5b0ca7cd..72d83311 100644
--- a/.github/scripts/run.sh
+++ b/.github/scripts/run.sh
@@ -238,23 +238,23 @@ case "$backend" in
     }
     ceiling_gb=${ZKVM_EXECUTE_MAX_RSS_GB:-120}
     rows="$tmp/rows"; mkdir -p "$rows"
-    while IFS= read -r c; do
-      [ -z "$c" ] && continue
-      slug=$(printf '%s' "$c" | tr '/ .:' '____')
-      res="$tmp/$slug.json"; log="$tmp/$slug.log"; spans="$res.spans"; oom="$tmp/$slug.oom"
+    # One watchdog-guarded guest run, keyed `$key` in the results. Full
+    # closures are RAM-unbounded (the ASM microservices mmap multi-GB ROMs on
+    # top of the guest trace), so the same watchdog as the aiur prove path
+    # guards the runner. `exec setsid`: the subshell (whose pid is $!)
+    # replaces itself with the session leader, so the watchdog's group-kill
+    # (`kill -- -$!`) reaches the host and every descendant — without it a
+    # plain subshell wrapper's pgid would be run.sh's own. The host writes
+    # $res only on a clean (zero-failure) run; `$out` is re-merged per run so
+    # a job-level kill keeps completed rows.
+    zkvm_run() {  # <timeout> <key> <host args…>
+      local run_timeout="$1" key="$2"; shift 2
+      local slug; slug=$(printf '%s' "$key" | tr '/ .:' '____')
+      local res="$tmp/$slug.json" log="$tmp/$slug.log" oom="$tmp/$slug.oom"
+      local spans="$res.spans" zk_pid w_pid zk_exit
       rm -f "$oom"
-      # Full-closure check (no --skip-deps) so this is directly comparable to
-      # the ooc `ix check-rs --anon --consts` run — the delta then isolates the
-      # in-circuit-vs-out-of-circuit overhead rather than mixing in subject-
-      # only vs full-closure scope. Full closures are RAM-unbounded (the ASM
-      # microservices mmap multi-GB ROMs on top of the guest trace), so the
-      # same watchdog as the aiur prove path guards the runner.
-      # `exec setsid`: the subshell (whose pid is $!) replaces itself with the
-      # session leader, so the watchdog's group-kill (`kill -- -$!`) reaches
-      # the host and every descendant — without a plain subshell wrapper whose
-      # pgid would be run.sh's own.
-      ( cd "$work" && exec setsid timeout 25m "$bin" --execute --ixe "$ixe" \
-          --consts "$c" --json "$res" --texray ) \
+      ( cd "$work" && exec setsid timeout "$run_timeout" "$bin" --execute \
+          --ixe "$ixe" --json "$res" --texray "$@" ) \
         > "$log" 2>&1 &
       zk_pid=$!
       watch_ram_kill "$zk_pid" "$ceiling_gb" "$oom" &
@@ -264,21 +264,37 @@ case "$backend" in
       wait "$w_pid" 2>/dev/null || true
       [ "$zk_exit" -ne 0 ] && zisk_shm_sweep
       if looks_like_oom "$zk_exit" "$oom" "$log"; then
-        echo "::warning::$backend execute '$c' OOM (exit $zk_exit, marker=$([ -f "$oom" ] && echo watchdog || echo runtime), ceiling ${ceiling_gb} GB)" >&2
-        mark_oom "$res" "$c"
+        echo "::warning::$backend execute '$key' OOM (exit $zk_exit, marker=$([ -f "$oom" ] && echo watchdog || echo runtime), ceiling ${ceiling_gb} GB)" >&2
+        mark_oom "$res" "$key"
       elif [ "$zk_exit" -ne 0 ]; then
-        echo "::warning::$backend execute '$c' failed/timed out (exit $zk_exit); dropping" >&2
+        echo "::warning::$backend execute '$key' failed/timed out (exit $zk_exit); dropping" >&2
         sed -n '1,5p' "$log" >&2 || true
-        continue
+        return 0
       fi
-      # The host writes $res only on a clean (zero-failure) run. `$out` is
-      # re-merged per constant so a job-level kill keeps completed rows.
       merge_phases "$res" "$spans"
       if [ -s "$res" ]; then
         cp "$res" "$rows/$slug.json"
         jq -s 'reduce .[] as $o ({}; . + $o)' "$rows"/*.json > "$out" 2>/dev/null || true
       fi
+    }
+    # Full-closure check (no --skip-deps) so this is directly comparable to
+    # the ooc `ix check-rs --anon --consts` run — the delta then isolates the
+    # in-circuit-vs-out-of-circuit overhead rather than mixing in subject-
+    # only vs full-closure scope.
+    while IFS= read -r c; do
+      [ -z "$c" ] && continue
+      zkvm_run "${ZKVM_EXECUTE_TIMEOUT:-25m}" "$c" --consts "$c"
     done < "$names"
+    # Env-sharded execute (zisk only): when the compile job published a shard
+    # manifest next to the `.ixe` (ix profile → ix shard), execute the WHOLE
+    # env as its manifest shards and merge one env-keyed row — totals plus a
+    # per-shard `shard-cycles` breakdown — alongside the per-constant rows.
+    # Absent manifest (e.g. the !benchmark PR path) → skipped.
+    plan_ixes="${ixe%.ixe}.ixes"
+    if [ "$backend" = zisk ] && [ -f "$plan_ixes" ]; then
+      zkvm_run "${ZISK_ENV_EXECUTE_TIMEOUT:-60m}" "$benv_cc" \
+        --shard-plan "$plan_ixes" --json-name "$benv_cc"
+    fi
     emit_empty
     ;;
 
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index 39846bd0..ab9670d7 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -130,12 +130,28 @@ jobs:
           bash .github/scripts/run.sh . ${{ matrix.bench }} compile compile /dev/null neutral.json
           python3 .github/scripts/bench.py bmf --in neutral.json --out benchmark.json
           cat benchmark.json
-      # Cache the `.ixe` for the prove job (reused, never recompiled there).
-      # Only the matrix jobs the prove job consumes, to stay under the repo cache limit.
+      # Profile the env out of circuit and cut a shard manifest (`ix profile`
+      # → `ix shard`) for the zkvm-execute job's env-sharded run. Profiling is
+      # cache-isolated (sound heartbeats), so it costs a full un-memoized
+      # kernel pass — done once here, next to the freshly compiled `.ixe`,
+      # rather than in every consumer. --max-ram caps each shard's predicted
+      # prove RAM under the runner class's ceiling
+      # (docs/zisk-cycle-cost-model.md) — conservative for execute-only runs.
+      - if: matrix.bench == 'InitStd' || matrix.bench == 'Mathlib'
+        name: Profile + shard plan
+        run: |
+          ix profile ${{ matrix.bench }}.ixe --out ${{ matrix.bench }}.ixprof
+          ix shard ${{ matrix.bench }}.ixprof --max-ram 120 --out ${{ matrix.bench }}.ixes
+      # Cache the `.ixe` + shard manifest for the prove/zkvm jobs (reused,
+      # never regenerated there). Only the matrix jobs those consume, to stay
+      # under the repo cache limit. NB: every restore of this key must list
+      # the SAME paths — actions/cache versions the entry by its path list.
       - if: matrix.bench == 'InitStd' || matrix.bench == 'Mathlib'
         uses: actions/cache/save@v5
         with:
-          path: ${{ matrix.bench }}.ixe
+          path: |
+            ${{ matrix.bench }}.ixe
+            ${{ matrix.bench }}.ixes
           key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
       # Upload compile metrics. Every measure shares the per-workload baseline
       # window (data points since the ix-compile reset tag): file-size/constants
@@ -191,9 +207,12 @@ jobs:
           build: false
           use-github-cache: false
       # Pull the `.ixe` the compile job built — do NOT recompile it here.
+      # (The path list must match the compile job's save exactly.)
       - uses: actions/cache/restore@v5
         with:
-          path: ${{ matrix.bench }}.ixe
+          path: |
+            ${{ matrix.bench }}.ixe
+            ${{ matrix.bench }}.ixes
           key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
           fail-on-cache-miss: true
       # Run each constant in its own process so a clean failure or timeout drops
@@ -278,7 +297,11 @@ jobs:
   zkvm-execute:
     needs: compile
     runs-on: warp-ubuntu-latest-x64-32x
-    timeout-minutes: 60
+    # Per-constant loop (heavy primaries ride the RAM watchdog to OOM rows)
+    # plus the env-sharded whole-env run (its own 60m timeout in run.sh) —
+    # run.sh re-merges results incrementally, so even a job-level timeout
+    # would keep completed rows, but the bencher upload needs the job alive.
+    timeout-minutes: 150
     strategy:
       fail-fast: false
       matrix:
@@ -300,10 +323,14 @@ jobs:
       - name: Install SP1
         if: matrix.backend == 'sp1'
         uses: ./.github/actions/install-sp1
-      # Pull the `.ixe` the compile job built — no recompile (REUSE_IXE).
+      # Pull the `.ixe` + shard manifest the compile job built — no recompile
+      # (REUSE_IXE). run.sh's zisk branch adds the env-sharded execute row
+      # when the `.ixes` is present next to the `.ixe`.
       - uses: actions/cache/restore@v5
         with:
-          path: ${{ matrix.bench }}.ixe
+          path: |
+            ${{ matrix.bench }}.ixe
+            ${{ matrix.bench }}.ixes
           key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
           fail-on-cache-miss: true
       - name: Run ${{ matrix.backend }} execute benchmark
@@ -391,7 +418,9 @@ jobs:
           sudo apt-get update && sudo apt-get install -y time
       - uses: actions/cache/restore@v5
         with:
-          path: ${{ matrix.bench }}.ixe
+          path: |
+            ${{ matrix.bench }}.ixe
+            ${{ matrix.bench }}.ixes
           key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
           fail-on-cache-miss: true
       # run.sh ooc runs `ix check --anon` (whole env, parallel) and emits the
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index e34830cf..2308a9c2 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -272,7 +272,11 @@ jobs:
         id: base-ixe
         uses: actions/cache/restore@v5
         with:
-          path: ${{ steps.envcc.outputs.cc }}.ixe
+          # Path list must match bench-main's compile-job save (actions/cache
+          # versions entries by path list); the `.ixes` itself is unused here.
+          path: |
+            ${{ steps.envcc.outputs.cc }}.ixe
+            ${{ steps.envcc.outputs.cc }}.ixes
           key: bench-ixe-${{ env.BASE_SHA }}-${{ steps.envcc.outputs.cc }}
       # Cached base binaries are usable only when the two `lean-toolchain`
       # files are identical (plain `cmp`), and — for the mathlib env — only
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index 08382324..5a93f144 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -19,7 +19,7 @@ the same backend drivers:
 | backend | what it measures | metrics |
 |---|---|---|
 | `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove; each fresh proof is also verified) | `fft-cost`, `execute-time`, `execute-peak-rss`, `prove-time`, `verify-time`, `proof-size`, `peak-rss` |
-| `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `execute-peak-rss` |
+| `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `execute-peak-rss`; zisk's env-sharded row adds `shards`, `max-shard-cycles`, `shard-cycles:<k>` |
 | `ooc` | the same kernel run **out-of-circuit and in parallel** (`ix check-rs`) — far faster | `throughput`, `check-time`, `peak-rss` |
 | `compile` | `ix compile <env>.lean → <env>.ixe` on the current PR — measures the compile step itself, keyed by CamelCase env slug (`InitStd`, `Lean`, `Mathlib`, `FLT`) | `compile-time`, `throughput`, `file-size`, `constants` |
 
@@ -129,6 +129,17 @@ CI); the host still builds + unit-tests on every PR via ci.yml. To
 re-enable, uncomment sp1 in two places: the zkvm-execute matrix cell in
 `bench-main.yml` and the Install SP1 step in `bench-pr.yml`.
 
+The zisk job additionally executes the **whole env** as its shard-manifest
+partition. The compile job runs `ix profile <Env>.ixe` → `ix shard --max-ram
+120` after the compile benchmark and caches the `.ixes` manifest next to the
+`.ixe`; run.sh's zisk branch picks it up (skipped when absent, e.g. on the
+`!benchmark` PR path) and merges one env-keyed row (`InitStd` / `Mathlib`):
+total `cycles`, `shards`, `max-shard-cycles`, `execute-time`, `throughput`
+(cycles/s), `execute-peak-rss`, plus the per-shard breakdown uploaded as
+`shard-cycles:<k>` measures. This is also how the constants that OOM as
+single full-closure leaves get measured at all — under env sharding each
+check fits in one shard, with deps checked in other shards.
+
 Threshold semantics per measure kind:
 - **`constants`** — pinned exactly (0/0). A definitional count; either
   direction is worth flagging (someone added/removed a def).
@@ -142,9 +153,11 @@ Threshold semantics per measure kind:
   Phase 1/2 boundary; the zkVM hosts' execute peak carries the same name);
   bare `peak-rss` is a prove-phase (or, for ooc, whole-check) peak.
 - **`throughput`** — higher-is-better: `upper _`, `lower 0.05–0.10`.
-- **`phase:<span>`** — uploaded for trend visibility, intentionally left
-  un-thresholded (dynamic names + noise; the PR-comment drill-down is where
-  phase-level attention goes when the drill-down is reinstated).
+- **`phase:<span>`, `shard-cycles:<k>`** — uploaded for trend visibility,
+  intentionally left un-thresholded (dynamic names; a re-partition renames
+  the shard keys). The thresholded aggregates (`shards`, `max-shard-cycles`,
+  total `cycles`) do the alerting; the PR-comment drill-down is where
+  per-phase / per-shard attention goes when that view lands.
 
 All thresholds are windowed to the per-workload
 `bencher-thresholds-reset-<workload>` tag.
diff --git a/zisk/host/src/main.rs b/zisk/host/src/main.rs
index f2d813bc..80807b1e 100644
--- a/zisk/host/src/main.rs
+++ b/zisk/host/src/main.rs
@@ -205,9 +205,17 @@ struct Args {
   skip_deps: bool,
 
   /// Write per-constant results JSON `{ "<name>": { … } }` (accumulated across names).
+  /// With `--shard-plan --execute` it instead gets one env-level row (totals +
+  /// per-shard cycles breakdown).
   #[arg(long)]
   json: Option<PathBuf>,
 
+  /// Benchmark key for the env-level row `--shard-plan --execute --json`
+  /// writes (e.g. the CamelCase env slug CI uses). Defaults to the manifest
+  /// file stem.
+  #[arg(long, requires = "shard_plan")]
+  json_name: Option<String>,
+
   /// Enable tracing-texray; with --json, per-phase spans are written to <json>.spans.
   #[arg(long)]
   texray: bool,
@@ -1318,10 +1326,16 @@ async fn run_shard_plan(
   }
 
   // ---- Execute mode: run each novel shard in the VM for cycles (no proof);
-  // store-covered shards have nothing to execute. ----
+  // store-covered shards have nothing to execute. With `--json`, one
+  // env-level row (keyed by `--json-name`, default the manifest stem)
+  // carries the totals plus a per-shard cycles breakdown under
+  // `shard-cycles` — the CI benchmark's per-shard tracking. ----
   if args.execute {
+    let t0 = Instant::now();
     let mut total_steps = 0u64;
     let mut total_failures = 0u32;
+    let mut max_shard_cycles = 0u64;
+    let mut shard_cycles = serde_json::Map::new();
     for &(idx, g) in &novel {
       let (check_list, sub_env, _cover) = build_inputs(g)?;
       let stdin = leaf_stdin(0, 0, &sub_env, &check_list);
@@ -1331,6 +1345,10 @@ async fn run_shard_plan(
       let publics = ShardPublics::decode(&buf);
       let cycles = result.get_execution_steps();
       total_steps += cycles;
+      max_shard_cycles = max_shard_cycles.max(cycles);
+      // 1-based zero-padded keys: matches --only-shard's numbering and keeps
+      // the flattened bencher measure list (`shard-cycles:<k>`) sorted.
+      shard_cycles.insert(format!("{:02}", idx + 1), serde_json::json!(cycles));
       total_failures = total_failures.saturating_add(publics.failures);
       println!(
         "  [shard {idx}] {} work items, failures={}, cycles={cycles}",
@@ -1338,10 +1356,38 @@ async fn run_shard_plan(
         publics.failures,
       );
     }
+    let execute_secs = t0.elapsed().as_secs_f64();
+    tracing_texray::json_sink::record_manual("zisk/execute", execute_secs);
     println!("total cycles: {total_steps}, failures: {total_failures}");
     if total_failures > 0 {
       bail!("kernel typecheck produced {total_failures} failure(s)");
     }
+    if let Some(path) = &args.json {
+      let name = args.json_name.clone().unwrap_or_else(|| {
+        manifest_path
+          .file_stem()
+          .map(|s| s.to_string_lossy().into_owned())
+          .unwrap_or_else(|| "env".to_string())
+      });
+      let tput = if execute_secs > 0.0 {
+        total_steps as f64 / execute_secs
+      } else {
+        0.0
+      };
+      write_json_entry(
+        path,
+        &name,
+        serde_json::json!({
+          "cycles": total_steps,
+          "shards": novel.len(),
+          "max-shard-cycles": max_shard_cycles,
+          "execute-time": (execute_secs * 1e6).round() / 1e6,
+          "throughput": tput.round(),
+          "execute-peak-rss": peak_rss_bytes(),
+          "shard-cycles": shard_cycles,
+        }),
+      )?;
+    }
     return Ok(());
   }
 
@@ -1738,8 +1784,10 @@ async fn main() -> Result<()> {
   if consts.is_empty() && args.skip_deps {
     bail!("--skip-deps requires constants via --consts or --consts-file");
   }
-  if consts.is_empty() && args.json.is_some() {
-    bail!("--json requires constants via --consts or --consts-file");
+  if consts.is_empty() && args.json.is_some() && args.shard_plan.is_none() {
+    bail!(
+      "--json requires constants via --consts/--consts-file, or --shard-plan"
+    );
   }
 
   // ---- Plan every input up front (parse + shard). ----

From bc3c982d89099bd4c9e165974cf6a124d90aec02 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Fri, 3 Jul 2026 12:54:40 -0400
Subject: [PATCH 21/27] feat(ci): fill partial bencher misses from a targeted
 base run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fetch-main previously treated main.json as all-or-nothing: any name
bencher lacked at the base SHA (typically constants the PR itself adds
to Vectors.csv) silently rendered n/a. It now writes the uncovered
subset to --missing-out and still exits 0; bench-pr runs the base
checkout on JUST those names and merges under bencher's rows (bencher
wins overlaps — it is the canonical main side), so a brand-new constant
gets a real main-vs-PR delta on its first \!benchmark. The full-set base
fallback (exit 3, SHA not ingested) is unchanged.
---
 .github/scripts/bench.py       | 23 +++++++++++
 .github/workflows/bench-pr.yml | 71 +++++++++++++++++++++++++---------
 2 files changed, 75 insertions(+), 19 deletions(-)

diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index 31c66dd0..66e1851b 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -506,6 +506,11 @@ def cmd_fetch_main(a):
     locally; 2 = permanent config error ((backend, mode) not in BACKEND_TABLE,
     i.e. table / bench-main.yml drift) — the caller fails the cell loudly
     instead of paying the fallback forever.
+
+    A PARTIAL miss (bencher answered, but some --names entries have no data —
+    e.g. constants the PR adds to Vectors.csv) still exits 0: main.json holds
+    what bencher had, and --missing-out lists the uncovered names so the
+    caller can measure just those against the base checkout and merge.
     """
     entry = BACKEND_TABLE.get(a.backend)
     testbed = entry["testbed"] if entry and a.mode in entry["metrics"] else None
@@ -585,6 +590,20 @@ def _get_json(url, attempts=3):
     if not out:
         print(f"fetch-main: reports found but no matching benchmarks in --names")
         raise SystemExit(3)
+    # Names the PR side selected (its Vectors.csv) that bencher has no data
+    # for at this SHA — typically constants the PR itself adds to the CSV.
+    # The caller runs the base checkout on JUST these and merges, so a new
+    # constant still gets a real main-vs-PR delta on its first !benchmark.
+    # Computed against names.txt verbatim (not the ENV_CC-augmented `wanted`):
+    # the env-keyed row is an admit-filter, not a per-constant expectation.
+    if a.missing_out:
+        name_set = set(open(a.names).read().split()) if a.names else set()
+        missing = sorted(name_set - set(out))
+        with open(a.missing_out, "w") as f:
+            f.write("\n".join(missing) + ("\n" if missing else ""))
+        if missing:
+            print(f"fetch-main: {len(missing)} name(s) not on bencher @ "
+                  f"{a.sha[:8]} (base run will measure): " + ", ".join(missing))
     with open(a.out, "w") as f:
         json.dump(out, f)
     print(f"fetch-main: {len(out)} constant(s) from bencher for {a.backend}/{a.mode}")
@@ -623,6 +642,10 @@ def main():
                     help="Cell env; admits the env-keyed row (ooc whole-env) "
                          "past the --names filter.")
     fm.add_argument("--names", help="Only fetch benchmarks whose names appear in this file.")
+    fm.add_argument("--missing-out", dest="missing_out",
+                    help="Write the --names entries bencher had no data for "
+                         "(one per line; empty file when none) — the subset "
+                         "the caller should measure against the base checkout.")
     fm.add_argument("--out", required=True)
     fm.set_defaults(fn=cmd_fetch_main)
 
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index 2308a9c2..4056b2d7 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -16,8 +16,10 @@
 # (the same job bench-main.yml uploads under testbed `ix-compile-*`). The
 # optional bare `execute` token flips `aiur` to execute-only (skips Phase 2);
 # on the other backends it's a no-op. main's numbers come from bencher.dev;
-# the workflow falls back to re-running the base SHA locally only when bencher
-# hasn't ingested it yet (freshly-pushed main whose CI is still running).
+# the workflow re-runs the base SHA locally only for what bencher can't supply:
+# the full set when the SHA isn't ingested yet (freshly-pushed main whose CI is
+# still running), or just the missing names when the PR's Vectors.csv selects
+# constants main was never benched on (constants the PR itself adds).
 name: Benchmark pull requests
 
 on:
@@ -224,11 +226,19 @@ jobs:
       # ---------- main side ----------
       # Try bencher.dev first (bench-main.yml has uploaded main's numbers).
       # fetch-main's exit codes are load-bearing: 3 = transient (base SHA not
-      # ingested yet) → fall back to a local base run; anything else (2 =
-      # backend/mode has no main testbed — a BACKEND_TABLE / bench-main.yml
-      # drift) is a permanent misconfiguration that a local rebuild can never
-      # fix, so fail the cell loudly instead of silently paying the fallback
-      # on every future run.
+      # ingested yet) → fall back to a local base run of the full set; anything
+      # else (2 = backend/mode has no main testbed — a BACKEND_TABLE /
+      # bench-main.yml drift) is a permanent misconfiguration that a local
+      # rebuild can never fix, so fail the cell loudly instead of silently
+      # paying the fallback on every future run.
+      #
+      # Partial miss (exit 0 + non-empty missing.txt): the PR's Vectors.csv
+      # selects names main was never benched on — typically constants the PR
+      # itself adds. Bencher's numbers stand for the covered set; the base
+      # checkout runs JUST the missing names and merges below, so a brand-new
+      # constant still gets a real main-vs-PR delta on its first !benchmark.
+      # `run-base` gates every base-side step; `base-names` picks the file the
+      # base run measures (missing.txt = partial, names.txt = full fallback).
       - name: Fetch main from bencher
         id: bencher
         run: |
@@ -236,16 +246,26 @@ jobs:
           python3 .github/scripts/bench.py fetch-main \
             --sha "$BASE_SHA" --backend "$BACKEND" --mode "$MODE" --env "$BENV" \
             --names "$GITHUB_WORKSPACE/names.txt" \
+            --missing-out "$GITHUB_WORKSPACE/missing.txt" \
             --out "$GITHUB_WORKSPACE/main.json"
           rc=$?
           set -e
           case $rc in
-            0) echo "source=bencher" >> "$GITHUB_OUTPUT" ;;
-            3) echo "source=ran" >> "$GITHUB_OUTPUT" ;;
+            0) if [ -s "$GITHUB_WORKSPACE/missing.txt" ]; then
+                 echo "source=bencher + base run ($(wc -l < "$GITHUB_WORKSPACE/missing.txt") new)" >> "$GITHUB_OUTPUT"
+                 echo "run-base=true" >> "$GITHUB_OUTPUT"
+                 echo "base-names=missing.txt" >> "$GITHUB_OUTPUT"
+               else
+                 echo "source=bencher" >> "$GITHUB_OUTPUT"
+                 echo "run-base=false" >> "$GITHUB_OUTPUT"
+               fi ;;
+            3) echo "source=ran" >> "$GITHUB_OUTPUT"
+               echo "run-base=true" >> "$GITHUB_OUTPUT"
+               echo "base-names=names.txt" >> "$GITHUB_OUTPUT" ;;
             *) echo "::error::fetch-main: permanent config error (exit $rc) — check BACKEND_TABLE in bench.py vs bench-main.yml testbeds"; exit "$rc" ;;
           esac
-      - name: Checkout base (bencher had no data)
-        if: steps.bencher.outputs.source == 'ran'
+      - name: Checkout base (bencher data missing or partial)
+        if: steps.bencher.outputs.run-base == 'true'
         uses: actions/checkout@v6
         with:
           ref: ${{ env.BASE_SHA }}
@@ -255,20 +275,20 @@ jobs:
       # for a from-scratch base build. The `.ixe` cache key/path use the
       # CamelCase env slug (bench-main's matrix.bench).
       - name: Compute env slug
-        if: steps.bencher.outputs.source == 'ran'
+        if: steps.bencher.outputs.run-base == 'true'
         id: envcc
         run: |
           benv="$BENV"
           echo "cc=${benv^}" >> "$GITHUB_OUTPUT"
       - name: Restore base binaries (bench-main build cache)
-        if: steps.bencher.outputs.source == 'ran'
+        if: steps.bencher.outputs.run-base == 'true'
         id: base-bins
         uses: actions/cache/restore@v5
         with:
           path: ~/.local/bin
           key: bench-bins-${{ env.BASE_SHA }}
       - name: Restore base .ixe (bench-main compile cache)
-        if: steps.bencher.outputs.source == 'ran'
+        if: steps.bencher.outputs.run-base == 'true'
         id: base-ixe
         uses: actions/cache/restore@v5
         with:
@@ -283,7 +303,7 @@ jobs:
       # when the `.ixe` was also restored (otherwise base's `ix compile` needs
       # mathlib oleans that only the full build fetches).
       - name: Resolve base binaries
-        if: steps.bencher.outputs.source == 'ran'
+        if: steps.bencher.outputs.run-base == 'true'
         id: base-src
         run: |
           cached=false
@@ -298,7 +318,7 @@ jobs:
           echo "cached=$cached" >> "$GITHUB_OUTPUT"
           echo "base binaries: $([ "$cached" = true ] && echo restored from bench-main cache || echo building from source)"
       - name: Build base (ix, bench-typecheck)
-        if: steps.bencher.outputs.source == 'ran' && steps.base-src.outputs.cached != 'true'
+        if: steps.bencher.outputs.run-base == 'true' && steps.base-src.outputs.cached != 'true'
         uses: leanprover/lean-action@v1
         with:
           lake-package-directory: base
@@ -311,8 +331,8 @@ jobs:
       # changes the benchmark CLIs themselves, the base binaries reject the new
       # flags and every constant drops — compare then renders the loud
       # "main produced no results" note instead of a silent all-n/a table.
-      - name: Run backend on base → main.json
-        if: steps.bencher.outputs.source == 'ran'
+      - name: Run backend on base → merge into main.json
+        if: steps.bencher.outputs.run-base == 'true'
         run: |
           if [ "${{ steps.base-ixe.outputs.cache-hit }}" = true ]; then
             mv "${{ steps.envcc.outputs.cc }}.ixe" "base/$BENV.ixe"
@@ -324,7 +344,20 @@ jobs:
             export PATH="$PWD/base/.lake/build/bin:$PATH"
           fi
           bash .github/scripts/run.sh base "$BENV" "$BACKEND" "$MODE" \
-            "$GITHUB_WORKSPACE/names.txt" "$GITHUB_WORKSPACE/main.json"
+            "$GITHUB_WORKSPACE/${{ steps.bencher.outputs.base-names }}" \
+            "$GITHUB_WORKSPACE/base.json"
+          # Partial fallback: bencher already supplied main.json for the
+          # covered names; fill only the gaps from the base run. Bencher wins
+          # any overlap (e.g. ooc's always-emitted whole-env row) — it is the
+          # canonical main-side number. Full fallback: main.json doesn't exist
+          # (fetch-main exits 3 before writing), so the base run IS main.json.
+          if [ -s "$GITHUB_WORKSPACE/main.json" ]; then
+            jq -s '.[0] + .[1]' "$GITHUB_WORKSPACE/base.json" "$GITHUB_WORKSPACE/main.json" \
+              > "$GITHUB_WORKSPACE/main.merged" \
+              && mv "$GITHUB_WORKSPACE/main.merged" "$GITHUB_WORKSPACE/main.json"
+          else
+            mv "$GITHUB_WORKSPACE/base.json" "$GITHUB_WORKSPACE/main.json"
+          fi
 
       # ---------- PR side ----------
       - name: Run backend on PR → pr.json

From 7dee4ccab119f37fedf313ac07cb7287ad91a3d3 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Fri, 3 Jul 2026 12:54:52 -0400
Subject: [PATCH 22/27] feat(ci): PR side cuts its own zisk shard manifest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A PR can change the kernel's cost profile, so the env-sharded zisk run
must not inherit main's partition. run.sh now cuts the manifest in-place
(ix profile → ix shard) whenever one isn't already sitting next to the
.ixe — the profiler counts heartbeats rather than wall time, so an
unchanged tree re-partitions deterministically and per-shard comparisons
stay meaningful. bench-main's compile-job manifest keeps pre-empting the
generation on the main side; the !benchmark PR side profiles its own
tree (manifest cached per head SHA under bench-pr-ixes-*). The base
fallback reuses bench-main's cached manifest and pays the env-sharded
run only on a full bencher miss — a partial miss means bencher already
holds main's env row (ZISK_ENV_SHARD=0 skips it).

bench-pr benchmark timeout 120 → 180 min for the zisk cell's worst case
(per-constant loop + profile + the env execute's own 60m cap).
---
 .github/scripts/run.sh         | 36 ++++++++++++++++++++++++++--------
 .github/workflows/bench-pr.yml | 32 +++++++++++++++++++++++++++++-
 docs/benchmarking.md           | 18 ++++++++++++-----
 3 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
index 72d83311..f130e0ee 100644
--- a/.github/scripts/run.sh
+++ b/.github/scripts/run.sh
@@ -285,15 +285,35 @@ case "$backend" in
       [ -z "$c" ] && continue
       zkvm_run "${ZKVM_EXECUTE_TIMEOUT:-25m}" "$c" --consts "$c"
     done < "$names"
-    # Env-sharded execute (zisk only): when the compile job published a shard
-    # manifest next to the `.ixe` (ix profile → ix shard), execute the WHOLE
-    # env as its manifest shards and merge one env-keyed row — totals plus a
-    # per-shard `shard-cycles` breakdown — alongside the per-constant rows.
-    # Absent manifest (e.g. the !benchmark PR path) → skipped.
+    # Env-sharded execute (zisk only): execute the WHOLE env as its manifest
+    # shards and merge one env-keyed row — totals plus a per-shard
+    # `shard-cycles` breakdown — alongside the per-constant rows. The
+    # manifest comes from bench-main's compile-job cache when it was restored
+    # next to the `.ixe`; otherwise (the !benchmark PR side, a cold base
+    # fallback) it is cut fresh HERE — each side profiles its own tree, since
+    # a PR can change the cost profile, and the profiler counts heartbeats
+    # (not wall time) so an unchanged tree re-partitions deterministically.
+    # ZISK_ENV_SHARD=0 skips the whole run (the partial base fallback, where
+    # bencher already holds main's env row).
     plan_ixes="${ixe%.ixe}.ixes"
-    if [ "$backend" = zisk ] && [ -f "$plan_ixes" ]; then
-      zkvm_run "${ZISK_ENV_EXECUTE_TIMEOUT:-60m}" "$benv_cc" \
-        --shard-plan "$plan_ixes" --json-name "$benv_cc"
+    if [ "$backend" = zisk ] && [ "${ZISK_ENV_SHARD:-1}" = 1 ]; then
+      if [ ! -f "$plan_ixes" ]; then
+        if ix_bin=$(resolve_bin ix 2>/dev/null); then
+          echo "::group::ix profile + shard → $plan_ixes"
+          "$ix_bin" profile "$ixe" --out "${ixe%.ixe}.ixprof" \
+            && "$ix_bin" shard "${ixe%.ixe}.ixprof" \
+                 --max-ram "${SHARD_MAX_RAM_GB:-120}" --out "$plan_ixes" \
+            || { echo "::warning::ix profile/shard failed; skipping env-sharded run" >&2
+                 rm -f "$plan_ixes"; }
+          echo "::endgroup::"
+        else
+          echo "::warning::no ix binary to cut $plan_ixes; skipping env-sharded run" >&2
+        fi
+      fi
+      if [ -f "$plan_ixes" ]; then
+        zkvm_run "${ZISK_ENV_EXECUTE_TIMEOUT:-60m}" "$benv_cc" \
+          --shard-plan "$plan_ixes" --json-name "$benv_cc"
+      fi
     fi
     emit_empty
     ;;
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index 4056b2d7..af1fae59 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -131,7 +131,9 @@ jobs:
   benchmark:
     needs: [setup, build]
     runs-on: ${{ matrix.cell.runner }}
-    timeout-minutes: 120
+    # Wide enough for the zisk cell's worst case: per-constant loop + PR-side
+    # `ix profile` + the env-sharded execute (own 60m cap inside run.sh).
+    timeout-minutes: 180
     strategy:
       fail-fast: false
       matrix:
@@ -207,6 +209,16 @@ jobs:
         with:
           path: ${{ matrix.cell.env }}.ixe
           key: bench-pr-ixe-${{ env.HEAD_SHA }}-${{ matrix.cell.env }}
+      # The PR's shard manifest (run.sh cuts it via ix profile → ix shard for
+      # the env-sharded zisk run — the PR side partitions its OWN tree). Cached
+      # separately from the `.ixe`: only zisk cells produce it.
+      - name: Restore PR shard manifest
+        if: matrix.cell.backend == 'zisk'
+        id: pr-ixes
+        uses: actions/cache/restore@v5
+        with:
+          path: ${{ matrix.cell.env }}.ixes
+          key: bench-pr-ixes-${{ env.HEAD_SHA }}-${{ matrix.cell.env }}
       # zkVM cells additionally need the Rust toolchain + the backend's toolchain
       # and system deps (the shared composite install actions).
       - name: Set up zkVM Rust toolchain
@@ -336,6 +348,9 @@ jobs:
         run: |
           if [ "${{ steps.base-ixe.outputs.cache-hit }}" = true ]; then
             mv "${{ steps.envcc.outputs.cc }}.ixe" "base/$BENV.ixe"
+            # bench-main's shard manifest rides the same cache entry; with it
+            # in place run.sh skips re-profiling the base tree.
+            mv "${{ steps.envcc.outputs.cc }}.ixes" "base/$BENV.ixes" 2>/dev/null || true
             export REUSE_IXE=1
           fi
           if [ "${{ steps.base-src.outputs.cached }}" = true ]; then
@@ -343,6 +358,10 @@ jobs:
           else
             export PATH="$PWD/base/.lake/build/bin:$PATH"
           fi
+          # The zisk env-sharded run is only worth paying on the FULL
+          # fallback — a partial miss means bencher already holds main's
+          # env-keyed row (it is never in missing.txt).
+          export ZISK_ENV_SHARD=$([ "${{ steps.bencher.outputs.base-names }}" = names.txt ] && echo 1 || echo 0)
           bash .github/scripts/run.sh base "$BENV" "$BACKEND" "$MODE" \
             "$GITHUB_WORKSPACE/${{ steps.bencher.outputs.base-names }}" \
             "$GITHUB_WORKSPACE/base.json"
@@ -375,6 +394,17 @@ jobs:
         with:
           path: ${{ matrix.cell.env }}.ixe
           key: bench-pr-ixe-${{ env.HEAD_SHA }}-${{ matrix.cell.env }}
+      # hashFiles guard: skip (instead of erroring) when profile/shard failed
+      # and run.sh dropped the env-sharded run.
+      - name: Save PR shard manifest
+        if: >-
+          matrix.cell.backend == 'zisk' &&
+          steps.pr-ixes.outputs.cache-hit != 'true' &&
+          hashFiles(format('{0}.ixes', matrix.cell.env)) != ''
+        uses: actions/cache/save@v5
+        with:
+          path: ${{ matrix.cell.env }}.ixes
+          key: bench-pr-ixes-${{ env.HEAD_SHA }}-${{ matrix.cell.env }}
 
       # ---------- compare ----------
       - name: Build comparison table
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index 5a93f144..26e6e43b 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -130,16 +130,24 @@ re-enable, uncomment sp1 in two places: the zkvm-execute matrix cell in
 `bench-main.yml` and the Install SP1 step in `bench-pr.yml`.
 
 The zisk job additionally executes the **whole env** as its shard-manifest
-partition. The compile job runs `ix profile <Env>.ixe` → `ix shard --max-ram
-120` after the compile benchmark and caches the `.ixes` manifest next to the
-`.ixe`; run.sh's zisk branch picks it up (skipped when absent, e.g. on the
-`!benchmark` PR path) and merges one env-keyed row (`InitStd` / `Mathlib`):
-total `cycles`, `shards`, `max-shard-cycles`, `execute-time`, `throughput`
+partition, merging one env-keyed row (`InitStd` / `Mathlib`): total
+`cycles`, `shards`, `max-shard-cycles`, `execute-time`, `throughput`
 (cycles/s), `execute-peak-rss`, plus the per-shard breakdown uploaded as
 `shard-cycles:<k>` measures. This is also how the constants that OOM as
 single full-closure leaves get measured at all — under env sharding each
 check fits in one shard, with deps checked in other shards.
 
+The manifest (`ix profile <Env>.ixe` → `ix shard --max-ram 120` →
+`<Env>.ixes`) is cut once per tree: bench-main's compile job generates it
+after the compile benchmark and caches it next to the `.ixe`; on the
+`!benchmark` PR path, run.sh cuts it in-place with the side's own `ix`
+(cached per head SHA) — each side partitions its **own** tree, because a PR
+can change the cost profile, and the profiler counts heartbeats rather than
+wall time so an unchanged tree re-partitions deterministically. The base
+fallback reuses bench-main's cached manifest and only pays the env-sharded
+run on a full bencher miss (a partial miss means bencher already holds
+main's env row; `ZISK_ENV_SHARD=0`).
+
 Threshold semantics per measure kind:
 - **`constants`** — pinned exactly (0/0). A definitional count; either
   direction is worth flagging (someone added/removed a def).

From 3e97acff04b9c2aadd035f546d2c2abaa199bdeb Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Fri, 3 Jul 2026 13:09:25 -0400
Subject: [PATCH 23/27] feat(ci): per-shard execute-time and peak RAM in the
 zisk env row
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The shard breakdown carried cycles only. Each shard now also records its
guest execute time (SDK-reported) and its RAM high-water — the texray
sampler's peak is reset before every shard, so the windows are
independent and the env row's execute-peak-rss becomes their max (the
run's execution-phase high-water; setup RAM no longer counted). Uploaded
as shard-time:<k> / shard-peak-rss:<k> alongside shard-cycles:<k>, all
un-thresholded like phase:<span>.

No prove-side equivalent yet: zisk proving isn't wired up in CI at all
(needs a GPU runner), so there is no prove-time for any zisk row.
---
 docs/benchmarking.md  | 16 +++++++++-------
 zisk/host/src/main.rs | 31 +++++++++++++++++++++++++++----
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index 26e6e43b..e122ecd3 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -132,8 +132,9 @@ re-enable, uncomment sp1 in two places: the zkvm-execute matrix cell in
 The zisk job additionally executes the **whole env** as its shard-manifest
 partition, merging one env-keyed row (`InitStd` / `Mathlib`): total
 `cycles`, `shards`, `max-shard-cycles`, `execute-time`, `throughput`
-(cycles/s), `execute-peak-rss`, plus the per-shard breakdown uploaded as
-`shard-cycles:<k>` measures. This is also how the constants that OOM as
+(cycles/s), `execute-peak-rss` (max over the per-shard windows), plus the
+per-shard breakdown uploaded as `shard-cycles:<k>` / `shard-time:<k>` /
+`shard-peak-rss:<k>` measures. This is also how the constants that OOM as
 single full-closure leaves get measured at all — under env sharding each
 check fits in one shard, with deps checked in other shards.
 
@@ -161,11 +162,12 @@ Threshold semantics per measure kind:
   Phase 1/2 boundary; the zkVM hosts' execute peak carries the same name);
   bare `peak-rss` is a prove-phase (or, for ooc, whole-check) peak.
 - **`throughput`** — higher-is-better: `upper _`, `lower 0.05–0.10`.
-- **`phase:<span>`, `shard-cycles:<k>`** — uploaded for trend visibility,
-  intentionally left un-thresholded (dynamic names; a re-partition renames
-  the shard keys). The thresholded aggregates (`shards`, `max-shard-cycles`,
-  total `cycles`) do the alerting; the PR-comment drill-down is where
-  per-phase / per-shard attention goes when that view lands.
+- **`phase:<span>`, `shard-{cycles,time,peak-rss}:<k>`** — uploaded for
+  trend visibility, intentionally left un-thresholded (dynamic names; a
+  re-partition renames the shard keys). The thresholded aggregates
+  (`shards`, `max-shard-cycles`, total `cycles`) do the alerting; the
+  PR-comment drill-down is where per-phase / per-shard attention goes when
+  that view lands.
 
 All thresholds are windowed to the per-workload
 `bencher-thresholds-reset-<workload>` tag.
diff --git a/zisk/host/src/main.rs b/zisk/host/src/main.rs
index 80807b1e..f3b1adaa 100644
--- a/zisk/host/src/main.rs
+++ b/zisk/host/src/main.rs
@@ -1335,25 +1335,44 @@ async fn run_shard_plan(
     let mut total_steps = 0u64;
     let mut total_failures = 0u32;
     let mut max_shard_cycles = 0u64;
+    let mut max_shard_peak: Option<u64> = None;
     let mut shard_cycles = serde_json::Map::new();
+    let mut shard_time = serde_json::Map::new();
+    let mut shard_peak_rss = serde_json::Map::new();
     for &(idx, g) in &novel {
       let (check_list, sub_env, _cover) = build_inputs(g)?;
       let stdin = leaf_stdin(0, 0, &sub_env, &check_list);
+      // Windowed RAM high-water: reset before each shard so the per-shard
+      // peaks are independent; the env row's execute-peak-rss is their max.
+      tracing_texray::rss_sampler::reset_peak_tree_rss();
       let result = client.execute(&SHARD_PROGRAM, stdin).run()?.await?;
       let mut buf = [0u8; SHARD_PUBLICS_LEN];
       result.get_public_values_slice(&mut buf);
       let publics = ShardPublics::decode(&buf);
       let cycles = result.get_execution_steps();
+      let exec_secs = result.get_execution_time() as f64 / 1000.0;
+      let peak = peak_rss_bytes();
       total_steps += cycles;
       max_shard_cycles = max_shard_cycles.max(cycles);
+      max_shard_peak = max_shard_peak.max(peak);
       // 1-based zero-padded keys: matches --only-shard's numbering and keeps
-      // the flattened bencher measure list (`shard-cycles:<k>`) sorted.
-      shard_cycles.insert(format!("{:02}", idx + 1), serde_json::json!(cycles));
+      // the flattened bencher measure list (`shard-cycles:<k>`, …) sorted.
+      let key = format!("{:02}", idx + 1);
+      shard_cycles.insert(key.clone(), serde_json::json!(cycles));
+      shard_time.insert(key.clone(), serde_json::json!(exec_secs));
+      if let Some(p) = peak {
+        shard_peak_rss.insert(key, serde_json::json!(p));
+      }
       total_failures = total_failures.saturating_add(publics.failures);
       println!(
-        "  [shard {idx}] {} work items, failures={}, cycles={cycles}",
+        "  [shard {idx}] {} work items, failures={}, cycles={cycles}, \
+         {exec_secs:.1}s, peak {}",
         g.len(),
         publics.failures,
+        peak.map_or("?".to_string(), |p| format!(
+          "{:.2} GiB",
+          p as f64 / (1 << 30) as f64
+        )),
       );
     }
     let execute_secs = t0.elapsed().as_secs_f64();
@@ -1383,8 +1402,12 @@ async fn run_shard_plan(
           "max-shard-cycles": max_shard_cycles,
           "execute-time": (execute_secs * 1e6).round() / 1e6,
           "throughput": tput.round(),
-          "execute-peak-rss": peak_rss_bytes(),
+          // Max over the per-shard windows == the run's execution-phase
+          // high-water (setup RAM excluded by the resets above).
+          "execute-peak-rss": max_shard_peak,
           "shard-cycles": shard_cycles,
+          "shard-time": shard_time,
+          "shard-peak-rss": shard_peak_rss,
         }),
       )?;
     }

From 2bcce38cc863e36c72809eed53bb3f64964f65cb Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Fri, 3 Jul 2026 15:53:51 -0400
Subject: [PATCH 24/27] feat(ix): compile --consts seed selection + `ix shard
 extract` subcommand
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two ways to get a closure-only `.ixe` for one constant:

- `ix compile <file.lean> --consts <n1,n2,…>` (+ --consts-file): seed the
  compile by exact constant name instead of the whole import env, with
  transitive deps via the existing collectDeps — same names vocabulary as
  `ix check --consts`. Mutually exclusive with --module.

- `ix shard extract <env.ixe> --consts <n1,n2,…> --out <sub.ixe>`: the
  sharding pipeline's scoping step — cut the closure out of an EXISTING
  env without recompiling from source. The output carries the closure's
  genuine constant bytes, blobs, and reducibility hints (build_sub_env,
  now shared via sub_env_of), plus a name→address entry per closure
  constant so `--consts`-style tools still resolve. Metadata is dropped
  (real ConstantMeta references name addresses throughout its tree and
  would need the full hash-consed name index) — extracted envs serve the
  anon pipeline; meta-mode tools need the source env. A mutual-block
  member extracts its whole block.

Verified against a fresh initStd.ixe: both forms produce envs whose
bench-typecheck fft-cost is bit-identical to the full-env run
(content addressing at work), and extract → ix profile → ix shard
composes cleanly (Int8.instRxcHasSize_eq: 2.1 MB closure env,
16.08e9 steps → 11 shards at the 120 GiB cap).
---
 Ix/Cli/CompileCmd.lean         |  33 ++++++++-
 Ix/Cli/ShardCmd.lean           |  49 +++++++++++++
 Ix/KernelCheck.lean            |  14 ++++
 crates/ffi/src/kernel.rs       | 127 +++++++++++++++++++++++++++++++++
 crates/kernel/src/anon_work.rs |  60 ++++++++++++++++
 5 files changed, 282 insertions(+), 1 deletion(-)

diff --git a/Ix/Cli/CompileCmd.lean b/Ix/Cli/CompileCmd.lean
index b9b3a38b..19a75862 100644
--- a/Ix/Cli/CompileCmd.lean
+++ b/Ix/Cli/CompileCmd.lean
@@ -58,7 +58,36 @@ def runCompileCmd (p : Cli.Parsed) : IO UInt32 := do
   -- Seeds pass through `collectDeps` for the transitive-dep closure.
   -- Flag name is `--module` (not `--ns`) because the match is against
   -- the source module name, not the decl's own namespace.
-  let constList ← match p.flag? "module" with
+  -- `--consts` / `--consts-file`: seed by EXACT constant name, transitive
+  -- deps via `collectDeps` — a closure-only env (e.g. one benchmark constant
+  -- + deps) instead of the whole import env. Resolution tries `String.toName`
+  -- first, then a displayed-form scan so `_private`/numeric components
+  -- round-trip. Mutually exclusive with `--module`; `--exclude` doesn't
+  -- apply (the seed list is already explicit).
+  let constsSeeds ← Ix.Cli.ConstsFile.gather p
+  if !constsSeeds.isEmpty && (p.flag? "module").isSome then
+    p.printError "error: --consts/--consts-file and --module are mutually exclusive"
+    return 1
+  let constList ←
+    if !constsSeeds.isEmpty then do
+      let mut seeds : List Lean.Name := []
+      let mut missing : List String := []
+      for n in constsSeeds do
+        let name := n.toName
+        if leanEnv.constants.contains name then
+          seeds := name :: seeds
+        else
+          match leanEnv.constants.toList.find? (fun (m, _) => toString m == n) with
+          | some (m, _) => seeds := m :: seeds
+          | none => missing := n :: missing
+      if !missing.isEmpty then
+        p.printError s!"error: no constant(s) named {missing} in the environment"
+        return 1
+      IO.println s!"[compile] consts: {seeds.length} seed constant(s)"
+      let closed := collectDeps leanEnv seeds
+      IO.println s!"[compile] consts: {closed.length} constants after transitive-dep closure"
+      pure closed
+    else match p.flag? "module" with
     | none =>
       if excludeSet.isEmpty then pure leanEnv.constants.toList
       else
@@ -115,6 +144,8 @@ def compileCmd : Cli.Cmd := `[Cli|
 
   FLAGS:
     out            : String; "Output path for serialized Ixon.Env bytes; defaults to the lowercased input file stem with `.ixe` (e.g. CompileMathlib.lean -> compilemathlib.ixe)"
+    consts         : String; "Comma-separated EXACT constant names to compile (transitive deps pulled in automatically) instead of the whole import env — e.g. `Nat.add_comm`. Same flag/shape as `ix check --consts`. Mutually exclusive with --module; --exclude does not apply."
+    "consts-file"  : String; "Additionally read seed constant names from a file (one per line; `#` comments and blank lines ignored). Unions with --consts."
     module         : String; "Comma-separated module-name prefixes to filter on (e.g. 'Tests.Ix.Kernel.TutorialDefs,Tests.Ix.Kernel.NatReduction'). Match is against the SOURCE MODULE a constant came from (via `Lean.Environment.getModuleIdxFor?`), not the constant's own name — so macro-emitted decls that register under unqualified names still get caught when their host module's name matches. Transitive deps are pulled in automatically."
     exclude        : String; "Comma-separated exact Lean.Name(s) to strip from the seed set. Excluded names that are still referenced by another seed will reappear via the transitive-dep closure."
     "exclude-file" : String; "Path to a file with one Lean.Name per line to strip from the seed set. Same semantics as --exclude; same line format as `ix check --consts-file`."
diff --git a/Ix/Cli/ShardCmd.lean b/Ix/Cli/ShardCmd.lean
index 002e9cd7..8bdee963 100644
--- a/Ix/Cli/ShardCmd.lean
+++ b/Ix/Cli/ShardCmd.lean
@@ -14,10 +14,19 @@
   manifest and prints a what-if report (per-shard cost + total cross-shard
   ingress). The partitioner is self-contained — no external graph-library
   dependency.
+
+  `ix shard extract <path.ixe> --consts <n1,n2,…>`: the pipeline's scoping
+  step — extract the named constants' dependency closure from a serialized
+  env into a standalone `.ixe`, without recompiling from source. The output
+  carries the closure's genuine constant bytes, blobs, and reducibility
+  hints, plus each closure constant's name→address entry, so it composes
+  with everything that consumes a `.ixe` (`ix profile` → `ix shard`,
+  `ix check-rs --consts`, the zkVM hosts, `bench-typecheck`).
 -/
 module
 public import Cli
 public import Ix.KernelCheck
+public import Ix.Cli.ConstsFile
 
 public section
 
@@ -25,6 +34,43 @@ open Ix.KernelCheck
 
 namespace Ix.Cli.ShardCmd
 
+def runShardExtractCmd (p : Cli.Parsed) : IO UInt32 := do
+  let some pathArg := p.positionalArg? "path"
+    | p.printError "error: must specify <path> to a .ixe file"
+      return 1
+  let envPath := pathArg.as! String
+  let names ← Ix.Cli.ConstsFile.gather p
+  if names.isEmpty then
+    p.printError "error: pass at least one name via --consts or --consts-file"
+    return 1
+  let outPath : String :=
+    match p.flag? "out" with
+    | some flag => flag.as! String
+    -- Default output: a slug of the first name, relative to the current
+    -- directory: `init.ixe --consts Nat.add_comm` → `nat_add_comm.ixe`.
+    | none =>
+      let slug := names[0]!.map fun c =>
+        if c.isAlphanum then c.toLower else '_'
+      s!"{slug}.ixe"
+  let quiet := !(p.flag? "verbose" |>.isSome)
+  rsEnvExtractFFI envPath names outPath quiet
+  IO.println s!"[extract] wrote {outPath} ({names.size} root name(s))"
+  return 0
+
+def shardExtractCmd : Cli.Cmd := `[Cli|
+  "extract" VIA runShardExtractCmd;
+  "Extract named constants + their dependency closure from a `.ixe` into a standalone `.ixe`"
+
+  FLAGS:
+    consts        : String; "Comma-separated EXACT constant names (displayed form) to extract, e.g. `Nat.add_comm,String.append`. Same flag/shape as `ix check-rs --consts`. A mutual-block member extracts its whole block."
+    "consts-file" : String; "Additionally read names from a file (one per line; `#` comments and blank lines ignored). Unions with --consts."
+    out           : String; "Output `.ixe` path. Defaults to a slug of the first name (e.g. `nat_add_comm.ixe`)."
+    verbose;                "Print extraction details to stderr."
+
+  ARGS:
+    path : String; "Path to the source `.ixe` (e.g. from `ix compile`)."
+]
+
 def runShardCmd (p : Cli.Parsed) : IO UInt32 := do
   let some pathArg := p.positionalArg? "path"
     | p.printError "error: must specify <path> to a .ixprof file"
@@ -87,6 +133,9 @@ def shardCmd : Cli.Cmd := `[Cli|
 
   ARGS:
     path : String; "Path to a .ixprof produced by `ix profile`"
+
+  SUBCOMMANDS:
+    shardExtractCmd
 ]
 
 end
diff --git a/Ix/KernelCheck.lean b/Ix/KernelCheck.lean
index 2213c062..38c5037b 100644
--- a/Ix/KernelCheck.lean
+++ b/Ix/KernelCheck.lean
@@ -168,6 +168,20 @@ opaque rsCheckAnonConstsFFI :
     @& String →                          -- fail-out path ("" = none)
     IO (Array (String × Option CheckError))
 
+/-- FFI: extract the named constants' dependency closure from a serialized
+    env into a standalone `.ixe` — genuine constant bytes, blobs, and
+    reducibility hints, plus the closure constants' Named entries so names
+    still resolve — without recompiling from source. Names resolve like
+    `rsCheckAnonConstsFFI` (displayed form); a mutual-block member pulls its
+    whole block. Errors on an unresolvable name. -/
+@[extern "rs_env_extract"]
+opaque rsEnvExtractFFI :
+    @& String →                          -- source .ixe path
+    @& Array String →                    -- constant names (displayed form)
+    @& String →                          -- output .ixe path
+    @& Bool →                            -- quiet
+    IO Unit
+
 /-- FFI: profile a `.ixe` out of circuit, writing a `.ixprof` sidecar with
     per-block heartbeats + the delta-unfold graph (the sharding cost model,
     see `plans/sharding.md`). Runs the anon kernel over every checkable target.
diff --git a/crates/ffi/src/kernel.rs b/crates/ffi/src/kernel.rs
index 6438f8a6..d4791e46 100644
--- a/crates/ffi/src/kernel.rs
+++ b/crates/ffi/src/kernel.rs
@@ -1860,6 +1860,133 @@ pub extern "C" fn rs_kernel_check_anon_consts(
   build_anon_result_array(&addrs_for_return, &results)
 }
 
+/// FFI: extract the named constants' dependency closure from a serialized
+/// env into a standalone `.ixe` — genuine constant bytes, blobs, and
+/// reducibility hints (via the anon view), plus every closure constant's
+/// Named entry (via the full view) so names still resolve downstream — all
+/// without recompiling from source. Each name extracts its covering work
+/// item (a mutual-block member pulls the whole block).
+#[unsafe(no_mangle)]
+pub extern "C" fn rs_env_extract(
+  env_path: LeanString<LeanBorrowed<'_>>,
+  names: LeanArray<LeanBorrowed<'_>>,
+  out_path: LeanString<LeanBorrowed<'_>>,
+  quiet: LeanBool<LeanBorrowed<'_>>,
+) -> LeanIOResult<LeanOwned> {
+  use ix_kernel::anon_work::{
+    block_of_addr, build_anon_work, build_sub_env_named, work_block_addr,
+  };
+
+  let quiet = quiet.to_bool();
+  let path = env_path.to_string();
+  let out = out_path.to_string();
+  let names_vec: Vec<String> = names.map(|obj| obj.as_string().to_string());
+  if names_vec.is_empty() {
+    return LeanIOResult::error_string(
+      "rs_env_extract: no constant names given",
+    );
+  }
+
+  let bytes = match std::fs::read(&path) {
+    Ok(bytes) => bytes,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_env_extract: failed to read {path}: {e}"
+      ));
+    },
+  };
+  let mut slice: &[u8] = &bytes;
+  let full = match IxonEnv::get(&mut slice) {
+    Ok(env) => env,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_env_extract: failed to deserialize {path}: {e}"
+      ));
+    },
+  };
+  // Resolve displayed names → addresses through the full env's `named`
+  // metadata (the anon view discards it).
+  let by_name: FxHashMap<String, Address> = full
+    .named
+    .iter()
+    .map(|e| (e.key().to_string(), e.value().addr.clone()))
+    .collect();
+  let mut resolved: Vec<Address> = Vec::with_capacity(names_vec.len());
+  let mut missing: Vec<&str> = Vec::new();
+  for n in &names_vec {
+    match by_name.get(n.as_str()) {
+      Some(a) => resolved.push(a.clone()),
+      None => missing.push(n),
+    }
+  }
+  if !missing.is_empty() {
+    return LeanIOResult::error_string(&format!(
+      "rs_env_extract: no constant(s) named [{}] in {path}",
+      missing.join(", ")
+    ));
+  }
+
+  let mut slice: &[u8] = &bytes;
+  let anon = match IxonEnv::get_anon(&mut slice) {
+    Ok(env) => env,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_env_extract: failed to deserialize (anon) {path}: {e}"
+      ));
+    },
+  };
+
+  // Roots: each name's covering work item's proven targets (standalone →
+  // itself; a mutual-block member → every sibling, checked atomically).
+  let work = match build_anon_work(&anon) {
+    Ok(work) => work,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_env_extract: build_anon_work: {e}"
+      ));
+    },
+  };
+  let by_block: FxHashMap<Address, usize> = work
+    .iter()
+    .enumerate()
+    .map(|(i, w)| (work_block_addr(&anon, w), i))
+    .collect();
+  let mut roots: Vec<Address> = Vec::new();
+  for addr in &resolved {
+    let block = block_of_addr(&anon, addr);
+    match by_block.get(&block) {
+      Some(&i) => roots.extend(work[i].proven_targets()),
+      None => {
+        return LeanIOResult::error_string(&format!(
+          "rs_env_extract: no work item covers block {}…",
+          &block.hex()[..16]
+        ));
+      },
+    }
+  }
+
+  let sub_bytes = match build_sub_env_named(&anon, &full, &roots) {
+    Ok(b) => b,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!("rs_env_extract: {e}"));
+    },
+  };
+  if let Err(e) = std::fs::write(&out, &sub_bytes) {
+    return LeanIOResult::error_string(&format!(
+      "rs_env_extract: failed to write {out}: {e}"
+    ));
+  }
+  if !quiet {
+    eprintln!(
+      "[rs_env_extract] {} name(s) → {} ({} bytes) from {path}",
+      names_vec.len(),
+      out,
+      sub_bytes.len(),
+    );
+  }
+  LeanIOResult::ok(LeanOwned::box_usize(0))
+}
+
 // ===========================================================================
 // Sharding profiler: run the anon kernel out of circuit over a `.ixe`,
 // recording per-block heartbeats + the delta-unfold graph into a `.ixprof`.
diff --git a/crates/kernel/src/anon_work.rs b/crates/kernel/src/anon_work.rs
index be0aae24..1f5be74d 100644
--- a/crates/kernel/src/anon_work.rs
+++ b/crates/kernel/src/anon_work.rs
@@ -279,6 +279,17 @@ pub fn build_sub_env(
   source: &IxonEnv,
   roots: &[Address],
 ) -> Result<Vec<u8>, String> {
+  let sub = sub_env_of(source, roots);
+  let mut buf = Vec::new();
+  sub.put(&mut buf).map_err(|e| format!("sub-env serialize: {e}"))?;
+  Ok(buf)
+}
+
+/// The in-memory closure sub-env behind [`build_sub_env`]: copy the BFS
+/// dependency closure of `roots` (genuine bytes + blobs + reducibility
+/// hints), no Named section.
+#[cfg(not(target_arch = "riscv64"))]
+fn sub_env_of(source: &IxonEnv, roots: &[Address]) -> IxonEnv {
   let closure = closure_addrs(source, roots);
   let mut sub = IxonEnv::new();
   for addr in &closure {
@@ -296,6 +307,55 @@ pub fn build_sub_env(
       sub.anon_hints.insert(addr.clone(), *h);
     }
   }
+  sub
+}
+
+/// [`build_sub_env`] plus a name→address entry for every closure constant
+/// named in the FULL view of the same env — a standalone `.ixe` whose names
+/// still resolve (for `--consts`-style tools), extracted without recompiling
+/// from source.
+///
+/// METADATA IS DROPPED: each copied entry is `Named::with_addr` (empty
+/// `ConstantMeta`), because real metadata references name addresses
+/// throughout its tree and carrying it would require the full env's
+/// hash-consed name index. The extract serves the ANON pipeline
+/// (`check-rs --anon`, the zkVM hosts, `ix profile`/`ix shard`,
+/// `bench-typecheck`), where metadata is never read and reducibility hints
+/// travel in the `anon_hints` section instead. Meta-mode tools need the
+/// source env.
+#[cfg(not(target_arch = "riscv64"))]
+pub fn build_sub_env_named(
+  source: &IxonEnv,
+  full: &IxonEnv,
+  roots: &[Address],
+) -> Result<Vec<u8>, String> {
+  use ix_common::env::NameData;
+
+  let sub = sub_env_of(source, roots);
+  // The Named section serializes keys as name HASHES resolved through the
+  // names section, so each key's component chain must be interned too.
+  fn intern_chain(sub: &IxonEnv, name: &ix_common::env::Name) {
+    let addr = Address::from_blake3_hash(*name.get_hash());
+    if sub.get_name(&addr).is_some() {
+      return;
+    }
+    match name.as_data() {
+      NameData::Anonymous(_) => {},
+      NameData::Str(parent, _, _) | NameData::Num(parent, _, _) => {
+        intern_chain(sub, parent);
+      },
+    }
+    sub.store_name(addr, name.clone());
+  }
+  for e in full.named.iter() {
+    if sub.get_const(&e.value().addr).is_some() {
+      intern_chain(&sub, e.key());
+      sub.register_name(
+        e.key().clone(),
+        ixon::env::Named::with_addr(e.value().addr.clone()),
+      );
+    }
+  }
   let mut buf = Vec::new();
   sub.put(&mut buf).map_err(|e| format!("sub-env serialize: {e}"))?;
   Ok(buf)

From 7d27c6ba5f27607b93c62837982a463af7d4b362 Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Fri, 3 Jul 2026 15:53:51 -0400
Subject: [PATCH 25/27] feat(ci): closure-sharded zisk execute for heavy
 primaries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the whole-env sharded run (wrong scope: all of InitStd is
1.94e12 steps / 1263 shards, and its biggest atomic block is INFEASIBLE
under the cap) with per-constant closure sharding through the canonical
pipeline. For each heavy-tier primary, run.sh runs `ix shard extract`
(closure-only env, no recompile) → `ix profile` → `ix shard --max-ram
120`, then one `zisk-host --shard-plan --execute` run executes the
shards sequentially — the constant's row carries totals plus the
per-shard shard-{cycles,time,peak-rss}:<k> breakdown, uploaded to
bencher like every other measure. Cheap primaries keep the single-leaf
--consts run.

- run.sh: cut_closure_shards (also exposed as the `cutshards` backend so
  bench-main's compile job pre-cuts through the same code path) +
  heavy-tier dispatch in the zisk loop (ZISK_HEAVY_NAMES). A constant
  whose partition still can't fit (atomic mutual block over the cap)
  OOMs under the RAM watchdog: honest OOM row, remaining shards skipped,
  loop proceeds to the next constant. Cutting failures fall back to the
  single-leaf run.
- bench.py manifest --heavy-out: the selected heavy-tier names, from
  Vectors.csv's tier column.
- bench-main: the compile job pre-cuts zkshards-<Bench>/ next to the
  fresh .ixe (it has ix + the toolchain; the zkvm job stays Lean-free)
  and ships it to the zkvm job in the sha-keyed bench-ixe-* cache entry.
- bench-pr: the PR side cuts its own artifacts fresh every run (cheap —
  seconds per closure — and a PR can change the cost profile; profiling
  counts heartbeats, so an unchanged tree re-partitions
  deterministically); the base fallback reuses bench-main's pre-cut dir.
---
 .github/scripts/bench.py         |  13 +++-
 .github/scripts/run.sh           | 108 ++++++++++++++++++++-----------
 .github/workflows/bench-main.yml |  55 +++++++++-------
 .github/workflows/bench-pr.yml   |  40 +++---------
 docs/benchmarking.md             |  46 ++++++++-----
 5 files changed, 154 insertions(+), 108 deletions(-)

diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index 66e1851b..cd15397e 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -175,7 +175,7 @@ def cmd_manifest(a):
     # every primary (RAM watchdog catches OOMs), so all primaries are selected
     # here regardless of tier.
     tier = a.tier or ("cheap" if (a.mode == "prove" and not a.primary) else "all")
-    names = []
+    names, heavy = [], []
     with open(a.csv) as f:
         for line in f:
             row = line.rstrip("\n")
@@ -199,8 +199,16 @@ def cmd_manifest(a):
             if a.shard == "1" and shard != "1":
                 continue
             names.append(name)
+            if ctier == "heavy":
+                heavy.append(name)
     with open(a.out, "w") as f:
         f.write("\n".join(names) + ("\n" if names else ""))
+    # The selected names that are heavy-tier — the subset the zisk cells run
+    # through the closure-sharded pipeline (`ix shard extract` → `ix profile`
+    # → `ix shard`) instead of a single full-closure leaf.
+    if a.heavy_out:
+        with open(a.heavy_out, "w") as f:
+            f.write("\n".join(heavy) + ("\n" if heavy else ""))
     print(f"count={len(names)}\ntier={tier}")
 
 
@@ -625,6 +633,9 @@ def main():
                         "which doesn't consume Vectors.csv).")
     m.add_argument("--primary", action="store_true",
                    help="Restrict to the primary subset (the primary=1 column).")
+    m.add_argument("--heavy-out", dest="heavy_out",
+                   help="Also write the selected heavy-tier names (one per "
+                        "line) — the subset zisk runs closure-sharded.")
     m.set_defaults(fn=cmd_manifest)
 
     b = sub.add_parser("bmf")
diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
index f130e0ee..533c3fb1 100644
--- a/.github/scripts/run.sh
+++ b/.github/scripts/run.sh
@@ -7,8 +7,8 @@
 #   run.sh <repo_dir> <env> <backend> <mode> <names_file> <out_json>
 #     repo_dir : checked-out worktree (has .lake/build/bin/{ix,bench-typecheck})
 #     env      : initStd | lean | mathlib  (any case; used verbatim for <env>.ixe)
-#     backend  : aiur | zisk | sp1 | ooc | compile
-#     mode     : execute | prove | compile
+#     backend  : aiur | zisk | sp1 | ooc | compile | cutshards
+#     mode     : execute | prove | compile (ignored by cutshards)
 #
 # `ix` / `bench-typecheck` come from <repo_dir> (so base measures base's code, PR
 # the PR's — the caller puts <repo_dir>/.lake/build/bin on PATH). For the
@@ -133,6 +133,35 @@ resolve_bin() {  # <name> → prints the path, or fails
 tmp=$(mktemp -d)
 compile_log="$tmp/compile.log"
 
+# Closure-shard artifacts for the zisk heavy tier: `ix shard extract` cuts a
+# standalone closure-only env (no recompile), `ix profile` → `ix shard` cut
+# its manifest (the canonical partitioner: profiled heartbeats + min-cut,
+# capped by predicted RAM). One dir per env; slugs must match the zkvm loop's
+# result keys (same `tr` set).
+shards_dir_for_env() { printf '%s' "$repo/zkshards-$benv"; }
+cut_closure_shards() {  # <name> <slug> → 0 when <dir>/$slug.{ixe,ixes} are ready
+  local c="$1" slug="$2" dir ix_bin rc
+  dir=$(shards_dir_for_env)
+  [ -f "$dir/$slug.ixes" ] && [ -f "$dir/$slug.ixe" ] && return 0
+  ix_bin=$(resolve_bin ix 2>/dev/null) || {
+    echo "::warning::no ix binary to cut closure shards for '$c'" >&2
+    return 1
+  }
+  mkdir -p "$dir"
+  echo "::group::ix shard extract + profile + shard: $c"
+  "$ix_bin" shard extract "$ixe" --consts "$c" --out "$dir/$slug.ixe" \
+    && "$ix_bin" profile "$dir/$slug.ixe" --out "$dir/$slug.ixprof" \
+    && "$ix_bin" shard "$dir/$slug.ixprof" \
+         --max-ram "${SHARD_MAX_RAM_GB:-120}" --out "$dir/$slug.ixes"
+  rc=$?
+  echo "::endgroup::"
+  [ "$rc" -eq 0 ] || {
+    echo "::warning::extract/profile/shard failed for '$c'" >&2
+    rm -f "$dir/$slug.ixes"
+    return 1
+  }
+}
+
 # `compile` backend needs a fresh compile to measure — never honor REUSE_IXE.
 ixe="$repo/$benv.ixe"
 if [ "${REUSE_IXE:-0}" = 1 ] && [ "$backend" != compile ] && [ -f "$ixe" ]; then
@@ -247,14 +276,14 @@ case "$backend" in
     # plain subshell wrapper's pgid would be run.sh's own. The host writes
     # $res only on a clean (zero-failure) run; `$out` is re-merged per run so
     # a job-level kill keeps completed rows.
-    zkvm_run() {  # <timeout> <key> <host args…>
-      local run_timeout="$1" key="$2"; shift 2
+    zkvm_run() {  # <timeout> <key> <ixe> <host args…>
+      local run_timeout="$1" key="$2" run_ixe="$3"; shift 3
       local slug; slug=$(printf '%s' "$key" | tr '/ .:' '____')
       local res="$tmp/$slug.json" log="$tmp/$slug.log" oom="$tmp/$slug.oom"
       local spans="$res.spans" zk_pid w_pid zk_exit
       rm -f "$oom"
       ( cd "$work" && exec setsid timeout "$run_timeout" "$bin" --execute \
-          --ixe "$ixe" --json "$res" --texray "$@" ) \
+          --ixe "$run_ixe" --json "$res" --texray "$@" ) \
         > "$log" 2>&1 &
       zk_pid=$!
       watch_ram_kill "$zk_pid" "$ceiling_gb" "$oom" &
@@ -277,44 +306,38 @@ case "$backend" in
         jq -s 'reduce .[] as $o ({}; . + $o)' "$rows"/*.json > "$out" 2>/dev/null || true
       fi
     }
+    # Closure-sharded pipeline for the heavy tier (zisk only). A heavy
+    # constant's full closure blows the runner's RAM as a single leaf, so it
+    # runs as its shard-manifest partition instead: `ix shard extract` cuts a
+    # standalone closure-only env, `ix profile` → `ix shard` cut the manifest
+    # (the canonical partitioner: profiled heartbeats + min-cut, capped by
+    # predicted RAM), and one `--shard-plan` host run executes the shards
+    # sequentially, emitting the constant's row (totals + per-shard
+    # breakdown). bench-main pre-cuts the artifacts in the compile job and
+    # ships them via cache; the PR side cuts its own — a PR can change the
+    # cost profile, and profiling counts heartbeats (not wall time) so an
+    # unchanged tree re-partitions deterministically. If cutting isn't
+    # possible (no ix binary, or a failure), fall back to the single-leaf
+    # run — the watchdog then records the honest OOM row.
+    heavy_file="${ZISK_HEAVY_NAMES:-}"
+    is_heavy() {  # <name> → 0 iff <name> is an exact whole line of $ZISK_HEAVY_NAMES
+      [ -n "$heavy_file" ] && [ -f "$heavy_file" ] && grep -qxF -- "$1" "$heavy_file"
+    }
+    shards_dir=$(shards_dir_for_env)
     # Full-closure check (no --skip-deps) so this is directly comparable to
     # the ooc `ix check-rs --anon --consts` run — the delta then isolates the
     # in-circuit-vs-out-of-circuit overhead rather than mixing in subject-
     # only vs full-closure scope.
     while IFS= read -r c; do
       [ -z "$c" ] && continue
-      zkvm_run "${ZKVM_EXECUTE_TIMEOUT:-25m}" "$c" --consts "$c"
-    done < "$names"
-    # Env-sharded execute (zisk only): execute the WHOLE env as its manifest
-    # shards and merge one env-keyed row — totals plus a per-shard
-    # `shard-cycles` breakdown — alongside the per-constant rows. The
-    # manifest comes from bench-main's compile-job cache when it was restored
-    # next to the `.ixe`; otherwise (the !benchmark PR side, a cold base
-    # fallback) it is cut fresh HERE — each side profiles its own tree, since
-    # a PR can change the cost profile, and the profiler counts heartbeats
-    # (not wall time) so an unchanged tree re-partitions deterministically.
-    # ZISK_ENV_SHARD=0 skips the whole run (the partial base fallback, where
-    # bencher already holds main's env row).
-    plan_ixes="${ixe%.ixe}.ixes"
-    if [ "$backend" = zisk ] && [ "${ZISK_ENV_SHARD:-1}" = 1 ]; then
-      if [ ! -f "$plan_ixes" ]; then
-        if ix_bin=$(resolve_bin ix 2>/dev/null); then
-          echo "::group::ix profile + shard → $plan_ixes"
-          "$ix_bin" profile "$ixe" --out "${ixe%.ixe}.ixprof" \
-            && "$ix_bin" shard "${ixe%.ixe}.ixprof" \
-                 --max-ram "${SHARD_MAX_RAM_GB:-120}" --out "$plan_ixes" \
-            || { echo "::warning::ix profile/shard failed; skipping env-sharded run" >&2
-                 rm -f "$plan_ixes"; }
-          echo "::endgroup::"
-        else
-          echo "::warning::no ix binary to cut $plan_ixes; skipping env-sharded run" >&2
-        fi
-      fi
-      if [ -f "$plan_ixes" ]; then
-        zkvm_run "${ZISK_ENV_EXECUTE_TIMEOUT:-60m}" "$benv_cc" \
-          --shard-plan "$plan_ixes" --json-name "$benv_cc"
+      slug=$(printf '%s' "$c" | tr '/ .:' '____')
+      if [ "$backend" = zisk ] && is_heavy "$c" && cut_closure_shards "$c" "$slug"; then
+        zkvm_run "${ZKVM_EXECUTE_TIMEOUT:-25m}" "$c" "$shards_dir/$slug.ixe" \
+          --shard-plan "$shards_dir/$slug.ixes" --json-name "$c"
+      else
+        zkvm_run "${ZKVM_EXECUTE_TIMEOUT:-25m}" "$c" "$ixe" --consts "$c"
       fi
-    fi
+    done < "$names"
     emit_empty
     ;;
 
@@ -359,6 +382,19 @@ case "$backend" in
     emit_empty
     ;;
 
+  cutshards)
+    # Pre-cut the closure-shard artifacts for every name (bench-main's
+    # compile job — it has `ix` + the Lean toolchain next to the fresh
+    # `.ixe`, so the zkvm job stays Lean-free and just restores the dir).
+    # Exactly the artifacts the zisk branch cuts lazily when absent.
+    while IFS= read -r c; do
+      [ -z "$c" ] && continue
+      slug=$(printf '%s' "$c" | tr '/ .:' '____')
+      cut_closure_shards "$c" "$slug" || true
+    done < "$names"
+    emit_empty
+    ;;
+
   compile)
     # `ix compile <env>.lean → <env>.ixe` is the benchmark; the compile step
     # above always ran fresh for this backend (REUSE_IXE ignored) and teed to
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index ab9670d7..fbfbbf28 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -130,28 +130,33 @@ jobs:
           bash .github/scripts/run.sh . ${{ matrix.bench }} compile compile /dev/null neutral.json
           python3 .github/scripts/bench.py bmf --in neutral.json --out benchmark.json
           cat benchmark.json
-      # Profile the env out of circuit and cut a shard manifest (`ix profile`
-      # → `ix shard`) for the zkvm-execute job's env-sharded run. Profiling is
-      # cache-isolated (sound heartbeats), so it costs a full un-memoized
-      # kernel pass — done once here, next to the freshly compiled `.ixe`,
-      # rather than in every consumer. --max-ram caps each shard's predicted
-      # prove RAM under the runner class's ceiling
-      # (docs/zisk-cycle-cost-model.md) — conservative for execute-only runs.
+      # Pre-cut the zisk closure-shard artifacts for the heavy primaries
+      # (`ix shard extract` → `ix profile` → `ix shard`, via run.sh's cutshards
+      # backend — the same code path the !benchmark PR side runs lazily).
+      # Done here, next to the fresh `.ixe` with `ix` + the Lean toolchain
+      # on hand, so the zkvm job stays Lean-free and just restores the dir.
       - if: matrix.bench == 'InitStd' || matrix.bench == 'Mathlib'
-        name: Profile + shard plan
+        name: Cut closure shards for heavy primaries
+        env:
+          REUSE_IXE: "1"
         run: |
-          ix profile ${{ matrix.bench }}.ixe --out ${{ matrix.bench }}.ixprof
-          ix shard ${{ matrix.bench }}.ixprof --max-ram 120 --out ${{ matrix.bench }}.ixes
-      # Cache the `.ixe` + shard manifest for the prove/zkvm jobs (reused,
-      # never regenerated there). Only the matrix jobs those consume, to stay
-      # under the repo cache limit. NB: every restore of this key must list
-      # the SAME paths — actions/cache versions the entry by its path list.
+          benv="${{ matrix.bench }}"; benv="${benv,}"
+          python3 .github/scripts/bench.py manifest \
+            --csv Benchmarks/Vectors.csv --env "$benv" --mode execute --primary \
+            --heavy-out heavy.txt --out /dev/null
+          echo "cutting closure shards for $(wc -l < heavy.txt) heavy primaries:"; cat heavy.txt
+          bash .github/scripts/run.sh . "${{ matrix.bench }}" cutshards execute heavy.txt /dev/null
+      # Cache the `.ixe` + closure-shard artifacts for the prove/zkvm jobs
+      # (reused, never regenerated there). Only the matrix jobs those
+      # consume, to stay under the repo cache limit. NB: every restore of
+      # this key must list the SAME paths — actions/cache versions the entry
+      # by its path list.
       - if: matrix.bench == 'InitStd' || matrix.bench == 'Mathlib'
         uses: actions/cache/save@v5
         with:
           path: |
             ${{ matrix.bench }}.ixe
-            ${{ matrix.bench }}.ixes
+            zkshards-${{ matrix.bench }}
           key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
       # Upload compile metrics. Every measure shares the per-workload baseline
       # window (data points since the ix-compile reset tag): file-size/constants
@@ -212,7 +217,7 @@ jobs:
         with:
           path: |
             ${{ matrix.bench }}.ixe
-            ${{ matrix.bench }}.ixes
+            zkshards-${{ matrix.bench }}
           key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
           fail-on-cache-miss: true
       # Run each constant in its own process so a clean failure or timeout drops
@@ -323,14 +328,15 @@ jobs:
       - name: Install SP1
         if: matrix.backend == 'sp1'
         uses: ./.github/actions/install-sp1
-      # Pull the `.ixe` + shard manifest the compile job built — no recompile
-      # (REUSE_IXE). run.sh's zisk branch adds the env-sharded execute row
-      # when the `.ixes` is present next to the `.ixe`.
+      # Pull the `.ixe` + the pre-cut closure-shard artifacts the compile job
+      # built — no recompile (REUSE_IXE), no `ix` needed here: run.sh's zisk
+      # branch runs each heavy primary as its shard-manifest partition when
+      # its artifacts are present in zkshards-<Bench>/.
       - uses: actions/cache/restore@v5
         with:
           path: |
             ${{ matrix.bench }}.ixe
-            ${{ matrix.bench }}.ixes
+            zkshards-${{ matrix.bench }}
           key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
           fail-on-cache-miss: true
       - name: Run ${{ matrix.backend }} execute benchmark
@@ -338,12 +344,13 @@ jobs:
           REUSE_IXE: "1"
         run: |
           # The primary subset in execute mode = all primaries for the env
-          # (cheap + heavy); execute handles the heavy ones, unlike prove.
+          # (cheap + heavy); heavy ones run closure-sharded on zisk.
           benv="${{ matrix.bench }}"; benv="${benv,}"   # InitStd→initStd
           python3 .github/scripts/bench.py manifest \
             --csv Benchmarks/Vectors.csv --env "$benv" --mode execute --primary \
-            --out names.txt
-          echo "executing $(wc -l < names.txt) primary constants:"; cat names.txt
+            --heavy-out heavy.txt --out names.txt
+          echo "executing $(wc -l < names.txt) primary constants ($(wc -l < heavy.txt) heavy):"; cat names.txt
+          export ZISK_HEAVY_NAMES="$GITHUB_WORKSPACE/heavy.txt"
           bash .github/scripts/run.sh . ${{ matrix.bench }} ${{ matrix.backend }} execute \
             names.txt neutral.json
           # neutral → Bencher Metric Format (phases flattened; the boolean
@@ -420,7 +427,7 @@ jobs:
         with:
           path: |
             ${{ matrix.bench }}.ixe
-            ${{ matrix.bench }}.ixes
+            zkshards-${{ matrix.bench }}
           key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
           fail-on-cache-miss: true
       # run.sh ooc runs `ix check --anon` (whole env, parallel) and emits the
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index af1fae59..9e5a65ac 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -174,7 +174,12 @@ jobs:
             --csv Benchmarks/Vectors.csv --env "$BENV" --mode "$MODE" \
             --backend "$BACKEND" \
             --tier "$TIER" --shard "$SHARD" $PRIMARY --out "$GITHUB_WORKSPACE/names.txt" \
+            --heavy-out "$GITHUB_WORKSPACE/heavy.txt" \
             | tee -a "$GITHUB_OUTPUT"
+          # Heavy-tier names run closure-sharded on zisk (run.sh cuts the
+          # artifacts with `ix shard extract` → `ix profile` → `ix shard` when
+          # they aren't already cached).
+          echo "ZISK_HEAVY_NAMES=$GITHUB_WORKSPACE/heavy.txt" >> "$GITHUB_ENV"
 
       # Restore the once-built PR binaries (see the build job) and stage them
       # into .bins/pr — ~/.local/bin is reused below for the base side's cache
@@ -209,16 +214,6 @@ jobs:
         with:
           path: ${{ matrix.cell.env }}.ixe
           key: bench-pr-ixe-${{ env.HEAD_SHA }}-${{ matrix.cell.env }}
-      # The PR's shard manifest (run.sh cuts it via ix profile → ix shard for
-      # the env-sharded zisk run — the PR side partitions its OWN tree). Cached
-      # separately from the `.ixe`: only zisk cells produce it.
-      - name: Restore PR shard manifest
-        if: matrix.cell.backend == 'zisk'
-        id: pr-ixes
-        uses: actions/cache/restore@v5
-        with:
-          path: ${{ matrix.cell.env }}.ixes
-          key: bench-pr-ixes-${{ env.HEAD_SHA }}-${{ matrix.cell.env }}
       # zkVM cells additionally need the Rust toolchain + the backend's toolchain
       # and system deps (the shared composite install actions).
       - name: Set up zkVM Rust toolchain
@@ -305,10 +300,10 @@ jobs:
         uses: actions/cache/restore@v5
         with:
           # Path list must match bench-main's compile-job save (actions/cache
-          # versions entries by path list); the `.ixes` itself is unused here.
+          # versions entries by path list).
           path: |
             ${{ steps.envcc.outputs.cc }}.ixe
-            ${{ steps.envcc.outputs.cc }}.ixes
+            zkshards-${{ steps.envcc.outputs.cc }}
           key: bench-ixe-${{ env.BASE_SHA }}-${{ steps.envcc.outputs.cc }}
       # Cached base binaries are usable only when the two `lean-toolchain`
       # files are identical (plain `cmp`), and — for the mathlib env — only
@@ -348,9 +343,9 @@ jobs:
         run: |
           if [ "${{ steps.base-ixe.outputs.cache-hit }}" = true ]; then
             mv "${{ steps.envcc.outputs.cc }}.ixe" "base/$BENV.ixe"
-            # bench-main's shard manifest rides the same cache entry; with it
-            # in place run.sh skips re-profiling the base tree.
-            mv "${{ steps.envcc.outputs.cc }}.ixes" "base/$BENV.ixes" 2>/dev/null || true
+            # bench-main's pre-cut closure shards ride the same cache entry;
+            # with them in place, run.sh skips re-cutting them for the base tree.
+            mv "zkshards-${{ steps.envcc.outputs.cc }}" "base/zkshards-$BENV" 2>/dev/null || true
             export REUSE_IXE=1
           fi
           if [ "${{ steps.base-src.outputs.cached }}" = true ]; then
@@ -358,10 +353,6 @@ jobs:
           else
             export PATH="$PWD/base/.lake/build/bin:$PATH"
           fi
-          # The zisk env-sharded run is only worth paying on the FULL
-          # fallback — a partial miss means bencher already holds main's
-          # env-keyed row (it is never in missing.txt).
-          export ZISK_ENV_SHARD=$([ "${{ steps.bencher.outputs.base-names }}" = names.txt ] && echo 1 || echo 0)
           bash .github/scripts/run.sh base "$BENV" "$BACKEND" "$MODE" \
             "$GITHUB_WORKSPACE/${{ steps.bencher.outputs.base-names }}" \
             "$GITHUB_WORKSPACE/base.json"
@@ -394,17 +385,6 @@ jobs:
         with:
           path: ${{ matrix.cell.env }}.ixe
           key: bench-pr-ixe-${{ env.HEAD_SHA }}-${{ matrix.cell.env }}
-      # hashFiles guard: skip (instead of erroring) when profile/shard failed
-      # and run.sh dropped the env-sharded run.
-      - name: Save PR shard manifest
-        if: >-
-          matrix.cell.backend == 'zisk' &&
-          steps.pr-ixes.outputs.cache-hit != 'true' &&
-          hashFiles(format('{0}.ixes', matrix.cell.env)) != ''
-        uses: actions/cache/save@v5
-        with:
-          path: ${{ matrix.cell.env }}.ixes
-          key: bench-pr-ixes-${{ env.HEAD_SHA }}-${{ matrix.cell.env }}
 
       # ---------- compare ----------
       - name: Build comparison table
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index e122ecd3..4caf314e 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -19,7 +19,7 @@ the same backend drivers:
 | backend | what it measures | metrics |
 |---|---|---|
 | `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove; each fresh proof is also verified) | `fft-cost`, `execute-time`, `execute-peak-rss`, `prove-time`, `verify-time`, `proof-size`, `peak-rss` |
-| `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `execute-peak-rss`; zisk's env-sharded row adds `shards`, `max-shard-cycles`, `shard-cycles:<k>` |
+| `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `execute-peak-rss`; zisk's closure-sharded heavy rows add `shards`, `max-shard-cycles`, `shard-{cycles,time,peak-rss}:<k>` |
 | `ooc` | the same kernel run **out-of-circuit and in parallel** (`ix check-rs`) — far faster | `throughput`, `check-time`, `peak-rss` |
 | `compile` | `ix compile <env>.lean → <env>.ixe` on the current PR — measures the compile step itself, keyed by CamelCase env slug (`InitStd`, `Lean`, `Mathlib`, `FLT`) | `compile-time`, `throughput`, `file-size`, `constants` |
 
@@ -129,25 +129,37 @@ CI); the host still builds + unit-tests on every PR via ci.yml. To
 re-enable, uncomment sp1 in two places: the zkvm-execute matrix cell in
 `bench-main.yml` and the Install SP1 step in `bench-pr.yml`.
 
-The zisk job additionally executes the **whole env** as its shard-manifest
-partition, merging one env-keyed row (`InitStd` / `Mathlib`): total
+**Heavy primaries run closure-sharded on zisk.** A heavy constant's full
+closure blows the runner's RAM as a single guest leaf, so it executes as its
+shard-manifest partition instead: `ix shard extract <Env>.ixe --consts <name>`
+cuts a standalone closure-only env (no recompile; anon-faithful — identical
+addresses and fft-cost as the full env), `ix profile` → `ix shard --max-ram
+120` cut the manifest (the canonical partitioner: profiled heartbeats +
+min-cut, capped by predicted RAM), and one `zisk-host --shard-plan` run
+executes the shards sequentially. The constant's row then carries total
 `cycles`, `shards`, `max-shard-cycles`, `execute-time`, `throughput`
 (cycles/s), `execute-peak-rss` (max over the per-shard windows), plus the
 per-shard breakdown uploaded as `shard-cycles:<k>` / `shard-time:<k>` /
-`shard-peak-rss:<k>` measures. This is also how the constants that OOM as
-single full-closure leaves get measured at all — under env sharding each
-check fits in one shard, with deps checked in other shards.
-
-The manifest (`ix profile <Env>.ixe` → `ix shard --max-ram 120` →
-`<Env>.ixes`) is cut once per tree: bench-main's compile job generates it
-after the compile benchmark and caches it next to the `.ixe`; on the
-`!benchmark` PR path, run.sh cuts it in-place with the side's own `ix`
-(cached per head SHA) — each side partitions its **own** tree, because a PR
-can change the cost profile, and the profiler counts heartbeats rather than
-wall time so an unchanged tree re-partitions deterministically. The base
-fallback reuses bench-main's cached manifest and only pays the env-sharded
-run on a full bencher miss (a partial miss means bencher already holds
-main's env row; `ZISK_ENV_SHARD=0`).
+`shard-peak-rss:<k>` measures. Cheap primaries keep the plain single-leaf
+`--consts` run.
+
+The artifacts live in `zkshards-<Env>/` and are cut once per tree:
+bench-main's compile job pre-cuts them (run.sh's `cutshards` backend — it
+has `ix`, the toolchain, and the fresh `.ixe`; the zkvm job stays Lean-free)
+and ships them in the `bench-ixe-*` cache; on the `!benchmark` PR path,
+run.sh cuts them fresh in-place with the side's own `ix` (cheap: seconds per
+closure) — each side partitions its **own** tree, because
+a PR can change the cost profile, and the profiler counts heartbeats rather
+than wall time, so an unchanged tree re-partitions deterministically. The
+heavy set comes from Vectors.csv's tier column via `bench.py manifest
+--heavy-out` (`ZISK_HEAVY_NAMES`).
+
+A constant whose partition still can't fit — an atomic mutual block above
+the cap (`ix shard` flags it INFEASIBLE) — OOMs its shard under the RAM
+watchdog like any other over-ceiling run: the constant gets the honest OOM
+row, its remaining shards are skipped, and the loop proceeds to the next
+constant. If cutting fails entirely (no `ix` on PATH, extract error), the
+constant falls back to the single-leaf run.
 
 Threshold semantics per measure kind:
 - **`constants`** — pinned exactly (0/0). A definitional count; either

From 469436d66d25ae353a0ea56063233ac6ee70204d Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Fri, 3 Jul 2026 16:33:49 -0400
Subject: [PATCH 26/27] =?UTF-8?q?feat(bench):=20reject=20typecheck=20failu?=
 =?UTF-8?q?res=20loudly=20=E2=80=94=20fail=20fast,=20=E2=9D=8C=20row,=20re?=
 =?UTF-8?q?d=20job?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A constant the kernel rejects is a correctness regression, not a
benchmark datum. End to end:

- zisk-host: every sharded loop (shard-plan execute + prove, whole-env
  execute + leaf prove) bails on the FIRST failing shard via the shared
  reject_failures helper — mirroring the OOM kill, which also cancels
  the constant's remaining shards — instead of accumulating failures
  across the full manifest. sp1 and the single-leaf paths already bailed
  immediately.
- run.sh: a zkVM failure whose log carries the host's "kernel typecheck
  produced" abort records the neutral `{"failed": true}` sentinel (an
  ::error:: annotation, not a silent drop); ooc does the same when
  ##check## reports failures. bench-typecheck marks Phase-1 check errors
  failed and skips them in Phase 2. Failure warnings now print the log's
  head AND tail (the host's abort lands at the end of a mid-manifest
  log).
- bench.py compare: `failed` renders ❌ cells (outranking OOM) plus a
  bold "FAILED TO TYPECHECK on the <side> side" note under the table;
  bmf strips the sentinel so rejected rows never reach bencher.
- workflows: prove/zkvm/ooc jobs (bench-main) and the benchmark cell
  (bench-pr) exit nonzero when the neutral JSON carries a failed row —
  AFTER the clean rows upload / the table posts, so the red X lands on
  the commit/PR without losing the report.
---
 .github/scripts/bench.py         | 27 ++++++++++++++++++---
 .github/scripts/run.sh           | 21 ++++++++++++----
 .github/workflows/bench-main.yml | 27 +++++++++++++++++++++
 .github/workflows/bench-pr.yml   | 10 ++++++++
 Benchmarks/Typecheck.lean        | 13 +++++++++-
 docs/benchmarking.md             |  9 +++++++
 zisk/host/src/main.rs            | 41 ++++++++++++++++++++++----------
 7 files changed, 128 insertions(+), 20 deletions(-)

diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
index cd15397e..27e595ba 100644
--- a/.github/scripts/bench.py
+++ b/.github/scripts/bench.py
@@ -406,18 +406,32 @@ def emit(text):
     def _oom(d, n):
         return isinstance(d.get(n), dict) and d[n].get("oom") is True
 
+    def _failed(d, n):
+        return isinstance(d.get(n), dict) and d[n].get("failed") is True
+
     regressed, improved = set(), set()
+    failures = []  # (name, side) — typecheck failures, surfaced loudly below
     worst = None  # (badness, dp, name, metric)
     for n in names:
         cells = [f"`{n}`"]
         main_oom, pr_oom = _oom(main_d, n), _oom(pr_d, n)
+        main_failed, pr_failed = _failed(main_d, n), _failed(pr_d, n)
+        if main_failed:
+            failures.append((n, "main"))
+        if pr_failed:
+            failures.append((n, "PR"))
         for m in metrics:
             mv, pv = _num(main_d, n, m), _num(pr_d, n, m)
             # An OOM entry may still carry real Phase-1 measurements (run.sh
             # merges the sentinel into whatever was recorded before the kill);
             # render those, and OOM only for the metrics the kill prevented.
-            mv_h = "OOM" if (main_oom and mv is None) else _human(mv, m)
-            pv_h = "OOM" if (pr_oom and pv is None) else _human(pv, m)
+            # A typecheck FAILURE outranks everything — the constant is
+            # rejected, not benchmarked. Spell it out in the cell: a bare ❌
+            # would read as any generic failure.
+            mv_h = ("❌ failed typecheck" if main_failed
+                    else "OOM" if (main_oom and mv is None) else _human(mv, m))
+            pv_h = ("❌ failed typecheck" if pr_failed
+                    else "OOM" if (pr_oom and pv is None) else _human(pv, m))
             dp = _delta(mv, pv)
             bad = _badness(dp, m)
             cell = "n/a" if dp is None else f"{dp:+.1f}%"
@@ -438,6 +452,13 @@ def _oom(d, n):
         rows.append("| " + " | ".join(cells) + " |")
 
     out = ([title, ""] if title else []) + rows + [""]
+    # Typecheck failures first and loud — a constant the kernel REJECTS is a
+    # correctness signal, not a benchmark blip.
+    for n, side in failures:
+        out.append(f"❌ **`{n}` FAILED TO TYPECHECK on the {side} side** — "
+                   "the kernel rejected it; see the workflow logs.")
+    if failures:
+        out.append("")
     s = (f"_{len(names)} constants · {len(regressed)} regressed · "
          f"{len(improved)} improved (|Δ| > {a.threshold:g}% on any metric)._")
     if worst and worst[0] is not None and worst[0] > a.threshold:
@@ -483,7 +504,7 @@ def cmd_bmf(a):
             continue
         measures = {}
         for k, v in entry.items():
-            if k == "oom":
+            if k in ("oom", "failed"):
                 continue
             # Nested objects are per-sub-measure breakdowns: `phases` (span →
             # seconds) flattens to `phase:<span>`; anything else (e.g. the
diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
index 533c3fb1..9acba71f 100644
--- a/.github/scripts/run.sh
+++ b/.github/scripts/run.sh
@@ -218,7 +218,7 @@ case "$backend" in
           mark_oom "$res" "$c"
         elif [ "$bt_exit" -ne 0 ]; then
           echo "::warning::aiur prove '$c' failed (exit $bt_exit); dropping" >&2
-          sed -n '1,5p' "$tmp/$slug.log" >&2 || true
+          { sed -n '1,5p' "$tmp/$slug.log"; echo "  …"; tail -n 3 "$tmp/$slug.log"; } >&2 || true
           continue
         fi
       fi
@@ -295,9 +295,17 @@ case "$backend" in
       if looks_like_oom "$zk_exit" "$oom" "$log"; then
         echo "::warning::$backend execute '$key' OOM (exit $zk_exit, marker=$([ -f "$oom" ] && echo watchdog || echo runtime), ceiling ${ceiling_gb} GB)" >&2
         mark_oom "$res" "$key"
+      elif [ "$zk_exit" -ne 0 ] && grep -q 'kernel typecheck produced' "$log" 2>/dev/null; then
+        # The kernel REJECTED the constant (the host fails fast and aborts
+        # any remaining shards). Record the `failed` sentinel — compare
+        # renders a ❌ row + loud note, and the workflow fails at the end.
+        echo "::error::$backend: '$key' FAILED TO TYPECHECK — kernel rejected it" >&2
+        tail -n 3 "$log" >&2 || true
+        jq -n --arg n "$key" '{($n): {failed: true}}' > "$res"
       elif [ "$zk_exit" -ne 0 ]; then
         echo "::warning::$backend execute '$key' failed/timed out (exit $zk_exit); dropping" >&2
-        sed -n '1,5p' "$log" >&2 || true
+        # Head for early failures (name resolution), tail for late ones.
+        { sed -n '1,5p' "$log"; echo "  …"; tail -n 3 "$log"; } >&2 || true
         return 0
       fi
       merge_phases "$res" "$spans"
@@ -361,8 +369,13 @@ case "$backend" in
       line=$(grep '^##check##' "$log" | tail -1)
       ems=$(echo "$line" | awk '{print $2}'); fl=$(echo "$line" | awk '{print $4}')
       tot=$(echo "$line" | awk '{print $5}'); rss=$(echo "$line" | awk '{print $6}')
-      { [ -n "${tot:-}" ] && [ "${fl:-1}" = 0 ]; } \
-        || { echo "::warning::ooc '$label': bad ##check## / failures; dropping" >&2; return; }
+      if [ -n "${fl:-}" ] && [ "$fl" != 0 ]; then
+        echo "::error::ooc: '$label' FAILED TO TYPECHECK — kernel rejected $fl item(s)" >&2
+        jq -n --arg n "$label" '{($n): {failed: true}}'
+        return
+      fi
+      [ -n "${tot:-}" ] \
+        || { echo "::warning::ooc '$label': bad ##check## line; dropping" >&2; return; }
       local cs tp
       cs=$(awk -v e="$ems" 'BEGIN{printf "%.3f", e/1000}')
       tp=$(awk -v t="$tot" -v e="$ems" 'BEGIN{ if (e>0) printf "%.2f", t*1000/e; else print 0 }')
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index fbfbbf28..2b480b96 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -293,6 +293,17 @@ jobs:
             --threshold-measure throughput --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
             --threshold-lower-boundary 0.10
+      # A typecheck failure is a correctness regression, not a benchmark blip.
+      # The failing constants never reach bencher (bmf strips the sentinel and
+      # drops the empty entry) and the clean rows are uploaded above — now
+      # fail the job LOUDLY so the red X lands on the commit.
+      - name: Fail on typecheck failures
+        run: |
+          failed=$(jq -r 'to_entries[] | select(.value.failed == true) | .key' neutral.json)
+          if [ -n "$failed" ]; then
+            while IFS= read -r c; do echo "::error::$c FAILED TO TYPECHECK"; done <<< "$failed"  # one key per line: no word-splitting/globbing
+            exit 1
+          fi
 
   # Execute the same constants through the Zisk and SP1 zkVM hosts and track
   # cycles / execute-time / throughput / execute-peak-rss (and shards / max-shard-cycles
@@ -389,6 +400,14 @@ jobs:
             --threshold-measure throughput --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
             --threshold-lower-boundary 0.10
+      # See the prove job's twin step: reject → red X, after the clean upload.
+      - name: Fail on typecheck failures
+        run: |
+          failed=$(jq -r 'to_entries[] | select(.value.failed == true) | .key' neutral.json)
+          if [ -n "$failed" ]; then
+            while IFS= read -r c; do echo "::error::$c FAILED TO TYPECHECK"; done <<< "$failed"  # one key per line: no word-splitting/globbing
+            exit 1
+          fi
 
   # Out-of-circuit Rust kernel typecheck — the same kernel as the zkVM guest, but
   # run out-of-circuit and in parallel (`--workers` defaults to the core count),
@@ -468,3 +487,11 @@ jobs:
             --threshold-measure peak-rss --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
             --threshold-lower-boundary _
+      # See the prove job's twin step: reject → red X, after the clean upload.
+      - name: Fail on typecheck failures
+        run: |
+          failed=$(jq -r 'to_entries[] | select(.value.failed == true) | .key' neutral.json)
+          if [ -n "$failed" ]; then
+            while IFS= read -r c; do echo "::error::$c FAILED TO TYPECHECK"; done <<< "$failed"  # one key per line: no word-splitting/globbing
+            exit 1
+          fi
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index 9e5a65ac..e44b4861 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -404,6 +404,16 @@ jobs:
           name: table-${{ env.LABEL }}
           path: out/table-${{ env.LABEL }}.md
           if-no-files-found: warn
+      # A PR-side typecheck failure is a correctness regression: fail the
+      # cell LOUDLY (red X on the PR) — after the table upload, so the
+      # comment still posts with the constant's ❌ row and note.
+      - name: Fail on PR-side typecheck failures
+        run: |
+          failed=$(jq -r 'to_entries[] | select(.value.failed == true) | .key' "$GITHUB_WORKSPACE/pr.json")
+          if [ -n "$failed" ]; then
+            printf '%s\n' "$failed" | while IFS= read -r c; do echo "::error::$c FAILED TO TYPECHECK on the PR side"; done
+            exit 1
+          fi
 
   comment:
     needs: [setup, benchmark]
diff --git a/Benchmarks/Typecheck.lean b/Benchmarks/Typecheck.lean
index d9e16fa9..04992eac 100644
--- a/Benchmarks/Typecheck.lean
+++ b/Benchmarks/Typecheck.lean
@@ -102,6 +102,11 @@ structure Result where
   constants : Nat
   fftCost : Float
   executeSec : Float
+  /-- The kernel REJECTED the constant (Phase-1 check error). The JSON entry
+      is the bare `{"failed": true}` sentinel — a rejected constant is a
+      correctness signal, not a benchmark datum — and Phase 2 skips it.
+      bench.py compare renders a ❌ row plus a loud note. -/
+  failed : Bool := false
   proveSec : Option Float := none
   /-- Serialized proof size in bytes (`Aiur.Proof.toBytes`). Tracked because
       prover changes can trade speed against proof size. -/
@@ -134,6 +139,7 @@ def jsonRound (d : Nat) (f : Float) : Json :=
 /-- Neutral, flat results object: `name → { constants, fft-cost, execute-time,
     prove-time?, throughput? }`. No bencher-specific shaping. -/
 def Result.toJsonEntry (r : Result) : String × Json :=
+  if r.failed then (r.name, Json.mkObj [("failed", Json.bool true)]) else
   let base : List (String × Json) :=
     [ ("constants", Lean.toJson r.constants)
     , ("fft-cost", jsonRound 0 r.fftCost)
@@ -246,7 +252,11 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
         else
           compiled.bytecode.checkAddrWithEnv funIdx envHandle addr.hash
       match res with
-      | .error e => IO.eprintln s!"  execute {label} failed: {e}"
+      | .error e =>
+        IO.eprintln s!"  ❌ {label} FAILED TO TYPECHECK: {e}"
+        execed := execed.push
+          ({ name := label, constants := 0, fftCost := 0, executeSec := 0,
+             failed := true }, addr)
       | .ok (_, _, queryCounts) =>
         let stats := Aiur.computeStats compiled queryCounts
         let constants := (IxVM.ClaimHarness.closureFrom ixonEnv addr).size
@@ -289,6 +299,7 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
   let mut spent : Float := 0.0
   for i in [:ordered.size] do
     let (r, addr) := ordered[i]!
+    if r.failed then continue
     try
       let (proveRes, proveSec) ← timed fun _ =>
         if skipDeps then
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index 4caf314e..868e824e 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -30,6 +30,15 @@ every ~3 s and `SIGKILL`s the tree if it exceeds `AIUR_PROVE_MAX_RSS_GB`
 constant records the neutral `{"oom": true}` sentinel and `bench.py compare`
 renders `OOM` cells (with `n/a` Δ%) in the table for that row.
 
+A **typecheck failure** is a correctness regression, not a benchmark blip,
+and surfaces loudly everywhere: the tool fails fast (a zisk shard failure
+aborts the constant's remaining shards, like an OOM kill), the constant
+records the `{"failed": true}` sentinel instead of numbers, `compare`
+renders ❌ cells plus a bold "FAILED TO TYPECHECK" note under the table,
+the failing rows never reach bencher, and the workflow job exits nonzero —
+after the clean rows have been uploaded (bench-main) or the table posted
+(bench-pr) — so the red X lands on the commit/PR.
+
 The `ooc` backend reports two views: the **whole env** (`ix check-rs --anon`,
 keyed by env) and a **per-primary full-closure check** (`ix check-rs --anon
 --consts <name>`, keyed by constant). The per-primary view runs the constant's
diff --git a/zisk/host/src/main.rs b/zisk/host/src/main.rs
index f3b1adaa..9243fc73 100644
--- a/zisk/host/src/main.rs
+++ b/zisk/host/src/main.rs
@@ -232,6 +232,23 @@ fn peak_rss_bytes() -> Option<u64> {
   }
 }
 
+/// Fail FAST on a guest typecheck failure: returns `Ok(())` when
+/// `publics.failures` is zero, otherwise bails with an error naming `ctx`
+/// (the shard/leaf label) so the caller's `?` skips the remaining shards
+/// (and proofs) — mirroring the OOM kill, which also cancels the rest.
+/// Callers write no `--json` row for a rejected workload; the CI harness
+/// keys off this message ("kernel typecheck produced") to record `failed`.
+fn reject_failures(publics: &ShardPublics, ctx: &str) -> Result<()> {
+  if publics.failures > 0 {
+    bail!(
+      "kernel typecheck produced {} failure(s) in {ctx}; \
+       aborting remaining shards",
+      publics.failures
+    );
+  }
+  Ok(())
+}
+
 /// Append the per-constant entry `{ "<name>": <metrics> }` to the neutral
 /// results JSON at `path`. If the file exists, its object is loaded and the new
 /// key is merged in (overwriting on collision), so a multi-const run
@@ -1333,7 +1350,6 @@ async fn run_shard_plan(
   if args.execute {
     let t0 = Instant::now();
     let mut total_steps = 0u64;
-    let mut total_failures = 0u32;
     let mut max_shard_cycles = 0u64;
     let mut max_shard_peak: Option<u64> = None;
     let mut shard_cycles = serde_json::Map::new();
@@ -1363,7 +1379,6 @@ async fn run_shard_plan(
       if let Some(p) = peak {
         shard_peak_rss.insert(key, serde_json::json!(p));
       }
-      total_failures = total_failures.saturating_add(publics.failures);
       println!(
         "  [shard {idx}] {} work items, failures={}, cycles={cycles}, \
          {exec_secs:.1}s, peak {}",
@@ -1374,13 +1389,11 @@ async fn run_shard_plan(
           p as f64 / (1 << 30) as f64
         )),
       );
+      reject_failures(&publics, &format!("shard {idx}"))?;
     }
     let execute_secs = t0.elapsed().as_secs_f64();
     tracing_texray::json_sink::record_manual("zisk/execute", execute_secs);
-    println!("total cycles: {total_steps}, failures: {total_failures}");
-    if total_failures > 0 {
-      bail!("kernel typecheck produced {total_failures} failure(s)");
-    }
+    println!("total cycles: {total_steps}, failures: 0");
     if let Some(path) = &args.json {
       let name = args.json_name.clone().unwrap_or_else(|| {
         manifest_path
@@ -1455,6 +1468,7 @@ async fn run_shard_plan(
       leaf_ms as f64 / 1000.0,
       result.get_execution_steps(),
     );
+    reject_failures(&publics, &format!("shard {idx}"))?;
     // Bind each leaf: its committed subject must equal the env-derived merkle
     // root over the constants it certified. A guest that proved a different set
     // than the manifest assigned would commit a different root and fail here.
@@ -1915,7 +1929,6 @@ async fn main() -> Result<()> {
   if args.execute {
     let mut total_steps: u64 = 0;
     let mut total_exec_ms: u64 = 0;
-    let mut total_failures: u32 = 0;
     for plan in &plans {
       let num_shards = plan.shards.len();
       for (i, &(start, end)) in plan.shards.iter().enumerate() {
@@ -1927,19 +1940,22 @@ async fn main() -> Result<()> {
         let cycles = result.get_execution_steps();
         total_steps += cycles;
         total_exec_ms += result.get_execution_time();
-        total_failures = total_failures.saturating_add(publics.failures);
         println!(
           "  [{} shard {}/{num_shards}] range [{start}, {end}), failures={}, cycles={cycles}",
           plan.label,
           i + 1,
           publics.failures,
         );
+        reject_failures(
+          &publics,
+          &format!("{} shard {}/{num_shards}", plan.label, i + 1),
+        )?;
       }
     }
     let total_exec = Duration::from_millis(total_exec_ms);
     let throughput =
       grand_target_count as f64 / total_exec.as_secs_f64().max(f64::EPSILON);
-    println!("failures: {total_failures}");
+    println!("failures: 0");
     println!("cycles: {total_steps}");
     println!("inputs: {}", plans.len());
     println!("work items: {grand_total_items}");
@@ -1949,9 +1965,6 @@ async fn main() -> Result<()> {
       total_exec.as_secs_f64(),
       throughput.human_throughput("consts"),
     );
-    if total_failures > 0 {
-      bail!("kernel typecheck produced {total_failures} failure(s)");
-    }
     return Ok(());
   }
 
@@ -2004,6 +2017,10 @@ async fn main() -> Result<()> {
         (leaf_ms as f64) / 1000.0,
         result.get_execution_steps(),
       );
+      reject_failures(
+        &publics,
+        &format!("{} leaf {}/{num_shards}", plan.label, i + 1),
+      )?;
       leaf_proof_bytes.push(result.get_proof_bytes()?);
       input_publics.push(publics);
       last_leaf_result = Some(result);

From f1ff88a781162cee986d5cdc252a6bdb0a0445fb Mon Sep 17 00:00:00 2001
From: samuelburnham <45365069+samuelburnham@users.noreply.github.com>
Date: Fri, 3 Jul 2026 16:38:11 -0400
Subject: [PATCH 27/27] test(ci): trigger bench-main on branch pushes
 (TEMPORARY)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop this commit before merging. The zkVM build gate needs no branch
trigger — it lives in ci.yml now and runs on pull_request.
---
 .github/workflows/bench-main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index 2b480b96..71e12911 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -17,7 +17,7 @@ name: Benchmark main
 
 on:
   push:
-    branches: main
+    branches: [main, sb/ci-benchmarks]   # TEMPORARY: test on this branch
   workflow_dispatch:
 
 permissions: