diff --git a/.github/actions/bencher-track/action.yml b/.github/actions/bencher-track/action.yml
index 498f19af..e77b2b9e 100644
--- a/.github/actions/bencher-track/action.yml
+++ b/.github/actions/bencher-track/action.yml
@@ -12,7 +12,7 @@ inputs:
     description: Bencher testbed slug.
     required: true
   workload:
-    description: Workload key for the `bencher-thresholds-reset-<workload>` tag (e.g. ix-compile, aiur).
+    description: Workload key for the `bencher-thresholds-reset-<workload>` tag (ix-compile, aiur-check, zisk-check, sp1-check, ooc-check).
     required: true
   file:
     description: Bencher Metric Format JSON file to upload.
diff --git a/.github/actions/install-sp1/action.yml b/.github/actions/install-sp1/action.yml
new file mode 100644
index 00000000..91ad4d6b
--- /dev/null
+++ b/.github/actions/install-sp1/action.yml
@@ -0,0 +1,31 @@
+name: Install SP1
+description: >-
+  Install the system build deps and the SP1 zkVM toolchain (sp1up) needed to
+  build and run the SP1 host. Assumes a Rust toolchain is already set up.
+
+runs:
+  using: composite
+  steps:
+    # The shared zkVM apt superset (the ZisK book's full Ubuntu list — the
+    # prebuilt cargo tooling and proofman's C++ link OpenMPI/OpenMP/GMP/
+    # nlohmann-json/nasm/secp256k1/… — plus pkg-config + libssl-dev for SP1's
+    # host crates). The Nix shells provided this; a bare runner doesn't.
+    - name: Install system build deps
+      shell: bash
+      run: |
+        # Some warpbuild images ship an unreachable azure mirror that hangs
+        # `apt-get update`; drop it first (no-op elsewhere).
+        sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true
+        sudo apt-get update
+        sudo apt-get install -y \
+          xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \
+          nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \
+          libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \
+          openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \
+          pkg-config libssl-dev
+    - name: Install SP1 toolchain (sp1up, latest)
+      shell: bash
+      run: |
+        curl -fSL --retry 3 https://sp1up.succinct.xyz | bash
+        ~/.sp1/bin/sp1up
+        echo "$HOME/.sp1/bin" >> "$GITHUB_PATH"
diff --git a/.github/actions/install-zisk/action.yml b/.github/actions/install-zisk/action.yml
new file mode 100644
index 00000000..604fcec8
--- /dev/null
+++ b/.github/actions/install-zisk/action.yml
@@ -0,0 +1,95 @@
+name: Install Zisk
+description: >-
+  Install the system build deps, the ZisK zkVM toolchain (ziskup, CPU build),
+  and — unless `proving-key: false` — the fork-matching proving key needed to
+  RUN the Zisk host. Execute needs the key too (zisk-host's `client.setup()`
+  loads the circuit's const-tree files before either the execute or the prove
+  branch), but BUILDING the host does not, so build-only callers skip the
+  ~3 GB download + const-tree regeneration. Assumes a Rust toolchain is
+  already set up.
+
+inputs:
+  proving-key:
+    description: >-
+      Install the fork-matching proving key (required to execute or prove;
+      not needed to build). Set false for build-only jobs.
+    required: false
+    default: "true"
+
+runs:
+  using: composite
+  steps:
+    # The shared zkVM apt superset — the ZisK book's full Ubuntu list (prebuilt
+    # cargo-zisk + proofman's C++ link OpenMPI/OpenMP/GMP/nlohmann-json/nasm/
+    # secp256k1/…), kept identical to install-sp1 so a host that links both is
+    # covered. The Nix shells provided this; a bare runner doesn't.
+    - name: Install system build deps
+      shell: bash
+      run: |
+        # Some warpbuild images ship an unreachable azure mirror that hangs
+        # `apt-get update`; drop it first (no-op elsewhere).
+        sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true
+        sudo apt-get update
+        sudo apt-get install -y \
+          xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \
+          nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \
+          libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \
+          openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \
+          pkg-config libssl-dev
+    # `--version 0.18.0` pins the toolchain to match our deps. Our host links the
+    # argumentcomputer/zisk `blake3-precompile` fork, which is based on v0.18.0
+    # (its cargo-zisk has `check-setup`, used below to regenerate the key's
+    # const-trees). Without the pin, ziskup installs `releases/latest`, which
+    # resolves to upstream `v1.0.0-alpha` — a different circuit whose cargo-zisk
+    # dropped the `check-setup` subcommand, breaking the key step. `--cpu` picks
+    # the CPU build (no GPU on the runner) and `--nokey` skips ziskup's key
+    # install — both avoid its interactive /dev/tty prompts. We keep `--nokey`
+    # because the upstream `zisk-setup` bucket only carries the upstream circuit's
+    # key; our fork has a different circuit (extra Blake3f AIR), so we restore the
+    # fork-matching key from our own S3 in the next step. `--prefix $HOME/.zisk`
+    # pins the install where cargo-zisk's ZiskPaths fallback looks (the runner
+    # sets XDG_CONFIG_HOME, which would otherwise relocate it).
+    - name: Install Zisk toolchain (ziskup, pinned v0.18.0)
+      shell: bash
+      run: |
+        curl -fSL --retry 3 https://raw.githubusercontent.com/0xPolygonHermez/zisk/main/ziskup/install.sh \
+          | bash -s -- --cpu --nokey -y --version 0.18.0 --prefix "$HOME/.zisk"
+        echo "$HOME/.zisk/bin" >> "$GITHUB_PATH"
+    # Pre-build the proofman C++ sys crate ALONE so its build script runs
+    # exactly once before any parallel zisk-host build. zisk-host pulls
+    # zisk-sdk as both a dependency and a build-dependency, so cargo compiles
+    # proofman-starks-lib-c as two units whose build scripts can run
+    # CONCURRENTLY — and both run `make` inside the SHARED
+    # ~/.cargo/git/checkouts/pil2-proofman-* source dir. On a cold runner the
+    # Makefile stamp is absent, so both units take the `make clean` +
+    # `make -j` path and race: one unit's clean deletes build/ while the
+    # other's g++ is mid-compile ("opening dependency file ….d: No such file
+    # or directory"). Building the crate solo writes the stamp; the second
+    # unit then skips the clean and its `make -j` is a no-op.
+    # (Proper fix is an flock in pil2-proofman's build.rs — upstream.)
+    - name: Pre-build proofman-starks-lib-c (serialize the shared make)
+      shell: bash
+      run: cargo build --release -p proofman-starks-lib-c
+      working-directory: zisk
+    # Execute still needs a proving key present: zisk-host calls `client.setup()`
+    # (which the SDK runs before the execute branch), and that loads the circuit's
+    # const-tree files. We host the fork-matching key in a public S3 bucket
+    # WITHOUT the const-trees — exactly like Zisk's released
+    # `zisk-provingkey-*.tar.gz` on `storage.googleapis.com/zisk-setup` — and
+    # regenerate them here with `cargo-zisk check-setup -a`, which is how `ziskup`
+    # itself populates them. That keeps the artifact ~3 GB (gzip) instead of
+    # ~48 GB. The object name carries the fork rev so a circuit change can't
+    # silently reuse a stale key. Public bucket → plain curl, no AWS creds.
+    - name: Restore Zisk proving key (fork circuit) from S3
+      if: inputs.proving-key == 'true'
+      shell: bash
+      run: |
+        mkdir -p "$HOME/.zisk"
+        curl -fSL --retry 3 \
+          https://argument-zisk-setup.s3.amazonaws.com/zisk-provingkey-blake3-8f9e24d5-cpu.tar.gz \
+          -o /tmp/zisk-provingkey.tar.gz
+        tar -C "$HOME/.zisk" -xzf /tmp/zisk-provingkey.tar.gz
+        rm -f /tmp/zisk-provingkey.tar.gz
+        # Regenerate the const-tree files omitted from the artifact (CPU build, so
+        # no --gpu). This is the "may take a while" step ziskup prints.
+        cargo-zisk check-setup --proving-key "$HOME/.zisk/provingKey" -a
diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py
new file mode 100644
index 00000000..27e595ba
--- /dev/null
+++ b/.github/scripts/bench.py
@@ -0,0 +1,704 @@
+#!/usr/bin/env python3
+"""All data-wrangling for the `!benchmark` PR workflow, as subcommands:
+
+  parse       COMMENT_BODY env → matrix + config (writes $GITHUB_OUTPUT)
+  manifest    Benchmarks/Vectors.csv → the constant names for one cell
+  bmf         neutral results JSON → Bencher Metric Format (bench-main.yml)
+  fetch-main  base SHA + cell → main.json pulled from bencher.dev
+  compare     main.json + pr.json → a Markdown main-vs-PR table
+  comment     per-cell table files → the final PR comment body
+
+The neutral results JSON every backend normalises to (see run.sh) is
+`{ "<name>": { "<metric>": <number>, ... }, ... }`. Most metrics are
+lower-is-better (a positive Δ% is a regression); the exceptions live in
+HIGHER_IS_BETTER (throughput), where the polarity is flipped.
+"""
+import argparse
+import glob
+import json
+import os
+import time
+import urllib.parse
+import urllib.request
+
+
+# ─────────────────── backend identity table ────────────────────
+# Single source of truth for what each backend is:
+#   default_mode — what `!benchmark <backend>` runs. The bare `execute` token
+#     switches to the "execute" metrics entry when one exists (only aiur has
+#     a real choice: `prove` is the full pipeline; `execute` skips Phase 2 via
+#     `--execute-only`).
+#   testbed — the bencher testbed bench-main.yml uploads main's numbers to.
+#     MUST match that workflow's `testbed:` strings; fetch-main fails a cell
+#     loudly (exit 2) when a (backend, mode) has no entry here, so drift shows
+#     up as a red cell instead of a silent local-rebuild fallback.
+#   metrics — compare-table columns per supported mode. aiur's execute entry
+#     reads the SAME testbed as prove (bencher stores only prove runs; the
+#     execute-side columns — incl. execute-peak-rss, sampled at the Phase 1/2
+#     boundary — are extracted from that JSON, apples-to-apples).
+# `compile` benchmarks `ix compile <env>.lean → <env>.ixe`; its benchmark name
+# on bencher is the CamelCase env slug (ENV_CC below).
+BACKEND_TABLE = {
+    "aiur": {
+        "default_mode": "prove",
+        "testbed": "aiur-check-x64-32x",
+        "metrics": {
+            "prove":   ["fft-cost", "execute-time", "prove-time", "verify-time",
+                        "proof-size", "peak-rss"],
+            "execute": ["fft-cost", "execute-time", "execute-peak-rss"],
+        },
+    },
+    "zisk": {
+        "default_mode": "execute",
+        "testbed": "zisk-check-x64-32x",
+        "metrics": {
+            "execute": ["cycles", "execute-time", "throughput", "execute-peak-rss"],
+        },
+    },
+    "sp1": {
+        "default_mode": "execute",
+        "testbed": "sp1-check-x64-32x",
+        "metrics": {
+            "execute": ["cycles", "execute-time", "throughput", "execute-peak-rss"],
+        },
+    },
+    "ooc": {
+        "default_mode": "execute",
+        "testbed": "ooc-check-x64-32x",
+        "metrics": {
+            "execute": ["throughput", "check-time", "peak-rss"],
+        },
+    },
+    "compile": {
+        "default_mode": "compile",
+        "testbed": "ix-compile-x64-32x",
+        "metrics": {
+            "compile": ["compile-time", "throughput", "file-size", "constants"],
+        },
+    },
+}
+BACKENDS = tuple(BACKEND_TABLE)
+ENVS = ("initStd", "lean", "mathlib")
+# CamelCase benchmark key per env — must match bench-main.yml's matrix.bench
+# values (the names bencher stores env-keyed rows under: ooc whole-env,
+# compile). One explicit table, not a first-letter-upper derivation, so an
+# env whose CamelCase isn't mechanical (e.g. a future `flt` → `FLT`) can't
+# silently diverge from the workflow.
+ENV_CC = {"initStd": "InitStd", "lean": "Lean", "mathlib": "Mathlib"}
+CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_FULL"}
+PASSTHROUGH_KEYS = {"RUST_LOG", "WITHOUT_VK_VERIFICATION", "RUSTFLAGS"}
+
+
+RUNNER = "warp-ubuntu-latest-x64-32x"
+
+
+def cmd_parse(_a):
+    """Parse COMMENT_BODY into matrix cells + config; append them to $GITHUB_OUTPUT."""
+    body = os.environ.get("COMMENT_BODY", "")
+    lines = [ln.replace("\r", "") for ln in body.splitlines()]
+    at = next((i for i, ln in enumerate(lines) if "!benchmark" in ln), None)
+    toks = lines[at].split("!benchmark", 1)[1].split() if at is not None else []
+
+    backends, execute_flag = [], False
+    for t in (t.lower() for t in toks):
+        if t == "all":
+            backends = list(BACKENDS)
+        elif t in BACKENDS and t not in backends:
+            backends.append(t)
+        elif t == "execute":
+            execute_flag = True
+    backends = backends or ["aiur"]
+
+    cfg, passthrough = {}, []
+    for ln in lines[0 if at is None else at + 1:]:  # config follows the command line
+        s = ln.strip()
+        if not s or "=" not in s:
+            continue
+        key, val = (x.strip() for x in s.split("=", 1))
+        if key in CONFIG_KEYS:
+            cfg[key] = val
+        elif key in PASSTHROUGH_KEYS:
+            passthrough.append(f"{key}={val}")
+
+    envs = [e.strip() for e in cfg.get("BENCH_ENVS", "initStd").split(",") if e.strip()]
+    envs = [e for e in envs if e in ENVS] or ["initStd"]
+    tier = cfg.get("BENCH_TIER", "")
+    if tier not in ("cheap", "heavy", "all"):
+        tier = ""             # empty ⇒ derived from mode at manifest time
+    shard = "1" if cfg.get("BENCH_SHARD") == "1" else "0"
+    full = "1" if cfg.get("BENCH_FULL") == "1" else "0"  # full set vs primary subset
+
+    def mode_for(b):
+        # The bare `execute` token selects a backend's execute entry when it
+        # has one — a real switch only for aiur (everything else already
+        # defaults to execute, or has no execute mode at all: compile).
+        if execute_flag and "execute" in BACKEND_TABLE[b]["metrics"]:
+            return "execute"
+        return BACKEND_TABLE[b]["default_mode"]
+
+    cells = []
+    for b in backends:
+        m = mode_for(b)
+        for e in envs:
+            cells.append({"backend": b, "env": e, "mode": m,
+                          "runner": RUNNER,
+                          "label": f"{b}-{e}-{m}"})
+
+    modes = " ".join(f"{b}={mode_for(b)}" for b in backends)
+    summary = (f"backends: `{modes}` · envs: `{','.join(envs)}` · "
+               f"set: `{'full' if full == '1' else 'primary'}` · "
+               f"tier: `{tier or 'auto'}` · shard: `{shard}`")
+    if passthrough:
+        summary += " · env: `" + " ".join(passthrough) + "`"
+
+    with open(os.environ.get("GITHUB_OUTPUT", "/dev/stdout"), "a") as f:
+        f.write(f"matrix={json.dumps(cells)}\n")
+        f.write(f"tier={tier}\nshard={shard}\nfull={full}\n")
+        f.write(f"config-summary={summary}\n")
+        f.write("passthrough-env<<PTENV\n" + "\n".join(passthrough)
+                + ("\n" if passthrough else "") + "PTENV\n")
+    print(summary)
+    print(json.dumps(cells, indent=2))
+
+
+# ──────────────────────── manifest ────────────────────────
+def cmd_manifest(a):
+    # `compile` doesn't consume Vectors.csv — the "benchmark name" on bencher
+    # is the CamelCase env slug (`initStd` → `InitStd`), one per cell.
+    if a.backend == "compile":
+        with open(a.out, "w") as f:
+            f.write(ENV_CC[a.env] + "\n")
+        print(f"count=1\ntier=n/a")
+        return
+    # prove defaults to the cheap tier to keep the full set bounded; the curated
+    # primary subset is exempt — run.sh's aiur prove path attempts prove for
+    # every primary (RAM watchdog catches OOMs), so all primaries are selected
+    # here regardless of tier.
+    tier = a.tier or ("cheap" if (a.mode == "prove" and not a.primary) else "all")
+    names, heavy = [], []
+    with open(a.csv) as f:
+        for line in f:
+            row = line.rstrip("\n")
+            if not row or row.startswith("#"):
+                continue
+            cols = row.split(",")
+            if cols[0] == "name" or len(cols) < 3:
+                continue
+            # `shard_target` and `primary` default to "0" when the column is
+            # omitted, so rows can drop trailing zero fields (most only carry
+            # the first three).
+            name, env, ctier = cols[:3]
+            shard = cols[3] if len(cols) >= 4 else "0"
+            rep = cols[4] if len(cols) >= 5 else "0"
+            if env != a.env:
+                continue
+            if a.primary and rep != "1":
+                continue
+            if tier in ("cheap", "heavy") and ctier != tier:
+                continue
+            if a.shard == "1" and shard != "1":
+                continue
+            names.append(name)
+            if ctier == "heavy":
+                heavy.append(name)
+    with open(a.out, "w") as f:
+        f.write("\n".join(names) + ("\n" if names else ""))
+    # The selected names that are heavy-tier — the subset the zisk cells run
+    # through the closure-sharded pipeline (ix extract → profile → shard)
+    # instead of a single full-closure leaf.
+    if a.heavy_out:
+        with open(a.heavy_out, "w") as f:
+            f.write("\n".join(heavy) + ("\n" if heavy else ""))
+    print(f"count={len(names)}\ntier={tier}")
+
+
+# ───────────────────────── compare ─────────────────────────
+def _num(d, name, metric):
+    v = d[name].get(metric) if isinstance(d.get(name), dict) else None  # non-dict entry ⇒ missing
+    return v if isinstance(v, (int, float)) else None
+
+
+# Per-metric formatting kind. Metric names are the neutral-JSON keys the tools
+# emit (see BACKEND_TABLE). Unknown metrics fall through to `_human_auto`.
+_METRIC_KIND = {
+    # bytes
+    "peak-rss": "bytes",
+    "execute-peak-rss": "bytes",
+    "file-size": "bytes",
+    "proof-size": "bytes",
+    # seconds
+    "execute-time": "seconds",
+    "prove-time": "seconds",
+    "verify-time": "seconds",
+    "check-time": "seconds",
+    "compile-time": "seconds",
+    # large counts (10^6+ typical)
+    "fft-cost": "count",
+    "cycles": "count",
+    "steps": "count",
+    "max-shard-cycles": "count",
+    "throughput": "count",
+    # small integers
+    "constants": "int",
+    "shards": "int",
+}
+
+
+def _human_bytes(v):
+    v = float(v)
+    for unit in ("B", "KiB", "MiB", "GiB", "TiB"):  # binary (1024-based) units
+        if abs(v) < 1024:
+            return f"{int(v):,} {unit}" if unit == "B" else f"{v:,.2f} {unit}"  # plain bytes: no decimals
+        v /= 1024
+    return f"{v:,.2f} PiB"  # loop exhausted: v has been divided five times, so it is in PiB
+
+
+def _human_seconds(v):
+    v = float(v)
+    if abs(v) < 1e-3:
+        return f"{v * 1e6:.1f} µs"
+    if abs(v) < 1:
+        return f"{v * 1e3:.1f} ms"
+    if abs(v) < 60:
+        return f"{v:.3f} s"
+    m, s = divmod(round(v, 1), 60)  # round BEFORE splitting: 119.96 → "2m 0.0s", not "1m 60.0s"
+    return f"{int(m)}m {s:.1f}s"
+
+
+def _human_count(v):
+    """Format a count: plain below 1000, else scaled by 1000s with a K/M/B/T/Q suffix."""
+    v = float(v)
+    if abs(v) < 1000:
+        return f"{int(v):,}" if v == int(v) else f"{v:.3f}"
+    for unit in ("K", "M", "B", "T", "Q"):
+        v /= 1000
+        if abs(v) < 1000 or unit == "Q":  # Q is the last suffix: anything larger stays in Q
+            return f"{v:.2f}{unit}"
+
+
+def _human_auto(v):
+    if isinstance(v, int) or (isinstance(v, float) and v.is_integer()):
+        return f"{int(v):,}"
+    return f"{v:,.3f}"
+
+
+def _human(v, metric=None):
+    if v is None:
+        return "n/a"
+    kind = _METRIC_KIND.get(metric, "auto")
+    if kind == "bytes":   return _human_bytes(v)
+    if kind == "seconds": return _human_seconds(v)
+    if kind == "count":   return _human_count(v)
+    if kind == "int":     return f"{int(v):,}"
+    return _human_auto(v)
+
+
+# Metrics where a LARGER value is the improvement. Everything else is
+# lower-is-better (times, RAM, cycles, fft-cost, sizes).
+HIGHER_IS_BETTER = {"throughput"}
+
+
+def _delta(main, pr):
+    if main is None or pr is None or main == 0:  # a missing side or a zero baseline ⇒ no Δ%
+        return None
+    return (pr - main) / main * 100.0  # signed percent change of pr relative to main
+
+
+def _badness(dp, metric):
+    """Signed regression magnitude: positive ⇒ the PR got worse on `metric`.
+    For lower-is-better metrics that's a positive Δ%; for higher-is-better
+    (throughput) it's a negative Δ%."""
+    if dp is None:
+        return None
+    return -dp if metric in HIGHER_IS_BETTER else dp
+
+
+# Ratio direction words per metric kind (grew, shrank). Rates and times read
+# as faster/slower; sizes as larger/smaller; counts (cycles, fft-cost, …) as
+# more/fewer — "1.15× slower" is meaningless for a byte or count metric.
+_RATIO_WORDS = {
+    "seconds": ("slower", "faster"),
+    "bytes":   ("larger", "smaller"),
+    "count":   ("more", "fewer"),
+    "int":     ("more", "fewer"),
+}
+
+
+def _ratio(main, pr, metric):
+    """(factor, direction word) with `factor` always ≥ 1.0. Wording follows
+    the metric's kind and polarity: throughput (a rate) and the time metrics
+    read as faster/slower, sizes as larger/smaller, counts as more/fewer.
+    Returns None if either side is missing or non-positive."""
+    if main is None or pr is None or main <= 0 or pr <= 0:
+        return None
+    grew = pr >= main
+    factor = pr / main if grew else main / pr
+    if metric in HIGHER_IS_BETTER:      # rate: more per second = faster
+        return (factor, "faster" if grew else "slower")
+    kind = _METRIC_KIND.get(metric, "auto")
+    words = _RATIO_WORDS.get(kind, ("larger", "smaller"))
+    return (factor, words[0] if grew else words[1])
+
+
+def _load(path):
+    try:  # best-effort: a missing or malformed results file reads as "no results"
+        with open(path) as f:
+            d = json.load(f)
+        return d if isinstance(d, dict) else {}  # valid JSON that is not an object also counts as empty
+    except (FileNotFoundError, json.JSONDecodeError):
+        return {}
+
+
+# TODO: re-add the per-constant Aiur phase (sub-span) drill-down. run.sh's
+# `merge_phases` still folds tracing-texray JSON-Lines into `phases: { span:
+# seconds }` on each entry; the compare renderer previously emitted a
+# collapsible `<details>` block per constant showing main-vs-PR per-span deltas
+# so a regression could be traced to `aiur/execute`, `aiur/witness`,
+# `stark/fri_open`, etc. Removed while the compare surface is being stabilised;
+# reinstate once we've settled on the primary table's flag/threshold semantics.
+
+
+def cmd_compare(a):
+    metrics = a.metric or BACKEND_TABLE.get(a.backend, {}).get("metrics", {}).get(a.mode)
+    if not metrics:
+        raise SystemExit("compare: pass --metric or a known --backend/--mode")
+    title = a.title
+    if title is None and a.backend:
+        src = a.main_source or "unknown"
+        cnt = f"{a.count} constants · " if a.count else ""
+        title = f"### `{a.backend}` · `{a.env}` · `{a.mode}` — {cnt}main from: {src}"
+
+    def emit(text):
+        if not a.out:
+            return print(text)
+        with open(a.out, "w") as f:  # close (and flush) the table file deterministically
+            f.write(text + "\n")
+
+    main_d, pr_d = _load(a.main), _load(a.pr)
+    names = sorted(set(main_d) | set(pr_d))
+    if not names:
+        emit((title or "") + "\n\n_No results were produced (every constant failed, "
+             "timed out, or was dropped). See the workflow logs._")
+        return
+    # One side empty while the other measured is almost always a broken side
+    # (e.g. the base-run fallback hit a CLI-incompatible base), not a real
+    # all-regressed/all-new comparison — say so instead of a silent n/a column.
+    side_note = ""
+    if not main_d:
+        side_note = ("\n\n_⚠️ main produced no results — the base-side run "
+                     "failed entirely (often a CLI-incompatible base binary "
+                     "when bencher had no data). Deltas unavailable; see the "
+                     "workflow logs._")
+    elif not pr_d:
+        side_note = ("\n\n_⚠️ the PR side produced no results — every "
+                     "constant failed or was dropped. See the workflow logs._")
+
+    primary = metrics[0]
+    names.sort(key=lambda n: (0, -v) if (v := (_num(pr_d, n, primary)
+               if _num(pr_d, n, primary) is not None else _num(main_d, n, primary))) is not None
+               else (1, 0))
+
+    head = ["constant"]
+    for m in metrics:
+        head += [f"{m} (main)", f"{m} (PR)", "Δ%"]
+    rows = ["| " + " | ".join(head) + " |", "|" + "|".join(["---"] * len(head)) + "|"]
+
+    def _oom(d, n):
+        return isinstance(d.get(n), dict) and d[n].get("oom") is True
+
+    def _failed(d, n):
+        return isinstance(d.get(n), dict) and d[n].get("failed") is True
+
+    regressed, improved = set(), set()
+    failures = []  # (name, side) — typecheck failures, surfaced loudly below
+    worst = None  # (badness, dp, name, metric)
+    for n in names:
+        cells = [f"`{n}`"]
+        main_oom, pr_oom = _oom(main_d, n), _oom(pr_d, n)
+        main_failed, pr_failed = _failed(main_d, n), _failed(pr_d, n)
+        if main_failed:
+            failures.append((n, "main"))
+        if pr_failed:
+            failures.append((n, "PR"))
+        for m in metrics:
+            mv, pv = _num(main_d, n, m), _num(pr_d, n, m)
+            # An OOM entry may still carry real Phase-1 measurements (run.sh
+            # merges the sentinel into whatever was recorded before the kill);
+            # render those, and OOM only for the metrics the kill prevented.
+            # A typecheck FAILURE outranks everything — the constant is
+            # rejected, not benchmarked. Spell it out in the cell: a bare ❌
+            # would read as any generic failure.
+            mv_h = ("❌ failed typecheck" if main_failed
+                    else "OOM" if (main_oom and mv is None) else _human(mv, m))
+            pv_h = ("❌ failed typecheck" if pr_failed
+                    else "OOM" if (pr_oom and pv is None) else _human(pv, m))
+            dp = _delta(mv, pv)
+            bad = _badness(dp, m)
+            cell = "n/a" if dp is None else f"{dp:+.1f}%"
+            if dp is not None:
+                # Ratio only when the change is big enough that "1.18× slower"
+                # carries new signal beyond the percentage — sub-5% deltas would
+                # just add `(1.03× slower)` noise to the cell.
+                r = _ratio(mv, pv, m)
+                if r is not None and r[0] >= 1.05:
+                    cell += f" ({r[0]:.2f}× {r[1]})"
+                if bad > a.threshold:
+                    cell += " ⚠️"; regressed.add(n)
+                elif bad < -a.threshold:
+                    cell += " 🟢"; improved.add(n)
+                if worst is None or bad > worst[0]:
+                    worst = (bad, dp, n, m)
+            cells += [mv_h, pv_h, cell]
+        rows.append("| " + " | ".join(cells) + " |")
+
+    out = ([title, ""] if title else []) + rows + [""]
+    # Typecheck failures first and loud — a constant the kernel REJECTS is a
+    # correctness signal, not a benchmark blip.
+    for n, side in failures:
+        out.append(f"❌ **`{n}` FAILED TO TYPECHECK on the {side} side** — "
+                   "the kernel rejected it; see the workflow logs.")
+    if failures:
+        out.append("")
+    s = (f"_{len(names)} constants · {len(regressed)} regressed · "
+         f"{len(improved)} improved (|Δ| > {a.threshold:g}% on any metric)._")
+    if worst and worst[0] is not None and worst[0] > a.threshold:
+        s += f" Worst: `{worst[2]}` `{worst[3]}` {worst[1]:+.1f}%."
+    out.append(s)
+    if side_note:
+        out.append(side_note.strip())
+    # TODO: emit per-constant phase drill-down (see the TODO above cmd_compare).
+    emit("\n".join(out))
+
+
+# ───────────────────────── comment ─────────────────────────
+def cmd_comment(a):
+    commit = f"[`{a.head[:7]}`]({a.repo_url}/commit/{a.head})"
+    parts = [f"## `!benchmark` — main vs {commit}", "", a.summary, ""]
+    tables = sorted(glob.glob(os.path.join(a.tables, "table-*.md")))
+    for t in tables:
+        with open(t) as f:  # close each table file instead of leaking the handle
+            parts += [f.read().rstrip(), ""]
+    parts += [] if tables else ["_No result tables were produced — see the workflow logs._", ""]
+    parts.append(f"[Workflow logs]({a.repo_url}/actions/runs/{a.run_id})")
+    with open(a.out, "w") as f:
+        f.write("\n".join(parts) + "\n")
+    print("\n".join(parts) + "\n")  # echo the same text the file now holds
+
+
+# ──────────────────────── bmf ─────────────────────────
+def cmd_bmf(a):
+    """Neutral results JSON → Bencher Metric Format.
+
+    One converter for every bench-main.yml upload site (previously four
+    hand-copied jq pipelines): flattens each entry's `phases` object into
+    `phase:<span>` measures, strips the boolean `oom` sentinel (BMF values
+    must be numeric — one boolean would fail the whole `bencher run` upload;
+    the sentinel is for the PR comparison table only), and drops entries left
+    with no measures.
+    """
+    with open(a.infile) as f:
+        neutral = json.load(f)
+    out = {}
+    for name, entry in (neutral or {}).items():
+        if not isinstance(entry, dict):
+            continue
+        measures = {}
+        for k, v in entry.items():
+            if k in ("oom", "failed"):
+                continue
+            # Nested objects are per-sub-measure breakdowns: `phases` (span →
+            # seconds) flattens to `phase:<span>`; anything else (e.g. the
+            # zisk env row's `shard-cycles`) to `<key>:<sub>`. Both stay
+            # un-thresholded on bencher (dynamic names).
+            if isinstance(v, dict):
+                prefix = "phase" if k == "phases" else k
+                for sub, sv in v.items():
+                    if isinstance(sv, (int, float)) and not isinstance(sv, bool):
+                        measures[f"{prefix}:{sub}"] = {"value": sv}
+            elif isinstance(v, (int, float)) and not isinstance(v, bool):
+                measures[k] = {"value": v}
+        if measures:
+            out[name] = measures
+    with open(a.out, "w") as f:
+        json.dump(out, f, indent=1)
+    print(f"bmf: {len(out)} benchmark(s) → {a.out}")
+
+
+# ─────────────────────── fetch-main ──────────────────────
+def cmd_fetch_main(a):
+    """Pull the base SHA's neutral results JSON from bencher.dev.
+
+    The testbed comes from BACKEND_TABLE — supported (backend, mode) pairs are
+    exactly the table's metrics keys. Exit codes are load-bearing for
+    bench-pr.yml: 3 = transient (bencher has no report at that hash yet, or
+    the API failed after retries) — the caller falls back to running main
+    locally; 2 = permanent config error ((backend, mode) not in BACKEND_TABLE,
+    i.e. table / bench-main.yml drift) — the caller fails the cell loudly
+    instead of paying the fallback forever.
+
+    A PARTIAL miss (bencher answered, but some --names entries have no data —
+    e.g. constants the PR adds to Vectors.csv) still exits 0: main.json holds
+    what bencher had, and --missing-out lists the uncovered names so the
+    caller can measure just those against the base checkout and merge.
+    """
+    entry = BACKEND_TABLE.get(a.backend)
+    testbed = entry["testbed"] if entry and a.mode in entry["metrics"] else None
+    if not testbed:
+        print(f"fetch-main: no main testbed for {a.backend}/{a.mode}")
+        raise SystemExit(2)
+    wanted = set(open(a.names).read().split()) if a.names else None
+    # ooc's headline row is keyed by the CamelCase env slug (not a Vectors.csv
+    # constant), so names.txt alone would filter it out — admit it explicitly.
+    if wanted is not None and a.env:
+        wanted.add(ENV_CC.get(a.env, a.env))
+    # TODO: support any base/PR branch, not just `main`. Today bench-main.yml
+    # only runs on push to main and this query hardcodes `branch=main`, so a PR
+    # against a non-main base branch (e.g. a long-running feature branch) always
+    # falls through to the local base-run path. To generalise: (1) let
+    # bench-main.yml (or a sibling) upload reports for other tracked branches,
+    # (2) plumb `--branch` here from `github.base_ref` in bench-pr.yml, (3) fall
+    # back to `main` when the base branch has no bencher data.
+    # Bencher stores the git hash at `branch.head.version.hash`.
+    def _report_hash(r):
+        return (((r.get("branch") or {}).get("head") or {}).get("version") or {}).get("hash")
+
+    def _get_json(url, attempts=3):
+        for i in range(attempts):
+            try:
+                with urllib.request.urlopen(url, timeout=15) as f:
+                    return json.load(f)
+            except Exception as e:
+                if i == attempts - 1:
+                    raise
+                print(f"fetch-main: attempt {i + 1} failed ({e}); retrying")
+                time.sleep(2 ** i)
+
+    # Page newest-first until the base SHA's reports are found (each matrix
+    # env uploads its own report, all within one push's CI window, so once
+    # we've matched and a later page yields no match we're past it). A transient
+    # API error is retried before the expensive local-base fallback fires.
+    per_page = 255
+    at_sha, page = [], 1
+    while page <= 8:  # 2040 newest reports — far beyond a realistic backlog
+        params = {"branch": "main", "testbed": testbed,
+                  "per_page": per_page, "page": page}
+        url = ("https://api.bencher.dev/v0/projects/ix/reports?"
+               + urllib.parse.urlencode(params))
+        try:
+            reports = _get_json(url)
+        except Exception as e:
+            print(f"fetch-main: bencher API error: {e}")
+            raise SystemExit(3)
+        matches = [r for r in reports if _report_hash(r) == a.sha]
+        if at_sha and not matches:
+            break            # past the SHA's window
+        at_sha += matches
+        if len(reports) < per_page:
+            break            # end of data
+        page += 1
+    if not at_sha:
+        print(f"fetch-main: no reports for {a.backend}/{a.mode} @ {a.sha[:8]}")
+        raise SystemExit(3)
+    # Matrix envs upload separately to the same testbed at the same commit,
+    # each contributing its own benchmark subset — aggregate across reports.
+    # Filter/emit by `name` (Bencher's `slug` is a lower-kebab-cased derivation
+    # that would mangle Lean names like `Nat.add_comm` → `nat-add-comm`).
+    out = {}
+    for r in at_sha:
+        for iteration in r.get("results", []):
+            for bench in iteration:
+                name = bench["benchmark"]["name"]
+                if wanted is not None and name not in wanted:
+                    continue
+                metrics = {
+                    m["measure"]["name"]: m["metric"]["value"]
+                    for m in bench.get("measures", [])
+                }
+                if metrics:
+                    out[name] = metrics
+    if not out:
+        print(f"fetch-main: reports found but no matching benchmarks in --names")
+        raise SystemExit(3)
+    # Names the PR side selected (its Vectors.csv) that bencher has no data
+    # for at this SHA — typically constants the PR itself adds to the CSV.
+    # The caller runs the base checkout on JUST these and merges, so a new
+    # constant still gets a real main-vs-PR delta on its first !benchmark.
+    # Computed against names.txt verbatim (not the ENV_CC-augmented `wanted`):
+    # the env-keyed row is an admit-filter, not a per-constant expectation.
+    if a.missing_out:
+        name_set = set(open(a.names).read().split()) if a.names else set()
+        missing = sorted(name_set - set(out))
+        with open(a.missing_out, "w") as f:
+            f.write("\n".join(missing) + ("\n" if missing else ""))
+        if missing:
+            print(f"fetch-main: {len(missing)} name(s) not on bencher @ "
+                  f"{a.sha[:8]} (base run will measure): " + ", ".join(missing))
+    with open(a.out, "w") as f:
+        json.dump(out, f)
+    print(f"fetch-main: {len(out)} constant(s) from bencher for {a.backend}/{a.mode}")
+
+
+# ────────────────────────── cli ──────────────────────────
+def main():
+    """Build the subcommand CLI, parse argv, and dispatch to the chosen handler."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    verbs = cli.add_subparsers(dest="cmd", required=True)
+    verbs.add_parser("parse").set_defaults(fn=cmd_parse)
+
+    man = verbs.add_parser("manifest")
+    man.add_argument("--csv", required=True); man.add_argument("--env", required=True)
+    man.add_argument("--mode", required=True); man.add_argument("--tier", default="")
+    man.add_argument("--shard", default="0"); man.add_argument("--out", required=True)
+    man.add_argument("--backend", default="",
+                     help="Backend for this cell (used to special-case `compile`, "
+                          "which doesn't consume Vectors.csv).")
+    man.add_argument("--primary", action="store_true",
+                     help="Restrict to the primary subset (the primary=1 column).")
+    man.add_argument("--heavy-out", dest="heavy_out",
+                     help="Also write the selected heavy-tier names (one per "
+                          "line) — the subset zisk runs closure-sharded.")
+    man.set_defaults(fn=cmd_manifest)
+
+    bmf = verbs.add_parser("bmf")
+    bmf.add_argument("--in", dest="infile", required=True,
+                     help="Neutral results JSON (run.sh output).")
+    bmf.add_argument("--out", required=True,
+                     help="Bencher Metric Format JSON for `bencher run`.")
+    bmf.set_defaults(fn=cmd_bmf)
+
+    fetch = verbs.add_parser("fetch-main")
+    fetch.add_argument("--sha", required=True)
+    fetch.add_argument("--backend", required=True)
+    fetch.add_argument("--mode", required=True)
+    fetch.add_argument("--env", default="",
+                       help="Cell env; admits the env-keyed row (ooc whole-env) "
+                            "past the --names filter.")
+    fetch.add_argument("--names", help="Only fetch benchmarks whose names appear in this file.")
+    fetch.add_argument("--missing-out", dest="missing_out",
+                       help="Write the --names entries bencher had no data for "
+                            "(one per line; empty file when none) — the subset "
+                            "the caller should measure against the base checkout.")
+    fetch.add_argument("--out", required=True)
+    fetch.set_defaults(fn=cmd_fetch_main)
+
+    diff = verbs.add_parser("compare")
+    diff.add_argument("--main", required=True); diff.add_argument("--pr", required=True)
+    diff.add_argument("--metric", action="append", default=[])
+    diff.add_argument("--threshold", type=float, default=3.0)
+    diff.add_argument("--title"); diff.add_argument("--backend"); diff.add_argument("--env")
+    diff.add_argument("--mode"); diff.add_argument("--count"); diff.add_argument("--main-source", default="")
+    diff.add_argument("--out")
+    diff.set_defaults(fn=cmd_compare)
+
+    note = verbs.add_parser("comment")
+    note.add_argument("--tables", required=True); note.add_argument("--summary", default="")
+    note.add_argument("--head", required=True); note.add_argument("--repo-url", required=True)
+    note.add_argument("--run-id", required=True); note.add_argument("--out", required=True)
+    note.set_defaults(fn=cmd_comment)
+
+    args = cli.parse_args()
+    args.fn(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh
new file mode 100644
index 00000000..9acba71f
--- /dev/null
+++ b/.github/scripts/run.sh
@@ -0,0 +1,439 @@
+#!/usr/bin/env bash
+# Compile one library env to a `.ixe` from a checked-out repo (unless REUSE_IXE),
+# then benchmark the given backend, emitting the neutral results JSON
+#   { "<name>": { "<metric>": <number>, ... }, ... }
+# that bench.py compare / the bencher jobs consume.
+#
+#   run.sh <repo_dir> <env> <backend> <mode> <names_file> <out_json>
+#     repo_dir : checked-out worktree (has .lake/build/bin/{ix,bench-typecheck})
+#     env      : initStd | lean | mathlib | flt  (any case; used verbatim for <env>.ixe)
+#     backend  : aiur | zisk | sp1 | ooc | compile | cutshards
+#     mode     : execute | prove | compile (ignored by cutshards)
+#
+# `ix` / `bench-typecheck` come from <repo_dir> (so base measures base's code, PR
+# the PR's — the caller puts <repo_dir>/.lake/build/bin on PATH). For the
+# per-constant backends (aiur, zisk, sp1, ooc), each name is its own subprocess
+# so a failure/timeout drops only that row. The `compile` backend is per-env
+# (the env slug is the benchmark name) and measures the compile step directly.
+# Results JSON is written only to <out_json> — per-constant tool output goes to
+# logs and `::warning::`/`::error::` to stderr, so neither corrupts the merged JSON.
+set -uo pipefail
+
+repo=${1:?repo_dir}; benv=${2:?env}; backend=${3:?backend}; mode=${4:?mode}
+names=${5:?names}; out=${6:?out}
+# Absolute repo path: the zkVM branch cd's into the host workspace, so the .ixe
+# path passed to the host must not be relative to the original cwd.
+repo=$(cd "$repo" && pwd)
+: > "$out"
+emit_empty() { [ -s "$out" ] || echo '{}' > "$out"; }
+
+# Fold a tool's per-phase span timings (tracing-texray JSONL, one
+# `{"span":"…","seconds":N}` per closed span, possibly repeated per shard) into
+# its per-constant results file under a `phases` object, summed by span name —
+# the source bench.py renders as the comparative drill-down. No-op if the tool
+# emitted no spans.
+merge_phases() {  # <results.json> <spans.jsonl>
+  local results="$1" span_log="$2" totals
+  if [ ! -s "$span_log" ]; then return 0; fi
+  totals=$(jq -s 'reduce .[] as $o ({}; .[$o.span] += $o.seconds)' "$span_log" 2>/dev/null)
+  if [ -z "$totals" ] || [ "$totals" = "{}" ]; then return 0; fi
+  # Attach the summed span table to every row; swap the file in only on success.
+  { jq --argjson ph "$totals" 'map_values(. + {phases: $ph})' "$results" > "$results.p" \
+      && mv "$results.p" "$results"; } || true  # best-effort: never fails the caller
+}
+
+# Background RAM watchdog. Every ~3 s, sum RSS across `root_pid` and every
+# descendant (via `ps -eo pid,ppid,rss` + a small BFS); when the total exceeds
+# `max_gb`, touch `marker` and SIGKILL the whole process GROUP (the root is
+# started with `setsid`, so `kill -- -pid` reaches every descendant, not just
+# depth-1 children). Callers detect the kill by testing `-f "$marker"` after
+# wait. The 3 s cadence can lose a fast spike to the kernel OOM killer first —
+# callers treat exit 137 without a marker as OOM too.
+watch_ram_kill() {  # <root_pid> <max_gb> <marker>
+  local root_pid=$1 max_gb=$2 marker=$3
+  local max_kb=$((max_gb * 1024 * 1024)) total_kb
+  while kill -0 "$root_pid" 2>/dev/null; do
+    total_kb=$(ps -eo pid,ppid,rss --no-headers 2>/dev/null | awk -v root="$root_pid" '
+      { rss[$1]=$3; parent[$1]=$2 }
+      END {  # membership uses the in operator: reading alive[x] directly would CREATE key x
+        alive[root]=1; changed=1
+        while (changed) {  # fixpoint: keep adopting children of known tree members
+          changed=0
+          for (p in parent) if ((parent[p] in alive) && !(p in alive)) { alive[p]=1; changed=1 }
+        }
+        s=0; for (p in alive) s += rss[p]+0  # keys are now ONLY root + its descendants
+        print s
+      }')
+    if [ -n "$total_kb" ] && [ "$total_kb" -gt "$max_kb" ]; then
+      echo "::warning::RAM watchdog: killing pid=$root_pid tree-RSS=${total_kb}kB > ${max_kb}kB (~${max_gb} GB)" >&2
+      : > "$marker"
+      kill -KILL -- "-$root_pid" 2>/dev/null || kill -KILL "$root_pid" 2>/dev/null || true
+      return
+    fi
+    sleep 3
+  done
+}
+
+# Merge the OOM sentinel into a constant's results file, PRESERVING any
+# metrics measured before the kill (bench-typecheck persists Phase-1
+# fft-cost/execute-time before the prove starts). bench.py compare renders
+# `OOM` only for the metrics that are absent.
+mark_oom() {  # <results.json> <name>
+  local results="$1" const="$2"
+  local sentinel='{($n): {oom: true}}'  # bare row, used when there is nothing to merge into
+  if [ ! -s "$results" ]; then
+    jq -n --arg n "$const" "$sentinel" > "$results"
+  elif ! { jq --arg n "$const" '.[$n] = ((.[$n] // {}) + {oom: true})' "$results" \
+             > "$results.o" && mv "$results.o" "$results"; }; then
+    jq -n --arg n "$const" "$sentinel" > "$results"
+  fi
+}
+
+# A prove can die of memory three ways: the watchdog's group-kill (marker
+# file), the kernel OOM killer (SIGKILL → exit 137, no marker), or an
+# ALLOCATION-FAILURE ABORT — one huge trace allocation fails while total RSS
+# is still under the watchdog ceiling, and the Rust/Lean runtime aborts
+# (SIGABRT → exit 134) with an allocator message in the log. All three are
+# OOM for the benchmark table.
+looks_like_oom() {  # <exit> <marker> <log>
+  local status="$1" kill_marker="$2" run_log="$3"
+  if [ -f "$kill_marker" ] || [ "$status" -eq 137 ]; then return 0; fi
+  # SIGABRT (134) counts only when the log carries an allocator failure message.
+  [ "$status" -eq 134 ] || return 1
+  grep -qiE \
+    'memory allocation of .* failed|std::bad_alloc|out of memory|(unable|failed) to allocate' \
+    "$run_log" 2>/dev/null || return 1
+}
+
+# `$benv` is used verbatim for the `.ixe` filename (bench-pr compiles `initStd.ixe`;
+# the bencher jobs reuse the compile job's cached `InitStd.ixe`), and lowercased
+# only to pick the Compile module. `$benv_cc` is the CamelCase form — the
+# canonical BENCHMARK KEY for env-keyed rows (ooc whole-env, compile), so the
+# PR side (`initStd`) and the bencher side (`InitStd`, from bench-main's
+# matrix.bench) agree on one name.
+case "$(printf '%s' "$benv" | tr '[:upper:]' '[:lower:]')" in
+  initstd) module=CompileInitStd; benv_cc=InitStd ;;
+  lean)    module=CompileLean;    benv_cc=Lean ;;
+  mathlib) module=CompileMathlib; benv_cc=Mathlib ;;
+  flt)     module=CompileFLT;     benv_cc=FLT ;;
+  *) echo "unknown env: $benv" >&2; exit 2 ;;
+esac
+
+# Tool resolution: prefer the in-tree build (so base measures base's code, PR
+# the PR's), fall back to PATH — CI restores cached binaries onto PATH instead
+# of building in-tree. Resolved LAZILY at each use site: the zkVM branch needs
+# neither `ix` nor `bench-typecheck` when REUSE_IXE short-circuits the compile
+# (bench-main's zkvm-execute job restores only the `.ixe` cache, no binaries).
+resolve_bin() {  # <name> → prints the path, or fails
+  local tool="$1" built="$repo/.lake/build/bin/$1"
+  # The in-tree build wins; otherwise defer to whatever PATH provides.
+  if [ -x "$built" ]; then printf '%s' "$built"; return; fi
+  command -v "$tool" || { echo "::error::$tool not found (in-tree or PATH)" >&2; return 2; }
+}
+
+tmp=$(mktemp -d)
+compile_log="$tmp/compile.log"
+
+# Closure-shard artifacts for the zisk heavy tier: `ix shard extract` cuts a
+# standalone closure-only env (no recompile), `ix profile` → `ix shard` cut
+# its manifest (the canonical partitioner: profiled heartbeats + min-cut,
+# capped by predicted RAM). One dir per env; slugs must match the zkvm loop's
+# result keys (same `tr` set).
+shards_dir_for_env() { printf '%s' "$repo/zkshards-$benv"; }
+cut_closure_shards() {  # <name> <slug> → 0 when <dir>/$slug.{ixe,ixes} are ready
+  local c="$1" slug="$2" dir ix_bin rc
+  dir=$(shards_dir_for_env)
+  [ -f "$dir/$slug.ixes" ] && [ -f "$dir/$slug.ixe" ] && return 0
+  ix_bin=$(resolve_bin ix 2>/dev/null) || {
+    echo "::warning::no ix binary to cut closure shards for '$c'" >&2
+    return 1
+  }
+  mkdir -p "$dir"
+  echo "::group::ix shard extract + profile + shard: $c"
+  "$ix_bin" shard extract "$ixe" --consts "$c" --out "$dir/$slug.ixe" \
+    && "$ix_bin" profile "$dir/$slug.ixe" --out "$dir/$slug.ixprof" \
+    && "$ix_bin" shard "$dir/$slug.ixprof" \
+         --max-ram "${SHARD_MAX_RAM_GB:-120}" --out "$dir/$slug.ixes"
+  rc=$?
+  echo "::endgroup::"
+  [ "$rc" -eq 0 ] || {
+    echo "::warning::extract/profile/shard failed for '$c'" >&2
+    rm -f "$dir/$slug.ixes"
+    return 1
+  }
+}
+
+# `compile` backend needs a fresh compile to measure — never honor REUSE_IXE.
+ixe="$repo/$benv.ixe"
+if [ "${REUSE_IXE:-0}" = 1 ] && [ "$backend" != compile ] && [ -f "$ixe" ]; then
+  echo "reusing existing $ixe (REUSE_IXE)" >&2
+else
+  ix_bin=$(resolve_bin ix) || exit 2
+  echo "::group::ix compile $module → $benv.ixe ($backend/$mode)"
+  "$ix_bin" compile "$repo/Benchmarks/Compile/$module.lean" \
+    --out "$ixe" 2>&1 | tee "$compile_log"
+  echo "::endgroup::"
+fi
+
+case "$backend" in
+  aiur)
+    # One bench-typecheck per constant (isolation + per-constant peak-rss).
+    # Execute mode → Phase 1 only (--execute-only). Prove mode → always attempt
+    # a full prove (no tier gate), bounded two ways: a RAM watchdog SIGKILLs
+    # the process group when tree-RSS nears the runner's ceiling (the constant
+    # then records the `oom: true` sentinel — merged into any Phase-1 metrics
+    # already measured — so bench.py compare renders `OOM` instead of dropping
+    # the row), and a wall-clock `timeout` bounds a runaway prove. `$out` is
+    # re-merged after every constant so a job-level kill mid-loop still leaves
+    # the completed rows on disk.
+    ceiling_gb=${AIUR_PROVE_MAX_RSS_GB:-120}
+    bt_bin=$(resolve_bin bench-typecheck) || exit 2
+    rows="$tmp/rows"; mkdir -p "$rows"
+    while IFS= read -r c; do
+      [ -z "$c" ] && continue
+      slug=$(printf '%s' "$c" | tr '/ .:' '____')
+      res="$tmp/$slug.json"; spans="$res.spans"; oom="$tmp/$slug.oom"
+      rm -f "$oom"
+      # bench-typecheck self-reports peak-rss (texray tree sampler) in its --json;
+      # with --texray + --json it also writes per-phase aiur/*, stark/* timings
+      # to `<json>.spans` for the drill-down.
+      if [ "$mode" = execute ]; then
+        timeout "${AIUR_EXECUTE_TIMEOUT:-25m}" \
+          "$bt_bin" --ixe "$ixe" --consts "$c" --json "$res" --execute-only --texray \
+          > "$tmp/$slug.log" 2>&1 \
+          || { echo "::warning::aiur execute '$c' failed/timed out; dropping" >&2; continue; }
+      else
+        # setsid: bench-typecheck leads its own process group so the watchdog's
+        # group-kill reaches every descendant.
+        setsid timeout "${AIUR_PROVE_TIMEOUT:-50m}" \
+          "$bt_bin" --ixe "$ixe" --consts "$c" --json "$res" --texray \
+          > "$tmp/$slug.log" 2>&1 &
+        bt_pid=$!
+        watch_ram_kill "$bt_pid" "$ceiling_gb" "$oom" &
+        w_pid=$!
+        wait "$bt_pid" 2>/dev/null; bt_exit=$?
+        kill "$w_pid" 2>/dev/null || true
+        wait "$w_pid" 2>/dev/null || true
+        if looks_like_oom "$bt_exit" "$oom" "$tmp/$slug.log"; then
+          echo "::warning::aiur prove '$c' OOM (exit $bt_exit, marker=$([ -f "$oom" ] && echo watchdog || echo runtime), ceiling ${ceiling_gb} GB)" >&2
+          mark_oom "$res" "$c"
+        elif [ "$bt_exit" -ne 0 ]; then
+          echo "::warning::aiur prove '$c' failed (exit $bt_exit); dropping" >&2
+          { sed -n '1,5p' "$tmp/$slug.log"; echo "  …"; tail -n 3 "$tmp/$slug.log"; } >&2 || true
+          continue
+        fi
+      fi
+      merge_phases "$res" "$spans"
+      if [ -s "$res" ]; then
+        cp "$res" "$rows/$slug.json"
+        jq -s 'reduce .[] as $o ({}; . + $o)' "$rows"/*.json > "$out" 2>/dev/null || true
+      fi
+    done < "$names"
+    emit_empty
+    ;;
+
+  zisk|sp1)
+    # zkVM prove is not currently wired up (no GPU runner), so this branch runs
+    # execute only. The workflow filters `zisk|sp1 prove` at parse time.
+    if [ "$mode" != execute ]; then
+      echo "::error::$backend $mode: only execute mode is supported" >&2
+      emit_empty; exit 2
+    fi
+    host="${backend}-host"; work="$repo/$backend"
+    # Build the host once so per-constant timing excludes compilation. The host
+    # self-measures and writes its own neutral results JSON via `--json`
+    # (cycles/execute-time/throughput/peak-rss), so there is nothing to grep —
+    # `timeout` only bounds a runaway constant.
+    echo "::group::cargo build $host"
+    ( cd "$work" && cargo build --quiet --release --bin "$host" )
+    echo "::endgroup::"
+    bin="$work/target/release/$host"
+    # ZisK's ASM microservices mmap the ROM with MAP_LOCKED, needing unlimited
+    # locked memory (the Zisk book's DefaultLimitMEMLOCK=infinity). The warp
+    # runner caps the memlock hard limit and can't be rebooted, so raise it
+    # in-session as root; the host children inherit it. Without this the ASM
+    # services die with `mmap(rom) errno=11`. SP1 needs no such raise.
+    [ "$backend" = zisk ] && sudo prlimit --pid $$ --memlock=unlimited:unlimited
+    # A group-killed Zisk run skips the host's Drop-time cleanup of its
+    # /dev/shm/ZISK_* segments and semaphores (multi-GB — the MT output segment
+    # alone starts at 6 GB), so the NEXT host launch fails creating its own
+    # segments (tmpfs / MAP_LOCKED exhaustion) before Zisk's startup stale-pid
+    # sweep can save it — one dropped constant per watchdog kill. Sweep the
+    # dead run's debris ourselves; nothing zisk-related is alive at call time.
+    zisk_shm_sweep() {
+      [ "$backend" = zisk ] || return 0
+      pkill -KILL -f -- '--shm_prefix ZISK_' 2>/dev/null
+      rm -f /dev/shm/ZISK_* /dev/shm/sem.ZISK_* 2>/dev/null
+      return 0
+    }
+    ceiling_gb=${ZKVM_EXECUTE_MAX_RSS_GB:-120}
+    rows="$tmp/rows"; mkdir -p "$rows"
+    # One watchdog-guarded guest run, keyed `$key` in the results. Full
+    # closures are RAM-unbounded (the ASM microservices mmap multi-GB ROMs on
+    # top of the guest trace), so the same watchdog as the aiur prove path
+    # guards the runner. `exec setsid`: the subshell (whose pid is $!)
+    # replaces itself with the session leader, so the watchdog's group-kill
+    # (`kill -- -$!`) reaches the host and every descendant — without it a
+    # plain subshell wrapper's pgid would be run.sh's own. The host writes
+    # $res only on a clean (zero-failure) run; `$out` is re-merged per run so
+    # a job-level kill keeps completed rows.
+    zkvm_run() {  # <timeout> <key> <ixe> <host args…>
+      local run_timeout="$1" key="$2" run_ixe="$3"; shift 3
+      local slug; slug=$(printf '%s' "$key" | tr '/ .:' '____')
+      local res="$tmp/$slug.json" log="$tmp/$slug.log" oom="$tmp/$slug.oom"
+      local spans="$res.spans" zk_pid w_pid zk_exit
+      rm -f "$oom"
+      ( cd "$work" && exec setsid timeout "$run_timeout" "$bin" --execute \
+          --ixe "$run_ixe" --json "$res" --texray "$@" ) \
+        > "$log" 2>&1 &
+      zk_pid=$!
+      watch_ram_kill "$zk_pid" "$ceiling_gb" "$oom" &
+      w_pid=$!
+      wait "$zk_pid" 2>/dev/null; zk_exit=$?
+      kill "$w_pid" 2>/dev/null || true
+      wait "$w_pid" 2>/dev/null || true
+      [ "$zk_exit" -ne 0 ] && zisk_shm_sweep
+      if looks_like_oom "$zk_exit" "$oom" "$log"; then
+        echo "::warning::$backend execute '$key' OOM (exit $zk_exit, marker=$([ -f "$oom" ] && echo watchdog || echo runtime), ceiling ${ceiling_gb} GB)" >&2
+        mark_oom "$res" "$key"
+      elif [ "$zk_exit" -ne 0 ] && grep -q 'kernel typecheck produced' "$log" 2>/dev/null; then
+        # The kernel REJECTED the constant (the host fails fast and aborts
+        # any remaining shards). Record the `failed` sentinel — compare
+        # renders a ❌ row + loud note, and the workflow fails at the end.
+        echo "::error::$backend: '$key' FAILED TO TYPECHECK — kernel rejected it" >&2
+        tail -n 3 "$log" >&2 || true
+        jq -n --arg n "$key" '{($n): {failed: true}}' > "$res"
+      elif [ "$zk_exit" -ne 0 ]; then
+        echo "::warning::$backend execute '$key' failed/timed out (exit $zk_exit); dropping" >&2
+        # Head for early failures (name resolution), tail for late ones.
+        { sed -n '1,5p' "$log"; echo "  …"; tail -n 3 "$log"; } >&2 || true
+        return 0
+      fi
+      merge_phases "$res" "$spans"
+      if [ -s "$res" ]; then
+        cp "$res" "$rows/$slug.json"
+        jq -s 'reduce .[] as $o ({}; . + $o)' "$rows"/*.json > "$out" 2>/dev/null || true
+      fi
+    }
+    # Closure-sharded pipeline for the heavy tier (zisk only). A heavy
+    # constant's full closure blows the runner's RAM as a single leaf, so it
+    # runs as its shard-manifest partition instead: `ix shard extract` cuts a
+    # standalone closure-only env, `ix profile` → `ix shard` cut the manifest
+    # (the canonical partitioner: profiled heartbeats + min-cut, capped by
+    # predicted RAM), and one `--shard-plan` host run executes the shards
+    # sequentially, emitting the constant's row (totals + per-shard
+    # breakdown). bench-main pre-cuts the artifacts in the compile job and
+    # ships them via cache; the PR side cuts its own — a PR can change the
+    # cost profile, and profiling counts heartbeats (not wall time) so an
+    # unchanged tree re-partitions deterministically. If cutting isn't
+    # possible (no ix binary, or a failure), fall back to the single-leaf
+    # run — the watchdog then records the honest OOM row.
+    heavy_file="${ZISK_HEAVY_NAMES:-}"
+    is_heavy() {
+      [ -n "$heavy_file" ] && [ -f "$heavy_file" ] && grep -qxF "$1" "$heavy_file"
+    }
+    shards_dir=$(shards_dir_for_env)
+    # Full-closure check (no --skip-deps) so this is directly comparable to
+    # the ooc `ix check-rs --anon --consts` run — the delta then isolates the
+    # in-circuit-vs-out-of-circuit overhead rather than mixing in subject-
+    # only vs full-closure scope.
+    while IFS= read -r c; do
+      [ -z "$c" ] && continue
+      slug=$(printf '%s' "$c" | tr '/ .:' '____')
+      if [ "$backend" = zisk ] && is_heavy "$c" && cut_closure_shards "$c" "$slug"; then
+        zkvm_run "${ZKVM_EXECUTE_TIMEOUT:-25m}" "$c" "$shards_dir/$slug.ixe" \
+          --shard-plan "$shards_dir/$slug.ixes" --json-name "$c"
+      else
+        zkvm_run "${ZKVM_EXECUTE_TIMEOUT:-25m}" "$c" "$ixe" --consts "$c"
+      fi
+    done < "$names"
+    emit_empty
+    ;;
+
+  ooc)
+    # Out-of-circuit Rust kernel (far faster than proving). Two views, both keyed
+    # off the structured line
+    #   `##check## <elapsed_ms> <passed> <failures> <total> <peak-rss-bytes>`
+    # (peak-rss from ix check's tracing-texray tree sampler): the whole env in
+    # parallel (`--anon`, keyed by env), and a per-primary check
+    # (`--anon --consts`, keyed by constant) that runs the constant's FULL
+    # dependency closure in anon mode — the same mode and scope as the zkVM
+    # execute above, so the delta isolates in-circuit vs out-of-circuit
+    # overhead rather than mixing in closure-size or metadata effects.
+    ix_bin=$(resolve_bin ix) || exit 2
+    ooc_one() {  # <label> <ix-check-args…>  → prints one JSON object
+      local label="$1"; shift
+      local log="$tmp/n.out"  # stdout+stderr share ONE fd (2>&1); two opens would overwrite each other
+      "$ix_bin" check-rs "$ixe" "$@" > "$log" 2>&1 \
+        || { echo "::warning::ooc '$label' check failed; dropping" >&2; return; }
+      local line ems fl tot rss  # ##check## fields: $2 elapsed-ms, $4 failures, $5 total, $6 peak-rss
+      line=$(grep '^##check##' "$log" | tail -1)
+      ems=$(echo "$line" | awk '{print $2}'); fl=$(echo "$line" | awk '{print $4}')
+      tot=$(echo "$line" | awk '{print $5}'); rss=$(echo "$line" | awk '{print $6}')
+      if [ -n "${fl:-}" ] && [ "$fl" != 0 ]; then
+        echo "::error::ooc: '$label' FAILED TO TYPECHECK — kernel rejected $fl item(s)" >&2
+        jq -n --arg n "$label" '{($n): {failed: true}}'
+        return
+      fi
+      [ -n "${tot:-}" ] \
+        || { echo "::warning::ooc '$label': bad ##check## line; dropping" >&2; return; }
+      local cs tp
+      cs=$(awk -v e="$ems" 'BEGIN{printf "%.3f", e/1000}')
+      tp=$(awk -v t="$tot" -v e="$ems" 'BEGIN{ if (e>0) printf "%.2f", t*1000/e; else print 0 }')
+      jq -n --arg n "$label" --argjson c "$tot" --argjson s "$cs" --argjson tp "$tp" \
+            --argjson rss "${rss:-0}" \
+        '{($n): {constants:$c, "check-time":$s, throughput:$tp, "peak-rss":$rss}}'
+    }
+    {
+      # Whole-env row keyed by the CamelCase env slug so the PR side matches
+      # what bench-main.yml uploads to bencher (matrix.bench, e.g. `InitStd`).
+      ooc_one "$benv_cc" --anon
+      while IFS= read -r c; do
+        [ -z "$c" ] && continue
+        ooc_one "$c" --anon --consts "$c"
+      done < "$names"
+    } | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null
+    emit_empty
+    ;;
+
+  cutshards)
+    # Pre-cut the closure-shard artifacts for every name (bench-main's
+    # compile job — it has `ix` + the Lean toolchain next to the fresh
+    # `.ixe`, so the zkvm job stays Lean-free and just restores the dir).
+    # Exactly the artifacts the zisk branch cuts lazily when absent.
+    while IFS= read -r c; do
+      [ -z "$c" ] && continue
+      slug=$(printf '%s' "$c" | tr '/ .:' '____')
+      cut_closure_shards "$c" "$slug" || true
+    done < "$names"
+    emit_empty
+    ;;
+
+  compile)
+    # `ix compile <env>.lean → <env>.ixe` is the benchmark; the compile step
+    # above always ran fresh for this backend (REUSE_IXE ignored) and teed to
+    # `$compile_log`. `ix compile` emits `##benchmark## <elapsed_ms> <bytes>
+    # <constants>` which we parse into the neutral results shape. The bencher
+    # benchmark name is the CamelCase env slug (matches bench-main.yml's
+    # matrix.bench keys: `InitStd`, `Lean`, `Mathlib`, `FLT`).
+    line=$(grep '^##benchmark##' "$compile_log" 2>/dev/null | tail -1)
+    if [ -z "$line" ]; then
+      echo "::warning::compile: no ##benchmark## line in $compile_log; dropping" >&2
+      emit_empty
+    else
+      elapsed_ms=$(echo "$line" | awk '{print $2}')
+      bytes=$(echo "$line" | awk '{print $3}')
+      constants=$(echo "$line" | awk '{print $4}')
+      elapsed_s=$(awk -v e="$elapsed_ms" 'BEGIN{printf "%.3f", e/1000}')
+      throughput=$(awk -v c="$constants" -v e="$elapsed_ms" \
+        'BEGIN{ if (e>0) printf "%.2f", c*1000/e; else print 0 }')
+      jq -n --arg n "$benv_cc" \
+            --argjson t "$elapsed_s" --argjson b "$bytes" \
+            --argjson c "$constants" --argjson tp "$throughput" \
+        '{($n): {"compile-time":$t,"file-size":$b,"constants":$c,"throughput":$tp}}' \
+        > "$out"
+    fi
+    ;;
+
+  *) echo "unknown backend: $backend" >&2; exit 2 ;;
+esac
+echo "rows in $out: $(jq 'length' "$out" 2>/dev/null || echo '?')" >&2
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
index 957b51b3..71e12911 100644
--- a/.github/workflows/bench-main.yml
+++ b/.github/workflows/bench-main.yml
@@ -1,24 +1,33 @@
-name: Aiur benchmarks
+name: Benchmark main
 
-# One workflow, two benchmarks per library env, on every push to main:
-#   1. compile job — `ix compile` the Lean env to a `.ixe` (compile-throughput
-#                    metrics) and cache the `.ixe`.
-#   2. prove job   — restore that `.ixe` from the cache (no recompile) and
-#                    STARK-check selected constants over it via bench-typecheck
-#                    (Aiur execute + prove metrics).
-# The prove job reuses the exact `.ixe` the compile job built, so the compiler
-# runs once. Compile and prove report to separate bencher testbeds so each one's
-# `--thresholds-reset` only touches its own measures.
+# Benchmarks tracked on Bencher on every push to main, all reusing the one
+# compiled `.ixe` so the compiler runs once:
+#   1. compile      — `ix compile` the Lean env to a `.ixe` (compile-throughput
+#                     metrics) and cache the `.ixe`.
+#   2. prove        — restore that `.ixe` (no recompile) and STARK-check selected
+#                     constants over it via bench-typecheck (Aiur execute + prove).
+#   3. zkvm-execute — restore that `.ixe` and execute the same constants through
+#                     the Zisk and SP1 zkVM hosts (deterministic cycle counts +
+#                     time/throughput/RAM; proving needs a GPU, so execute-only).
+#   4. ooc-check    — restore that `.ixe` and run the out-of-circuit Rust kernel
+#                     (the same kernel, out-of-circuit and parallel — far faster)
+#                     over the whole env via `ix check --anon`, tracking throughput.
+# Each job reports to its own bencher testbed/workload so a threshold reset only
+# touches its own measures.
 
 on:
   push:
-    branches: main
+    branches: [main, sb/ci-benchmarks]   # TEMPORARY: test on this branch
   workflow_dispatch:
 
 permissions:
   contents: read
   checks: write
 
+# No concurrency group: push-to-main and manual dispatch only — every merged
+# commit gets benchmarked; a later merge must never cancel or queue behind an
+# in-flight run.
+
 env:
   COMPILE_DIR: Benchmarks/Compile
 
@@ -57,7 +66,7 @@ jobs:
       - uses: actions/cache/save@v5
         with:
           path: ~/.local/bin
-          key: aiur-bench-bins-${{ github.sha }}
+          key: bench-bins-${{ github.sha }}
 
   # Compile each library env to a `.ixe` and track compile throughput. Caches
   # the `.ixe` (keyed by sha + matrix job) for the prove job to consume.
@@ -85,7 +94,7 @@ jobs:
       - uses: actions/cache/restore@v5
         with:
           path: ~/.local/bin
-          key: aiur-bench-bins-${{ github.sha }}
+          key: bench-bins-${{ github.sha }}
       - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
       # FC's library env lives in a sibling `${COMPILE_DIR}FC` package dir, so
       # point COMPILE_DIR there for the FC matrix job.
@@ -110,35 +119,45 @@ jobs:
       # warning that must not fail the build.
       - run: lake build Compile${{ matrix.bench }}
         working-directory: ${{ env.COMPILE_DIR }}
-      # Serialize the env to a `.ixe` and emit the `##benchmark##` line.
-      - name: Run ix compile
+      # Serialize the env to a `.ixe` and measure the compile via run.sh's
+      # `compile` backend — the same driver + `##benchmark##` parser the
+      # !benchmark PR path uses, so the line format is parsed in exactly one
+      # place. run.sh writes `<Bench>.ixe` at the workspace root (the env arg
+      # is used verbatim for the filename) and neutral.json keyed by the
+      # CamelCase env slug; bmf wraps it for bencher.
+      - name: Run ix compile benchmark
+        run: |
+          bash .github/scripts/run.sh . ${{ matrix.bench }} compile compile /dev/null neutral.json
+          python3 .github/scripts/bench.py bmf --in neutral.json --out benchmark.json
+          cat benchmark.json
+      # Pre-cut the zisk closure-shard artifacts for the heavy primaries
+      # (`ix extract` → `ix profile` → `ix shard`, via run.sh's cutshards
+      # backend — the same code path the !benchmark PR side runs lazily).
+      # Done here, next to the fresh `.ixe` with `ix` + the Lean toolchain
+      # on hand, so the zkvm job stays Lean-free and just restores the dir.
+      - if: matrix.bench == 'InitStd' || matrix.bench == 'Mathlib'
+        name: Cut closure shards for heavy primaries
+        env:
+          REUSE_IXE: "1"
         run: |
-          ix compile ${{ env.COMPILE_DIR }}/Compile${{ matrix.bench }}.lean \
-            --out ${{ matrix.bench }}.ixe 2>&1 | tee output.txt
-      # Cache the `.ixe` for the prove job (reused, never recompiled there).
-      # Only the matrix jobs the prove job consumes, to stay under the repo cache limit.
+          benv="${{ matrix.bench }}"; benv="${benv,}"
+          python3 .github/scripts/bench.py manifest \
+            --csv Benchmarks/Vectors.csv --env "$benv" --mode execute --primary \
+            --heavy-out heavy.txt --out /dev/null
+          echo "cutting closure shards for $(wc -l < heavy.txt) heavy primaries:"; cat heavy.txt
+          bash .github/scripts/run.sh . "${{ matrix.bench }}" cutshards execute heavy.txt /dev/null
+      # Cache the `.ixe` + closure-shard artifacts for the prove/zkvm/ooc jobs
+      # (reused, never regenerated there). Only the matrix jobs those
+      # consume, to stay under the repo cache limit. NB: every restore of
+      # this key must list the SAME paths — actions/cache versions the entry
+      # by its path list.
       - if: matrix.bench == 'InitStd' || matrix.bench == 'Mathlib'
         uses: actions/cache/save@v5
         with:
-          path: ${{ matrix.bench }}.ixe
-          key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
-      - name: Generate compile benchmark JSON
-        run: |
-          line=$(grep '^##benchmark##' output.txt)
-          elapsed_s=$(echo "$line" | awk '{printf "%.3f", $2 / 1000}')
-          bytes=$(echo "$line" | awk '{print $3}')
-          constants=$(echo "$line" | awk '{print $4}')
-          throughput=$(echo "$line" | awk '{if ($2 > 0) printf "%.2f", $4 * 1000 / $2; else print 0}')
-          cat > benchmark.json <<EOF
-          {
-            "${{ matrix.bench }}": {
-              "compile-time": {"value": ${elapsed_s}},
-              "file-size": {"value": ${bytes}},
-              "throughput": {"value": ${throughput}},
-              "constants": {"value": ${constants}}
-            }
-          }
-          EOF
+          path: |
+            ${{ matrix.bench }}.ixe
+            zkshards-${{ matrix.bench }}
+          key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
       # Upload compile metrics. Every measure shares the per-workload baseline
       # window (data points since the ix-compile reset tag): file-size/constants
       # are deterministic, pinned exactly (0/0); compile-time rides a 5% upper
@@ -173,11 +192,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        include:
-          - bench: InitStd
-            consts: Nat.add_comm Nat.sub_le_of_le_add String.append Array.append_assoc
-          - bench: Mathlib
-            consts: Nat.factorial Nat.Coprime Nat.Prime.two_le
+        bench: [InitStd, Mathlib]
     steps:
       - uses: actions/checkout@v6
         with:
@@ -186,7 +201,7 @@ jobs:
       - uses: actions/cache/restore@v5
         with:
           path: ~/.local/bin
-          key: aiur-bench-bins-${{ github.sha }}
+          key: bench-bins-${{ github.sha }}
       - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
       # Provision the toolchain so the bench-typecheck binary finds libleanshared
       # (no package build). use-github-cache off: nothing to cache here, and
@@ -197,10 +212,13 @@ jobs:
           build: false
           use-github-cache: false
       # Pull the `.ixe` the compile job built — do NOT recompile it here.
+      # (The path list must match the compile job's save exactly.)
       - uses: actions/cache/restore@v5
         with:
-          path: ${{ matrix.bench }}.ixe
-          key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
+          path: |
+            ${{ matrix.bench }}.ixe
+            zkshards-${{ matrix.bench }}
+          key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
           fail-on-cache-miss: true
       # Run each constant in its own process so a clean failure or timeout drops
       # only that constant from the report. NB: a constant heavy enough to OOM
@@ -213,43 +231,37 @@ jobs:
       # fold the max over spans in as `peak-rss` (bytes), the proving RSS
       # high-water mark.
       - name: Run Aiur typecheck benchmark
+        env:
+          REUSE_IXE: "1"
         run: |
-          measure() {
-            local c="$1" rss
-            timeout 20m bench-typecheck --ixe ${{ matrix.bench }}.ixe "$c" \
-              --json "res-$c.json" --texray 2>"tx-$c.log" \
-              || echo "warning: $c failed (OOM/timeout); dropping it from this report"
-            rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>max {max=$2+0} END {if (max>0) print max}' "tx-$c.log")
-            if [ -f "res-$c.json" ] && [ -n "$rss" ] && [ "$rss" -gt 0 ]; then
-              jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' \
-                "res-$c.json" > "res-$c.json.tmp" && mv "res-$c.json.tmp" "res-$c.json" || true
-            fi
-          }
-          for c in ${{ matrix.consts }}; do measure "$c"; done
-          # Merge the per-constant results; if none produced anything, emit `{}`.
-          jq -s 'reduce .[] as $o ({}; . + $o)' res-*.json > results.json 2>/dev/null \
-            || echo '{}' > results.json
-          [ -s results.json ] || echo '{}' > results.json
-          # Wrap each metric value as { "value": v } for Bencher Metric Format.
-          # bench-typecheck already emits slug keys (constants, fft-cost,
-          # execute-time, prove-time, throughput = constants/prove-time); peak-rss
-          # is injected above.
-          jq '
-            map_values(to_entries | map({(.key): {value: .value}}) | add)
-          ' results.json > aiur.json
+          # All primaries: run.sh attempts a full prove of every primary under
+          # the RAM watchdog; a too-large prove records the `oom: true` sentinel
+          # (alongside any Phase-1 execute metrics measured before the kill).
+          # Per-constant peak-rss, same path as the !benchmark PR run.
+          benv="${{ matrix.bench }}"; benv="${benv,}"   # InitStd→initStd, Mathlib→mathlib
+          python3 .github/scripts/bench.py manifest \
+            --csv Benchmarks/Vectors.csv --env "$benv" --mode prove --primary --out names.txt
+          echo "proving $(wc -l < names.txt) primary constants:"; cat names.txt
+          bash .github/scripts/run.sh . "${{ matrix.bench }}" aiur prove names.txt neutral.json
+          # neutral → Bencher Metric Format (phases flattened to phase:<span>
+          # measures; the boolean `oom` sentinel stripped — see bench.py bmf).
+          python3 .github/scripts/bench.py bmf --in neutral.json --out aiur.json
           cat aiur.json
       # Upload Aiur metrics. Every measure shares the per-workload baseline
-      # window (data points since the aiur reset tag). constants is deterministic
+      # window (data points since the aiur-check reset tag). constants is deterministic
       # → pinned exactly (0/0). fft-cost is deterministic but only ever drops on
       # a real Aiur win, so it rides an upper-only 5% bound (flag a regression,
       # let wins through) rather than a hard pin. prove-time/execute-time,
       # peak-rss (texray's proving RSS high-water mark), and throughput
       # (constants/prove-time, where a drop is the regression) are noisy
-      # wall-clock and ride percentage bounds.
+      # wall-clock and ride percentage bounds. The per-phase `phase:<span>`
+      # measures are uploaded for trend visibility but intentionally left
+      # un-thresholded (noisy and dynamically named; the PR-comment drill-down
+      # does the phase-level alerting).
       - uses: ./.github/actions/bencher-track
         with:
-          testbed: aiur-typecheck-x64-32x
-          workload: aiur
+          testbed: aiur-check-x64-32x
+          workload: aiur-check
           file: aiur.json
           key: ${{ secrets.BENCHER_API_KEY }}
           github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -263,12 +275,223 @@ jobs:
             --threshold-measure prove-time --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
             --threshold-lower-boundary _
+            --threshold-measure verify-time --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
+            --threshold-measure proof-size --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.05
+            --threshold-lower-boundary _
             --threshold-measure execute-time --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
             --threshold-lower-boundary _
             --threshold-measure peak-rss --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
             --threshold-lower-boundary _
+            --threshold-measure execute-peak-rss --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
             --threshold-measure throughput --threshold-test percentage
             --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
             --threshold-lower-boundary 0.10
+      # A typecheck failure is a correctness regression, not a benchmark blip.
+      # The failing constants never reach bencher (bmf strips the sentinel and
+      # drops the empty entry) and the clean rows are uploaded above — now
+      # fail the job LOUDLY so the red X lands on the commit.
+      - name: Fail on typecheck failures
+        run: |
+          failed=$(jq -r 'to_entries[] | select(.value.failed == true) | .key' neutral.json)
+          if [ -n "$failed" ]; then
+            for c in $failed; do echo "::error::$c FAILED TO TYPECHECK"; done
+            exit 1
+          fi
+
+  # Execute the same constants through the Zisk and SP1 zkVM hosts and track
+  # cycles / execute-time / throughput / execute-peak-rss (and shards / max-shard-cycles
+  # for any sharded run). Lean-free: reuses the compile job's cached `.ixe` and
+  # only builds the Rust host. zkVM proving needs a GPU (absent here), so this is
+  # execute-only. Toolchain + deps come from the shared install-{zisk,sp1} actions.
+  zkvm-execute:
+    needs: compile
+    runs-on: warp-ubuntu-latest-x64-32x
+    # Per-constant loop (heavy primaries ride the RAM watchdog to OOM rows)
+    # plus the env-sharded whole-env run (its own 60m timeout in run.sh) —
+    # run.sh re-merges results incrementally, so even a job-level timeout
+    # would keep completed rows, but the bencher upload needs the job alive.
+    timeout-minutes: 150
+    strategy:
+      fail-fast: false
+      matrix:
+        # backend: [zisk, sp1] — sp1 disabled for now: its execute run is too
+        # slow for a per-push job. Re-add to the list to re-enable.
+        backend: [zisk]
+        bench: [InitStd, Mathlib]
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          fetch-tags: true   # bencher-track reads the bencher-thresholds-reset tag
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          cache-workspaces: ${{ matrix.backend }}
+      - name: Install Zisk
+        if: matrix.backend == 'zisk'
+        uses: ./.github/actions/install-zisk
+      - name: Install SP1
+        if: matrix.backend == 'sp1'
+        uses: ./.github/actions/install-sp1
+      # Pull the `.ixe` + the pre-cut closure-shard artifacts the compile job
+      # built — no recompile (REUSE_IXE), no `ix` needed here: run.sh's zisk
+      # branch runs each heavy primary as its shard-manifest partition when
+      # its artifacts are present in zkshards-<Bench>/.
+      - uses: actions/cache/restore@v5
+        with:
+          path: |
+            ${{ matrix.bench }}.ixe
+            zkshards-${{ matrix.bench }}
+          key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
+          fail-on-cache-miss: true
+      - name: Run ${{ matrix.backend }} execute benchmark
+        env:
+          REUSE_IXE: "1"
+        run: |
+          # The primary subset in execute mode = all primaries for the env
+          # (cheap + heavy); heavy ones run closure-sharded on zisk.
+          benv="${{ matrix.bench }}"; benv="${benv,}"   # InitStd→initStd
+          python3 .github/scripts/bench.py manifest \
+            --csv Benchmarks/Vectors.csv --env "$benv" --mode execute --primary \
+            --heavy-out heavy.txt --out names.txt
+          echo "executing $(wc -l < names.txt) primary constants ($(wc -l < heavy.txt) heavy):"; cat names.txt
+          export ZISK_HEAVY_NAMES="$GITHUB_WORKSPACE/heavy.txt"
+          bash .github/scripts/run.sh . ${{ matrix.bench }} ${{ matrix.backend }} execute \
+            names.txt neutral.json
+          # neutral → Bencher Metric Format (phases flattened; the boolean
+          # `oom` sentinel a watchdog-killed execute records is stripped —
+          # see bench.py bmf).
+          python3 .github/scripts/bench.py bmf --in neutral.json --out bench.json
+          cat bench.json
+      # cycles / shards / max-shard-cycles are deterministic per guest ELF, but
+      # a real guest / packer improvement legitimately drops them — upper-only
+      # 0% bound (flag regressions, let wins through), like `fft-cost` on the
+      # aiur job. execute-time / execute-peak-rss / throughput are noisy wall-clock →
+      # percentage bounds (throughput's regression is a drop).
+      - uses: ./.github/actions/bencher-track
+        with:
+          testbed: ${{ matrix.backend }}-check-x64-32x
+          workload: ${{ matrix.backend }}-check
+          file: bench.json
+          key: ${{ secrets.BENCHER_API_KEY }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          thresholds: |
+            --threshold-measure cycles --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
+            --threshold-lower-boundary _
+            --threshold-measure shards --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
+            --threshold-lower-boundary _
+            --threshold-measure max-shard-cycles --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
+            --threshold-lower-boundary _
+            --threshold-measure execute-time --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
+            --threshold-measure execute-peak-rss --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
+            --threshold-measure throughput --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
+            --threshold-lower-boundary 0.10
+      # See the prove job's twin step: reject → red X, after the clean upload.
+      - name: Fail on typecheck failures
+        run: |
+          failed=$(jq -r 'to_entries[] | select(.value.failed == true) | .key' neutral.json)
+          if [ -n "$failed" ]; then
+            for c in $failed; do echo "::error::$c FAILED TO TYPECHECK"; done
+            exit 1
+          fi
+
+  # Out-of-circuit Rust kernel typecheck — the same kernel as the zkVM guest, but
+  # run out-of-circuit and in parallel (`--workers` defaults to the core count),
+  # so far faster than proving. Checks the whole env via `ix check --anon`,
+  # tracking throughput (constants/sec), wall time, and peak RAM. Reuses the
+  # compile job's cached `.ixe` and the staged `ix` binary — no recompile.
+  ooc-check:
+    needs: compile
+    runs-on: warp-ubuntu-latest-x64-32x
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        bench: [InitStd, Mathlib]
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          fetch-tags: true   # bencher-track reads the bencher-thresholds-reset tag
+      - uses: actions/cache/restore@v5
+        with:
+          path: ~/.local/bin
+          key: bench-bins-${{ github.sha }}
+      - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
+      # Provision the toolchain so `ix` finds libleanshared (no package build).
+      - uses: leanprover/lean-action@v1
+        with:
+          auto-config: false
+          build: false
+          use-github-cache: false
+      - name: Install GNU time
+        run: |
+          sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true
+          sudo apt-get update && sudo apt-get install -y time
+      - uses: actions/cache/restore@v5
+        with:
+          path: |
+            ${{ matrix.bench }}.ixe
+            zkshards-${{ matrix.bench }}
+          key: bench-ixe-${{ github.sha }}-${{ matrix.bench }}
+          fail-on-cache-miss: true
+      # run.sh ooc runs `ix check --anon` (whole env, parallel) and emits the
+      # neutral { <env>: { constants, check-time, throughput, peak-rss } } — same
+      # path as the !benchmark ooc backend.
+      - name: Run out-of-circuit kernel check
+        env:
+          REUSE_IXE: "1"
+        run: |
+          # Whole env (keyed by env) + the primary constants checked full-closure
+          # (keyed by constant) for an apples-to-apples baseline next to zisk/sp1.
+          benv="${{ matrix.bench }}"; benv="${benv,}"
+          python3 .github/scripts/bench.py manifest \
+            --csv Benchmarks/Vectors.csv --env "$benv" --mode execute --primary --out names.txt
+          bash .github/scripts/run.sh . "${{ matrix.bench }}" ooc execute names.txt neutral.json
+          # neutral → Bencher Metric Format (see bench.py bmf).
+          python3 .github/scripts/bench.py bmf --in neutral.json --out bench.json
+          cat bench.json
+      # constants is deterministic → pinned (0/0); check-time / throughput /
+      # peak-rss are noisy parallel wall-clock → percentage bounds.
+      - uses: ./.github/actions/bencher-track
+        with:
+          testbed: ooc-check-x64-32x
+          workload: ooc-check
+          file: bench.json
+          key: ${{ secrets.BENCHER_API_KEY }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          thresholds: |
+            --threshold-measure constants --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
+            --threshold-lower-boundary 0
+            --threshold-measure check-time --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
+            --threshold-measure throughput --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
+            --threshold-lower-boundary 0.10
+            --threshold-measure peak-rss --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
+      # See the prove job's twin step: reject → red X, after the clean upload.
+      - name: Fail on typecheck failures
+        run: |
+          failed=$(jq -r 'to_entries[] | select(.value.failed == true) | .key' neutral.json)
+          if [ -n "$failed" ]; then
+            for c in $failed; do echo "::error::$c FAILED TO TYPECHECK"; done
+            exit 1
+          fi
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
index a2a1823c..e44b4861 100644
--- a/.github/workflows/bench-pr.yml
+++ b/.github/workflows/bench-pr.yml
@@ -1,4 +1,25 @@
-# Creates a PR benchmark comment with a comparison to main
+# `!benchmark` PR command: run the curated constant set (Benchmarks/Vectors.csv)
+# through chosen prover backend(s) and post a main-vs-PR comparison table.
+#
+#   !benchmark ([aiur] [zisk] [sp1] [ooc] [compile] | all) [execute]
+#     (sp1 is temporarily disabled: its install step below is commented out,
+#      so an sp1 cell fails at the install-less execute step)
+#   BENCH_ENVS=initStd,mathlib     # which compiled envs (default initStd)
+#   BENCH_FULL=1                   # run the full curated set, not just primary
+#   BENCH_TIER=cheap|heavy|all     # tier override (default: all)
+#   BENCH_SHARD=1                  # restrict to the multi-shard target constants
+#   RUST_LOG=info                  # passthrough env (allowlisted)
+#
+# Mode is chosen per backend: `aiur` runs `prove` by default (its report also
+# carries the execute-side columns `fft-cost` / `execute-time`); `zisk` / `sp1`
+# / `ooc` run `execute`; `compile` runs `ix compile <env>.lean → <env>.ixe`
+# (the same job bench-main.yml uploads under testbed `ix-compile-*`). The
+# optional bare `execute` token flips `aiur` to execute-only (skips Phase 2);
+# on the other backends it's a no-op. main's numbers come from bencher.dev;
+# the workflow re-runs the base SHA locally only for what bencher can't supply:
+# the full set when the SHA isn't ingested yet (freshly-pushed main whose CI is
+# still running), or just the missing names when the PR's Vectors.csv selects
+# constants main was never benched on (constants the PR itself adds).
 name: Benchmark pull requests
 
 on:
@@ -11,145 +32,419 @@ permissions:
   pull-requests: read
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  group: ${{ github.workflow }}-${{ (contains(github.event.comment.body, '!benchmark') && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && (github.event.issue.number || github.event.pull_request.number)) || github.run_id }}  # only actionable !benchmark comments share the per-PR group; any other comment gets a unique group so it cannot cancel a running benchmark
   cancel-in-progress: true
 
 jobs:
   setup:
-    name: Comparative PR benchmark comment
-    if:
-      github.event.issue.pull_request
-      && github.event.issue.state == 'open'
-      && (contains(github.event.comment.body, '!benchmark'))
-      && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER')
+    name: Parse !benchmark comment
+    if: >-
+      github.event.issue.pull_request &&
+      github.event.issue.state == 'open' &&
+      contains(github.event.comment.body, '!benchmark') &&
+      (github.event.comment.author_association == 'MEMBER' ||
+       github.event.comment.author_association == 'OWNER')
     runs-on: ubuntu-latest
     outputs:
-      benches: ${{ steps.bench-params.outputs.benches }}
-      env-vars: ${{ steps.bench-params.outputs.env-vars }}
+      matrix: ${{ steps.parse.outputs.matrix }}
+      tier: ${{ steps.parse.outputs.tier }}
+      shard: ${{ steps.parse.outputs.shard }}
+      full: ${{ steps.parse.outputs.full }}
+      passthrough-env: ${{ steps.parse.outputs.passthrough-env }}
+      config-summary: ${{ steps.parse.outputs.config-summary }}
+      base-sha: ${{ steps.shas.outputs.base }}
+      head-sha: ${{ steps.shas.outputs.head }}
     steps:
       - uses: actions/checkout@v6
-      - name: Parse PR comment body
-        id: bench-params
+      # issue_comment exposes the PR's base/head only via this action; the
+      # pull_request event carries them on the payload directly.
+      - if: github.event_name == 'issue_comment'
+        uses: xt0rted/pull-request-comment-branch@v3
+        id: comment-branch
+      - name: Resolve base/head SHAs
+        id: shas
         run: |
-          # Parse `issue_comment` body
-          printf '${{ github.event.comment.body }}' > comment.txt
-          BENCH_COMMAND=$(head -n 1 comment.txt | tr -d '\r')
-          echo "$BENCH_COMMAND"
-
-          BENCHES=$(echo $BENCH_COMMAND | awk -F'!benchmark ' '{ print $2 }')
-          # Set default benches to run if none specified
-          BENCHES=${BENCHES:-"bench-aiur"}
-          echo "BENCHES:"
-          echo "$BENCHES"
-          JSON=$(echo $BENCHES | jq -R -c 'split(" ")')
-
-          echo "JSON:"
-          echo "$JSON"
-
-          echo "benches=$JSON" | tee -a $GITHUB_OUTPUT
+          if [ "${{ github.event_name }}" = pull_request ]; then
+            echo "base=${{ github.event.pull_request.base.sha }}" >> "$GITHUB_OUTPUT"
+            echo "head=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT"
+          else
+            echo "base=${{ steps.comment-branch.outputs.base_sha }}" >> "$GITHUB_OUTPUT"
+            echo "head=${{ steps.comment-branch.outputs.head_sha }}" >> "$GITHUB_OUTPUT"
+          fi
+      # Parse the !benchmark command from an env var (never inline-interpolated).
+      # The allowlist drops anything that isn't a known flag/env key; an empty
+      # body (pull_request) yields the parser defaults.
+      - name: Parse command
+        id: parse
+        env:
+          COMMENT_BODY: ${{ github.event.comment.body }}
+        run: python3 .github/scripts/bench.py parse
 
-          # Can't persist env vars between jobs, so we pass them as an output and set them in the next job
-          echo "env-vars=$(tail -n +2 comment.txt | tr -d '\r' | tr '\n' ' ')" | tee -a $GITHUB_OUTPUT
+  # Build the PR's `ix` + `bench-typecheck` once (they embed the IxVM kernel
+  # and the Aiur prover), stage under ~/.local/bin, and cache by head SHA —
+  # the matrix cells restore instead of re-running the full Lean build per
+  # cell, and re-running !benchmark on the same commit skips the build
+  # entirely. ubuntu-latest mirrors bench-main.yml's build job so PR binaries
+  # carry the same instruction-set provenance as the binaries behind bencher's
+  # main-side numbers.
+  build:
+    needs: setup
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          ref: ${{ needs.setup.outputs.head-sha }}
+      - id: bins
+        uses: actions/cache/restore@v5
+        with:
+          path: ~/.local/bin
+          key: bench-bins-${{ needs.setup.outputs.head-sha }}
+      - if: steps.bins.outputs.cache-hit != 'true'
+        uses: actions-rust-lang/setup-rust-toolchain@v1
+      # `.cargo/config.toml` sets `-Ctarget-cpu=native`; log the build CPU so a
+      # benchmark shift can be traced to an instruction-set change.
+      - name: Log build CPU
+        if: steps.bins.outputs.cache-hit != 'true'
+        run: |
+          lscpu
+          grep -qw avx512f /proc/cpuinfo \
+            && echo "AVX-512F: present (compiled into the binary)" \
+            || echo "AVX-512F: absent"
+      - if: steps.bins.outputs.cache-hit != 'true'
+        uses: leanprover/lean-action@v1
+        with:
+          auto-config: false
+          build: true
+          build-args: "ix bench-typecheck"
+          use-github-cache: false
+      - if: steps.bins.outputs.cache-hit != 'true'
+        run: |
+          mkdir -p ~/.local/bin
+          cp .lake/build/bin/ix .lake/build/bin/bench-typecheck ~/.local/bin/
+          chmod +x ~/.local/bin/ix ~/.local/bin/bench-typecheck
+      - if: steps.bins.outputs.cache-hit != 'true'
+        uses: actions/cache/save@v5
+        with:
+          path: ~/.local/bin
+          key: bench-bins-${{ needs.setup.outputs.head-sha }}
 
   benchmark:
-    needs: [ setup ]
-    runs-on: warp-ubuntu-latest-x64-16x
+    needs: [setup, build]
+    runs-on: ${{ matrix.cell.runner }}
+    # Wide enough for the zisk cell's worst case: per-constant loop + PR-side
+    # `ix profile` + the env-sharded execute (own 60m cap inside run.sh).
+    timeout-minutes: 180
     strategy:
+      fail-fast: false
       matrix:
-        # Runs a job for each benchmark specified in the `issue_comment` input
-        bench: ${{ fromJSON(needs.setup.outputs.benches) }}
+        cell: ${{ fromJSON(needs.setup.outputs.matrix) }}
+    env:
+      BACKEND: ${{ matrix.cell.backend }}
+      BENV: ${{ matrix.cell.env }}
+      MODE: ${{ matrix.cell.mode }}
+      LABEL: ${{ matrix.cell.label }}
+      BASE_SHA: ${{ needs.setup.outputs.base-sha }}
+      HEAD_SHA: ${{ needs.setup.outputs.head-sha }}
+      TIER: ${{ needs.setup.outputs.tier }}
+      SHARD: ${{ needs.setup.outputs.shard }}
+      FULL: ${{ needs.setup.outputs.full }}
     steps:
-      - name: Set env vars
-        run: |
-          # Overrides default env vars with those specified in the `issue_comment` input if identically named
-          for var in ${{ needs.setup.outputs.env-vars }}
-          do
-            echo "$var" | tee -a $GITHUB_ENV
-          done
-      - uses: actions/checkout@v6
-        # Get base branch of the PR
-      - uses: xt0rted/pull-request-comment-branch@v3
-        id: comment-branch
-      - name: Checkout base branch
+      # PR checked out at the workspace root so the local install actions and the
+      # helper scripts resolve; base (bencher-miss fallback only) goes under base/.
+      - name: Checkout PR
         uses: actions/checkout@v6
         with:
-          ref: ${{ steps.comment-branch.outputs.base_sha }}
-          path : ${{ github.workspace }}/base
-      - name: Run `lake build` on base branch
+          ref: ${{ env.HEAD_SHA }}
+      - name: Apply passthrough env
+        run: |
+          while IFS= read -r line; do
+            [ -n "$line" ] && printf '%s\n' "$line" >> "$GITHUB_ENV"
+          done <<'PTENV'
+          ${{ needs.setup.outputs.passthrough-env }}
+          PTENV
+      # Select the constants for this cell → names.txt. Defaults to the primary
+      # subset; BENCH_FULL=1 (→ full=1) runs the whole curated set. The `compile`
+      # backend short-circuits this in bench.py — its "benchmark name" is the
+      # CamelCase env slug, so names.txt gets one line and the CSV is ignored.
+      - name: Select constants from Benchmarks/Vectors.csv
+        id: man
+        run: |
+          set -o pipefail; PRIMARY=--primary; [ "$FULL" = 1 ] && PRIMARY=   # pipefail: the default `run` shell is `bash -e` only, so `tee` would mask a manifest failure
+          python3 .github/scripts/bench.py manifest \
+            --csv Benchmarks/Vectors.csv --env "$BENV" --mode "$MODE" \
+            --backend "$BACKEND" \
+            --tier "$TIER" --shard "$SHARD" $PRIMARY --out "$GITHUB_WORKSPACE/names.txt" \
+            --heavy-out "$GITHUB_WORKSPACE/heavy.txt" \
+            | tee -a "$GITHUB_OUTPUT"
+          # Heavy-tier names run closure-sharded on zisk (run.sh cuts the
+          # artifacts with `ix extract` → `ix profile` → `ix shard` when
+          # they aren't already cached).
+          echo "ZISK_HEAVY_NAMES=$GITHUB_WORKSPACE/heavy.txt" >> "$GITHUB_ENV"
+
+      # Restore the once-built PR binaries (see the build job) and stage them
+      # into .bins/pr — ~/.local/bin is reused below for the base side's cache
+      # restore, so each side keeps its own directory and PATH. run.sh resolves
+      # tools in-tree first, then PATH, so the staged dir Just Works.
+      - name: Restore PR binaries
+        uses: actions/cache/restore@v5
+        with:
+          path: ~/.local/bin
+          key: bench-bins-${{ env.HEAD_SHA }}
+          fail-on-cache-miss: true
+      - run: |
+          mkdir -p .bins/pr
+          mv ~/.local/bin/ix ~/.local/bin/bench-typecheck .bins/pr/
+      # Toolchain provisioning only (no package build): the restored binaries
+      # link against the toolchain's libleanshared. Mathlib olean cache only
+      # for the mathlib env (its `ix compile` needs the oleans).
+      - name: Provision Lean toolchain
         uses: leanprover/lean-action@v1
         with:
-          lake-package-directory: ${{ github.workspace }}/base
-          test: false
-      - name: Run bench on base branch
+          lake-package-directory: .
+          auto-config: false
+          build: false
+          use-github-cache: false
+          use-mathlib-cache: ${{ matrix.cell.env == 'mathlib' && 'true' || 'false' }}
+      # The compiled env is identical across every cell of the same PR commit —
+      # cache it so only the first cell pays the `ix compile`. (The `compile`
+      # backend ignores REUSE_IXE by design: it measures the compile itself.)
+      - name: Restore PR .ixe
+        id: pr-ixe
+        uses: actions/cache/restore@v5
+        with:
+          path: ${{ matrix.cell.env }}.ixe
+          key: bench-pr-ixe-${{ env.HEAD_SHA }}-${{ matrix.cell.env }}
+      # zkVM cells additionally need the Rust toolchain + the backend's toolchain
+      # and system deps (the shared composite install actions).
+      - name: Set up zkVM Rust toolchain
+        if: matrix.cell.backend == 'zisk' || matrix.cell.backend == 'sp1'
+        uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          cache-workspaces: ${{ matrix.cell.backend }}
+      # sp1 is temporarily disabled (execute too slow for CI); uncomment to
+      # re-enable, along with bench-main.yml's sp1 matrix cell.
+      # - name: Install SP1
+      #   if: matrix.cell.backend == 'sp1'
+      #   uses: ./.github/actions/install-sp1
+      - name: Install Zisk
+        if: matrix.cell.backend == 'zisk'
+        uses: ./.github/actions/install-zisk
+
+      # ---------- main side ----------
+      # Try bencher.dev first (bench-main.yml has uploaded main's numbers).
+      # fetch-main's exit codes are load-bearing: 3 = transient (base SHA not
+      # ingested yet) → fall back to a local base run of the full set; anything
+      # else (2 = backend/mode has no main testbed — a BACKEND_TABLE /
+      # bench-main.yml drift) is a permanent misconfiguration that a local
+      # rebuild can never fix, so fail the cell loudly instead of silently
+      # paying the fallback on every future run.
+      #
+      # Partial miss (exit 0 + non-empty missing.txt): the PR's Vectors.csv
+      # selects names main was never benched on — typically constants the PR
+      # itself adds. Bencher's numbers stand for the covered set; the base
+      # checkout runs JUST the missing names and merges below, so a brand-new
+      # constant still gets a real main-vs-PR delta on its first !benchmark.
+      # `run-base` gates every base-side step; `base-names` picks the file the
+      # base run measures (missing.txt = partial, names.txt = full fallback).
+      - name: Fetch main from bencher
+        id: bencher
         run: |
-          if $(lake run get-exe-targets | grep -q ${{ matrix.bench }}); then
-            lake exe ${{ matrix.bench }}
-          else
-            echo "No matching bench target found on base branch"
-          fi
-        working-directory: ${{ github.workspace }}/base
-      - name: Checkout PR branch
+          set +e
+          python3 .github/scripts/bench.py fetch-main \
+            --sha "$BASE_SHA" --backend "$BACKEND" --mode "$MODE" --env "$BENV" \
+            --names "$GITHUB_WORKSPACE/names.txt" \
+            --missing-out "$GITHUB_WORKSPACE/missing.txt" \
+            --out "$GITHUB_WORKSPACE/main.json"
+          rc=$?
+          set -e
+          case $rc in
+            0) if [ -s "$GITHUB_WORKSPACE/missing.txt" ]; then
+                 echo "source=bencher + base run ($(wc -l < "$GITHUB_WORKSPACE/missing.txt") new)" >> "$GITHUB_OUTPUT"
+                 echo "run-base=true" >> "$GITHUB_OUTPUT"
+                 echo "base-names=missing.txt" >> "$GITHUB_OUTPUT"
+               else
+                 echo "source=bencher" >> "$GITHUB_OUTPUT"
+                 echo "run-base=false" >> "$GITHUB_OUTPUT"
+               fi ;;
+            3) echo "source=ran" >> "$GITHUB_OUTPUT"
+               echo "run-base=true" >> "$GITHUB_OUTPUT"
+               echo "base-names=names.txt" >> "$GITHUB_OUTPUT" ;;
+            *) echo "::error::fetch-main: permanent config error (exit $rc) — check BACKEND_TABLE in bench.py vs bench-main.yml testbeds"; exit "$rc" ;;
+          esac
+      - name: Checkout base (bencher data missing or partial)
+        if: steps.bencher.outputs.run-base == 'true'
         uses: actions/checkout@v6
         with:
-          path: ${{ github.workspace }}/pr
-          ref: ${{ steps.comment-branch.outputs.head_sha }}
-      - name: Run `lake build` on PR branch
+          ref: ${{ env.BASE_SHA }}
+          path: base
+      # bench-main.yml's build/compile jobs already cached the base SHA's
+      # binaries and `.ixe` on the push to main — restore both before paying
+      # for a from-scratch base build. The `.ixe` cache key/path use the
+      # CamelCase env slug (bench-main's matrix.bench).
+      - name: Compute env slug
+        if: steps.bencher.outputs.run-base == 'true'
+        id: envcc
+        run: |
+          benv="$BENV"
+          echo "cc=${benv^}" >> "$GITHUB_OUTPUT"
+      - name: Restore base binaries (bench-main build cache)
+        if: steps.bencher.outputs.run-base == 'true'
+        id: base-bins
+        uses: actions/cache/restore@v5
+        with:
+          path: ~/.local/bin
+          key: bench-bins-${{ env.BASE_SHA }}
+      - name: Restore base .ixe (bench-main compile cache)
+        if: steps.bencher.outputs.run-base == 'true'
+        id: base-ixe
+        uses: actions/cache/restore@v5
+        with:
+          # Path list must match bench-main's compile-job save (actions/cache
+          # versions entries by path list).
+          path: |
+            ${{ steps.envcc.outputs.cc }}.ixe
+            zkshards-${{ steps.envcc.outputs.cc }}
+          key: bench-ixe-${{ env.BASE_SHA }}-${{ steps.envcc.outputs.cc }}
+      # Cached base binaries are usable only when the two `lean-toolchain`
+      # files are identical (plain `cmp`), and — for the mathlib env — only
+      # when the `.ixe` was also restored (otherwise base's `ix compile` needs
+      # mathlib oleans that only the full build fetches).
+      - name: Resolve base binaries
+        if: steps.bencher.outputs.run-base == 'true'
+        id: base-src
+        run: |
+          cached=false
+          if [ "${{ steps.base-bins.outputs.cache-hit }}" = true ] \
+             && cmp -s lean-toolchain base/lean-toolchain; then
+            if [ "$BENV" != mathlib ] || [ "${{ steps.base-ixe.outputs.cache-hit }}" = true ]; then
+              mkdir -p .bins/base
+              mv ~/.local/bin/ix ~/.local/bin/bench-typecheck .bins/base/ 2>/dev/null || true
+              [ -x .bins/base/ix ] && [ -x .bins/base/bench-typecheck ] && cached=true
+            fi
+          fi
+          echo "cached=$cached" >> "$GITHUB_OUTPUT"
+          echo "base binaries: $([ "$cached" = true ] && echo restored from bench-main cache || echo building from source)"
+      - name: Build base (ix, bench-typecheck)
+        if: steps.bencher.outputs.run-base == 'true' && steps.base-src.outputs.cached != 'true'
         uses: leanprover/lean-action@v1
         with:
-          lake-package-directory: ${{ github.workspace }}/pr
-          test: false
-      - name: Copy base benchmarks into PR dir for comparison
+          lake-package-directory: base
+          auto-config: false
+          build: true
+          build-args: "ix bench-typecheck"
+          use-github-cache: false
+          use-mathlib-cache: ${{ matrix.cell.env == 'mathlib' && 'true' || 'false' }}
+      # NOTE: this runs the PR's run.sh against base-built binaries. When a PR
+      # changes the benchmark CLIs themselves, the base binaries reject the new
+      # flags and every constant drops — compare then renders the loud
+      # "main produced no results" note instead of a silent all-n/a table.
+      - name: Run backend on base → merge into main.json
+        if: steps.bencher.outputs.run-base == 'true'
         run: |
-          BENCH_DIR_PR=${{ github.workspace }}/pr/.lake/benches
-          BENCH_DIR_BASE=${{ github.workspace }}/base/.lake/benches
-          mkdir -p $BENCH_DIR_PR
-          [ -d "$BENCH_DIR_BASE" ] && cp -r $BENCH_DIR_BASE/. $BENCH_DIR_PR/
-          ls $BENCH_DIR_PR
-      - name: Run bench on PR branch and generate comparison report
+          if [ "${{ steps.base-ixe.outputs.cache-hit }}" = true ]; then
+            mv "${{ steps.envcc.outputs.cc }}.ixe" "base/$BENV.ixe"
+            # bench-main's pre-cut closure shards ride the same cache entry;
+            # in place, run.sh skips re-cutting them for the base tree.
+            mv "zkshards-${{ steps.envcc.outputs.cc }}" "base/zkshards-$BENV" 2>/dev/null || true
+            export REUSE_IXE=1
+          fi
+          if [ "${{ steps.base-src.outputs.cached }}" = true ]; then
+            export PATH="$PWD/.bins/base:$PATH"
+          else
+            export PATH="$PWD/base/.lake/build/bin:$PATH"
+          fi
+          bash .github/scripts/run.sh base "$BENV" "$BACKEND" "$MODE" \
+            "$GITHUB_WORKSPACE/${{ steps.bencher.outputs.base-names }}" \
+            "$GITHUB_WORKSPACE/base.json"
+          # Partial fallback: bencher already supplied main.json for the
+          # covered names; fill only the gaps from the base run. Bencher wins
+          # any overlap (e.g. ooc's always-emitted whole-env row) — it is the
+          # canonical main-side number. Full fallback: main.json doesn't exist
+          # (fetch-main exits 3 before writing), so the base run IS main.json.
+          if [ -s "$GITHUB_WORKSPACE/main.json" ]; then
+            jq -s '.[0] + .[1]' "$GITHUB_WORKSPACE/base.json" "$GITHUB_WORKSPACE/main.json" \
+              > "$GITHUB_WORKSPACE/main.merged" \
+              && mv "$GITHUB_WORKSPACE/main.merged" "$GITHUB_WORKSPACE/main.json"
+          else
+            mv "$GITHUB_WORKSPACE/base.json" "$GITHUB_WORKSPACE/main.json"
+          fi
+
+      # ---------- PR side ----------
+      - name: Run backend on PR → pr.json
+        env:
+          REUSE_IXE: ${{ steps.pr-ixe.outputs.cache-hit == 'true' && '1' || '0' }}
+        run: |
+          export PATH="$PWD/.bins/pr:$PATH"
+          bash .github/scripts/run.sh . "$BENV" "$BACKEND" "$MODE" \
+            "$GITHUB_WORKSPACE/names.txt" "$GITHUB_WORKSPACE/pr.json"
+      # First cell to compile the PR env publishes it for the others (racing
+      # saves are fine — the first wins, the rest fail gracefully).
+      - name: Save PR .ixe
+        if: steps.pr-ixe.outputs.cache-hit != 'true'
+        uses: actions/cache/save@v5
+        with:
+          path: ${{ matrix.cell.env }}.ixe
+          key: bench-pr-ixe-${{ env.HEAD_SHA }}-${{ matrix.cell.env }}
+
+      # ---------- compare ----------
+      - name: Build comparison table
         run: |
-          BENCH_REPORT=1 lake exe ${{ matrix.bench }}
-        working-directory: ${{ github.workspace }}/pr
-      - name: Get env for PR body
+          mkdir -p out
+          python3 .github/scripts/bench.py compare \
+            --main main.json --pr pr.json --out "out/table-$LABEL.md" \
+            --backend "$BACKEND" --env "$BENV" --mode "$MODE" \
+            --count "${{ steps.man.outputs.count }}" \
+            --main-source "${{ steps.bencher.outputs.source }}"
+          cat "out/table-$LABEL.md"
+
+      - name: Upload table
         if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: table-${{ env.LABEL }}
+          path: out/table-${{ env.LABEL }}.md
+          if-no-files-found: warn
+      # A PR-side typecheck failure is a correctness regression: fail the
+      # cell LOUDLY (red X on the PR) — after the table upload, so the
+      # comment still posts with the constant's ❌ row and note.
+      - name: Fail on PR-side typecheck failures
         run: |
-          SHORT_SHA_PR=$(git rev-parse --short HEAD)
-          REPO_URL=${{ github.server_url }}/${{ github.repository }}
-          echo "COMMIT_LINK=[\`$SHORT_SHA_PR\`]($REPO_URL/commit/${{ steps.comment-branch.outputs.head_sha }})" | tee -a $GITHUB_ENV
-          echo "WORKFLOW_LINK=[Workflow logs]($REPO_URL/actions/runs/${{ github.run_id }})" | tee -a $GITHUB_ENV
-        working-directory: ${{ github.workspace }}/pr
+          failed=$(jq -r 'to_entries[] | select(.value.failed == true) | .key' "$GITHUB_WORKSPACE/pr.json")
+          if [ -n "$failed" ]; then
+            while IFS= read -r c; do echo "::error::$c FAILED TO TYPECHECK on the PR side"; done <<< "$failed"  # one name per line: no word-splitting or glob expansion of names containing ?, * or spaces
+            exit 1
+          fi
+
+  comment:
+    needs: [setup, benchmark]
+    if: always() && needs.setup.result == 'success'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - name: Download tables
+        uses: actions/download-artifact@v4
+        with:
+          path: tables
+          pattern: table-*
+          merge-multiple: true
+      - name: Build comment body
+        env:
+          SUMMARY: ${{ needs.setup.outputs.config-summary }}
+        run: |
+          python3 .github/scripts/bench.py comment \
+            --tables tables --summary "$SUMMARY" \
+            --head "${{ needs.setup.outputs.head-sha }}" \
+            --repo-url "${{ github.server_url }}/${{ github.repository }}" \
+            --run-id "${{ github.run_id }}" --out comment-body.md
       - name: Generate token to write PR comment
-        uses: actions/create-github-app-token@v3
-        if: always()
         id: app-token
+        uses: actions/create-github-app-token@v3
         with:
           app-id: ${{ secrets.TOKEN_APP_ID }}
           private-key: ${{ secrets.TOKEN_APP_PRIVATE_KEY }}
-      - name: Build benchmark comment body
-        if: success()
-        run: |
-          {
-            echo '## Benchmark for `${{ matrix.bench }}` at ${{ env.COMMIT_LINK }}';
-            echo "";
-            for file in .lake/benches/*/report.md; do
-              [ -f "$file" ] && cat "$file" && echo ""
-            done
-            echo "${{ env.WORKFLOW_LINK }}";
-          } > ${{ github.workspace }}/comment-body.md
-        working-directory: ${{ github.workspace }}/pr
-      - name: Comment on successful run
-        if: success()
+      - name: Post / update comment
         uses: peter-evans/create-or-update-comment@v5
         with:
           token: ${{ steps.app-token.outputs.token }}
-          issue-number: ${{ github.event.issue.number }}
-          body-path: 'comment-body.md'
-      - name: Comment on failing run
-        if: failure()
-        uses: peter-evans/create-or-update-comment@v5
-        with:
-          token: ${{ steps.app-token.outputs.token }}
-          issue-number: ${{ github.event.issue.number }}
-          body: |
-            ## Benchmark for `${{ matrix.bench }}` at ${{ env.COMMIT_LINK }} failed :x:
-
-            [Workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
+          issue-number: ${{ github.event.issue.number || github.event.pull_request.number }}
+          body-path: comment-body.md
diff --git a/.github/workflows/bencher-thresholds-reset.yml b/.github/workflows/bencher-thresholds-reset.yml
index 37a0213c..f05db98a 100644
--- a/.github/workflows/bencher-thresholds-reset.yml
+++ b/.github/workflows/bencher-thresholds-reset.yml
@@ -17,8 +17,9 @@ name: Bencher thresholds reset
 #     `bencher-thresholds-reset:<token>` label on the PR, whatever added it — so a
 #     Triage+ collaborator can queue a reset by applying the label directly, and
 #     cancel by removing it before merge. Naming convention: one label per token,
-#     `bencher-thresholds-reset:<token>` where <token> is `ix-compile`, `aiur`, or
-#     `all` (the merge step expands an `all` label into every workload). Labeling
+#     `bencher-thresholds-reset:<token>` where <token> is `ix-compile`,
+#     `aiur-check`, `zisk-check`, `sp1-check`, `ooc-check`, or `all` (the merge
+#     step expands an `all` label into every workload). Labeling
 #     requires Triage+, so PR authors from forks cannot self-queue a reset. The
 #     label shares the command/workflow name; the git tag it moves is the same
 #     stem with a dash: `bencher-thresholds-reset-<workload>`.
@@ -36,7 +37,7 @@ on:
         description: Workload baseline to reset
         required: true
         type: choice
-        options: [ix-compile, aiur, all]
+        options: [ix-compile, aiur-check, zisk-check, sp1-check, ooc-check, all]
       sha:
         description: "Commit to anchor to (default: HEAD)"
         required: false
@@ -65,7 +66,7 @@ jobs:
       MERGE_SHA: ${{ github.event.pull_request.merge_commit_sha }}
     steps:
       - run: |
-          valid="ix-compile aiur"
+          valid="ix-compile aiur-check zisk-check sp1-check ooc-check"
           if [ "$EVENT" = workflow_dispatch ]; then
             # Reset the chosen workload(s) at the given commit; no PR scan.
             sha="${INPUT_SHA:-$HEAD_SHA}"
@@ -119,7 +120,7 @@ jobs:
       - run: |
           # Accepted command tokens — applied verbatim as labels (incl. `all`,
           # which the merge job expands into every workload).
-          accepted="ix-compile aiur all"
+          accepted="ix-compile aiur-check zisk-check sp1-check ooc-check all"
           # Parse the workload token(s) after the command, lowercased.
           workloads=$(printf '%s' "$BODY" \
             | grep -oiE '!bencher-thresholds-reset[[:space:]]+[a-z0-9 -]+' \
@@ -142,5 +143,5 @@ jobs:
               --body "♻️ Baseline reset queued for:$ok — will anchor to the merge commit when this PR merges."
           else
             gh pr comment "$PR" --repo "$REPO" \
-              --body "⚠️ Reset command matched no known workload (expected \`ix-compile\`, \`aiur\`, or \`all\`). Nothing will reset on merge."
+              --body "⚠️ Reset command matched no known workload (expected \`ix-compile\`, \`aiur-check\`, \`zisk-check\`, \`sp1-check\`, \`ooc-check\`, or \`all\`). Nothing will reset on merge."
           fi
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 68f5d451..4d368c39 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -62,3 +62,56 @@ jobs:
         uses: EmbarkStudios/cargo-deny-action@v2
         with:
           rust-version: ${{ env.RUST_VERSION }}
+
+  # zkVM host build gate: do the Zisk/SP1 hosts (and their guest ELFs, via
+  # each workspace's build.rs) still compile? rust-test doesn't build these
+  # workspaces (special toolchains), and bench-main.yml's zkvm-execute job —
+  # which runs real executions — tolerates per-constant failures by design
+  # (dropped rows, OOM sentinels), so it never turns red on a breakage. These
+  # jobs are the red-X signal, kept cheap: build-only, no execution, and
+  # therefore no Zisk proving key (the key is loaded at runtime by
+  # `client.setup()`, never at build time — skipping it saves a ~3 GB download
+  # + const-tree regeneration per run). SP1 and Zisk build as independent jobs
+  # so they parallelize; each installs only its own toolchain via sp1up /
+  # ziskup (prebuilt binaries). The apt list inside the install actions is the
+  # shared superset both backends need (proofman's C++ links
+  # OpenMPI/OpenMP/GMP/…; SP1's host crates need pkg-config + libssl-dev).
+  sp1-build:
+    name: SP1 host build
+    runs-on: warp-ubuntu-latest-x64-16x
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          cache-workspaces: sp1
+      - uses: ./.github/actions/install-sp1
+      # The precompile-aware SP1 runner-binary is auto-built from the fork git
+      # dep by `sp1-core-executor-runner`'s build script — no manual override.
+      # `cargo test` reuses the build's dep graph and runs the host's unit
+      # tests — the clap surface run.sh drives (`--consts` comma-splitting,
+      # `--consts-file` union/dedup), so a CLI regression turns this gate red too.
+      - name: Build + test sp1-host (guest ELF via build.rs)
+        run: |
+          cd sp1
+          cargo build --release --bin sp1-host
+          cargo test --release --bin sp1-host
+
+  zisk-build:
+    name: Zisk host build
+    runs-on: warp-ubuntu-latest-x64-16x
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          cache-workspaces: zisk
+      - uses: ./.github/actions/install-zisk
+        with:
+          proving-key: false
+      # Unit tests: the clap surface run.sh drives, plus the closure auditor
+      # (closure_detects_missing_dep self-skips without an IX_TEST_IXE
+      # fixture — this gate has no Lean build to produce one).
+      - name: Build + test zisk-host (guest ELF via build.rs)
+        run: |
+          cd zisk
+          cargo build --release --bin zisk-host
+          cargo test --release --bin zisk-host
diff --git a/.github/workflows/ignored.yml b/.github/workflows/ignored.yml
index 5e92186a..99751746 100644
--- a/.github/workflows/ignored.yml
+++ b/.github/workflows/ignored.yml
@@ -8,9 +8,9 @@ on:
 permissions:
   contents: read
 
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
+# No concurrency group: push-to-main and manual dispatch only — every merged
+# commit runs the extended tests; a later merge must never cancel an in-flight
+# run.
 
 jobs:
   ignored-test:
diff --git a/.github/workflows/riscv-bench.yml b/.github/workflows/riscv-bench.yml
deleted file mode 100644
index d9ec22b7..00000000
--- a/.github/workflows/riscv-bench.yml
+++ /dev/null
@@ -1,162 +0,0 @@
-name: RISC-V bench
-
-# zkVM execute is ~10+ min (toolchain installs + host builds + emulation), so it
-# is kept off the per-PR path: this workflow runs only on pushes to main (and on
-# manual dispatch). It compiles the `minimal.ixe` fixture, then executes the
-# kernel typecheck of one constant in the SP1 and Zisk VMs — in parallel jobs.
-on:
-  push:
-    branches: main
-  workflow_dispatch:
-
-permissions:
-  contents: read
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  # Compile a tiny env once (ix is already built here) and hand it to the zkVM
-  # execute jobs via artifact, so those jobs stay Lean-free.
-  compile-fixture:
-    name: Compile zkVM fixture (minimal.ixe)
-    runs-on: warp-ubuntu-latest-x64-16x
-    steps:
-      - uses: actions/checkout@v6
-      - uses: actions-rust-lang/setup-rust-toolchain@v1
-      - uses: leanprover/lean-action@v1
-        with:
-          build-args: "--wfail -v"
-      - name: Compile zkVM test fixture (minimal.ixe)
-        run: lake exe ix compile Tests/MinimalDefs.lean --out minimal.ixe
-      - uses: actions/upload-artifact@v4
-        with:
-          name: minimal-ixe
-          path: minimal.ixe
-          if-no-files-found: error
-
-  # Execute the kernel typecheck of the `minimal.ixe` fixture natively (no Nix,
-  # no proof, no GPU). SP1 and Zisk run as independent jobs so they parallelize;
-  # each installs only its own toolchain via sp1up / ziskup (prebuilt binaries)
-  # and downloads the shared fixture. minimal.ixe carries the full Init closure,
-  # so we scope execution with `--constant myReflEq --skip-deps`: that
-  # subject-only-typechecks just the named constant, trusting its Init
-  # dependencies as Claim assumptions, instead of typechecking all of Init (which
-  # never finishes in the emulator). Each host bails non-zero on any typecheck
-  # failure; we also assert the `failures: 0` line.
-  #
-  # The apt list is the shared superset both backends need: the ZisK book's full
-  # Ubuntu list (its prebuilt cargo-zisk and proofman's C++ link OpenMPI, OpenMP,
-  # GMP, nlohmann-json, nasm, secp256k1, …) plus pkg-config + libssl-dev for
-  # SP1's host crates (openssl/bindgen). The Nix shells provided all this; a bare
-  # runner doesn't. Must precede the toolchain install (it runs cargo-zisk).
-  sp1-execute:
-    name: SP1 zkVM Execute
-    needs: compile-fixture
-    runs-on: warp-ubuntu-latest-x64-16x
-    steps:
-      - uses: actions/checkout@v6
-      - uses: actions-rust-lang/setup-rust-toolchain@v1
-        with:
-          cache-workspaces: sp1
-      - name: Install system build deps
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y \
-            xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \
-            nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \
-            libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \
-            openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \
-            pkg-config libssl-dev
-      - uses: actions/download-artifact@v4
-        with:
-          name: minimal-ixe
-      - name: Install SP1 toolchain (sp1up, latest)
-        run: |
-          curl -L https://sp1up.succinct.xyz | bash
-          ~/.sp1/bin/sp1up
-          echo "$HOME/.sp1/bin" >> "$GITHUB_PATH"
-      # The precompile-aware SP1 runner-binary is auto-built from the fork git
-      # dep by `sp1-core-executor-runner`'s build script — no manual override.
-      - name: SP1 — execute minimal.ixe (assert failures == 0)
-        run: |
-          cd sp1
-          cargo run --bin sp1-host -- --execute --ixe ../minimal.ixe --constant myReflEq --skip-deps | tee only.txt
-          grep -qE "failures: 0\b" only.txt
-
-  zisk-execute:
-    name: Zisk zkVM Execute
-    needs: compile-fixture
-    runs-on: warp-ubuntu-latest-x64-16x
-    steps:
-      - uses: actions/checkout@v6
-      - uses: actions-rust-lang/setup-rust-toolchain@v1
-        with:
-          cache-workspaces: zisk
-      - name: Install system build deps
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y \
-            xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \
-            nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \
-            libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \
-            openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \
-            pkg-config libssl-dev
-      - uses: actions/download-artifact@v4
-        with:
-          name: minimal-ixe
-      - name: Install Zisk toolchain (ziskup, pinned v0.18.0)
-        # `--version 0.18.0` pins the toolchain to match our deps. Our host links
-        # the argumentcomputer/zisk `blake3-precompile` fork, which is based on
-        # v0.18.0 (its cargo-zisk has `check-setup`, used below to regenerate the
-        # key's const-trees). Without the pin, ziskup installs `releases/latest`,
-        # which resolves to upstream `v1.0.0-alpha` — a different circuit whose
-        # cargo-zisk dropped the `check-setup` subcommand, breaking the key step.
-        # `--cpu` picks the CPU build (no GPU on the runner) and `--nokey` skips
-        # ziskup's key install — both avoid its interactive /dev/tty prompts. We
-        # keep `--nokey` because the upstream `zisk-setup` bucket only carries the
-        # upstream circuit's key; our fork has a different circuit (extra Blake3f
-        # AIR), so we restore the fork-matching key from our own S3 in the next
-        # step. `--prefix $HOME/.zisk` pins the install where cargo-zisk's
-        # ZiskPaths fallback looks (the runner sets XDG_CONFIG_HOME, which would
-        # otherwise relocate it).
-        run: |
-          curl -L https://raw.githubusercontent.com/0xPolygonHermez/zisk/main/ziskup/install.sh \
-            | bash -s -- --cpu --nokey -y --version 0.18.0 --prefix "$HOME/.zisk"
-          echo "$HOME/.zisk/bin" >> "$GITHUB_PATH"
-      # Execute still needs a proving key present: zisk-host calls
-      # `client.setup()` (which the SDK runs before the execute branch), and that
-      # loads the circuit's const-tree files. We host the fork-matching key in a
-      # public S3 bucket WITHOUT the const-trees — exactly like Zisk's released
-      # `zisk-provingkey-*.tar.gz` on `storage.googleapis.com/zisk-setup` — and
-      # regenerate them here with `cargo-zisk check-setup -a`, which is how
-      # `ziskup` itself populates them. That keeps the artifact ~3 GB (gzip)
-      # instead of ~48 GB. The object name carries the fork rev so a circuit
-      # change can't silently reuse a stale key. Public bucket → plain curl, no
-      # AWS creds.
-      - name: Restore Zisk proving key (fork circuit) from S3
-        run: |
-          mkdir -p "$HOME/.zisk"
-          curl -fSL --retry 3 \
-            https://argument-zisk-setup.s3.amazonaws.com/zisk-provingkey-blake3-8f9e24d5-cpu.tar.gz \
-            -o /tmp/zisk-provingkey.tar.gz
-          tar -C "$HOME/.zisk" -xzf /tmp/zisk-provingkey.tar.gz
-          rm -f /tmp/zisk-provingkey.tar.gz
-          # Regenerate the const-tree files omitted from the artifact (CPU build,
-          # so no --gpu). This is the "may take a while" step ziskup prints.
-          cargo-zisk check-setup --proving-key "$HOME/.zisk/provingKey" -a
-      - name: Zisk — execute minimal.ixe (assert failures == 0)
-        run: |
-          cd zisk
-          # ZisK's ASM microservices mmap the ROM with MAP_LOCKED, which needs
-          # unlimited locked memory — the Zisk book's "Critical Memory
-          # Configuration" prescribes DefaultLimitMEMLOCK=infinity. The runner
-          # caps the memlock hard limit (so a bare `ulimit -l unlimited` can't
-          # raise it) and we can't reboot it, so raise the limit in-session as
-          # root via prlimit; the cargo child (and the ASM services it spawns)
-          # inherit it. Without this the services die with
-          # `mmap(rom) errno=11` / "shmem creation ... failed".
-          sudo prlimit --pid $$ --memlock=unlimited:unlimited
-          cargo run --bin zisk-host -- --execute --ixe ../minimal.ixe --constant myReflEq --skip-deps | tee only.txt
-          grep -qE "failures: 0\b" only.txt
diff --git a/Benchmarks/Typecheck.lean b/Benchmarks/Typecheck.lean
index 047cbec7..04992eac 100644
--- a/Benchmarks/Typecheck.lean
+++ b/Benchmarks/Typecheck.lean
@@ -5,6 +5,7 @@ import Ix.Aiur.Compiler
 import Ix.Aiur.Statistics
 import Ix.TracingTexray
 import Ix.Benchmark.Bench
+import Ix.Cli.ConstsFile
 import Ix.Cli.NameResolve
 
 /-!
@@ -18,25 +19,33 @@ runtime. Useful standalone (per-constant timeline + RAM breakdown via
 tracing-texray) and as a machine source (neutral results JSON).
 
 ```
-lake exe bench-typecheck --ixe <path> [names…] [flags]
+lake exe bench-typecheck --ixe <path> [--consts <n1,n2,…>] [--consts-file <p>] [flags]
 
-  --ixe <path>       serialized `Ixon.Env`, e.g. from `ix compile Foo.lean`
-                     (writes `foo.ixe`). Required.
-  [names…]           zero or more fully-qualified constant names to benchmark,
-                     e.g. `Nat.add_comm String.append`.
-  --manifest <path>  additionally read names from a file: one per line, blank
-                     lines and `#` comments ignored. Unions with [names…].
+  --ixe <path>          serialized `Ixon.Env`, e.g. from `ix compile Foo.lean`
+                        (writes `foo.ixe`). Required.
+  --consts <n1,n2,…>    comma-separated fully-qualified constant names to
+                        benchmark (e.g. `Nat.add_comm,String.append`). Same
+                        flag/shape as `ix check --consts`, `zisk-host --consts`,
+                        and `sp1-host --consts`.
+  --consts-file <path>  additionally read names from a file: one per line, blank
+                        lines and `#` comments ignored. Unions with --consts.
 
-  Names (from either source) resolve against the env's named map via
+  Names (from any source) resolve against the env's named map via
   `String.toName` plus a `toString` fallback (mirrors `ix check --ixe`), so
   numeric / private components round-trip (`Foo.0.Bar`, `_private.M.0.foo`).
-  Pass at least one name or a `--manifest`.
+  Pass at least one name via --consts or --consts-file.
 
+  --skip-deps    check only each target itself (verify_const, trusting its
+                 deps) instead of its whole transitive closure (verify_claim,
+                 the default). Same flag as `zisk-host --skip-deps`; reserved
+                 for targets too expensive to full-closure-check.
   --json <path>  write per-constant results JSON to <path>. Off by default:
                  normal CLI usage prints only the human-readable summary.
-  --texray       force the tracing-texray timeline + RAM breakdown on.
-  --no-texray    force it off. Default: on iff `--json` was NOT given, so a
-                 plain local run gets the breakdown while a JSON run stays quiet.
+  --texray       enable the tracing-texray timeline + RAM breakdown. With
+                 --json <path>, per-phase span timings are also written to
+                 <path>.spans (JSON Lines) for the CI drill-down. Off by default.
+  --execute-only run only Phase 1 (constants / fft-cost / execute-time) and skip
+                 proving — the fast `execute`-mode signal.
 ```
 
 For each constant the harness STARK-checks `Ix.Claim.check addr none` (the full
@@ -47,8 +56,11 @@ transitive typecheck) in two phases:
    (Σ width·height·log2(height) over circuits — the proving-cost proxy), and
    `execute-time`.
 2. **Prove** (cheap→expensive by measured fft-cost): the end-to-end STARK prove,
-   recording `prove-time`. With texray on, each prove emits a per-span timeline
-   (`aiur/execute`, `aiur/witness`, `stark/...`) with RAM Δ/peak to stderr.
+   recording `prove-time`, the serialized `proof-size` (bytes), and
+   `verify-time` (verifying the fresh proof) — prover changes can trade speed
+   against proof size or verification cost, so all three are tracked. With
+   texray on, each prove emits a per-span timeline (`aiur/execute`,
+   `aiur/witness`, `stark/...`) with RAM Δ/peak to stderr.
 
 When `--json` is set the file is rewritten after every prove, so an external
 `timeout` still leaves a complete file of the results collected so far (cheapest
@@ -57,9 +69,13 @@ warning, so a single bad name never fails the run. The harness imposes no time
 limit; bound a run with an external `timeout` if needed.
 
 The JSON is a neutral, flat shape (`{ "<name>": { "constants": …, "fft-cost": …,
-"execute-time": …, "prove-time": …, "throughput": … } }`, where `prove-time` and
-`throughput` appear only for proven constants); any bencher-specific reshaping
-is the caller's job (see `.github/workflows/bench-main.yml`).
+"execute-time": …, "execute-peak-rss": …, "prove-time": …, "proof-size": …,
+"verify-time": …, "peak-rss": …, "throughput": … } }`). `execute-peak-rss` is
+the Phase-1 RSS high-water, sampled before proving starts, so it is comparable
+across execute-only and prove runs; `prove-time`, `proof-size`, `verify-time`,
+`peak-rss` (the prover's high-water), and `throughput` appear only for proven
+constants. Any bencher-specific reshaping is the caller's job (see
+`.github/workflows/bench-main.yml`).
 -/
 
 open Lean (Json Name)
@@ -77,15 +93,6 @@ def friParameters : Aiur.FriParameters := {
   queryProofOfWorkBits := 0
 }
 
-/-- Manifest lines as raw strings: one name per line. Everything from a `#` to
-    end of line is a comment (whole-line or inline); blank lines are dropped.
-    `#` never appears in a Lean name, so splitting on it is safe. Resolution
-    against the env happens later (so the `toString` fallback can see the
-    displayed form the user wrote). -/
-def parseManifest (contents : String) : Array String :=
-  (contents.splitOn "\n").filterMap (fun line =>
-    let s := ((line.splitOn "#").head?.getD "").trimAscii
-    if s.isEmpty then none else some s.toString) |>.toArray
 
 
 /-- Per-constant measurements. `proveSec` is `none` when the constant was
@@ -95,27 +102,66 @@ structure Result where
   constants : Nat
   fftCost : Float
   executeSec : Float
+  /-- The kernel REJECTED the constant (Phase-1 check error). The JSON entry
+      is the bare `{"failed": true}` sentinel — a rejected constant is a
+      correctness signal, not a benchmark datum — and Phase 2 skips it.
+      bench.py compare renders a ❌ row plus a loud note. -/
+  failed : Bool := false
   proveSec : Option Float := none
+  /-- Serialized proof size in bytes (`Aiur.Proof.toBytes`). Tracked because
+      prover changes can trade speed against proof size. -/
+  proofSize : Option Nat := none
+  /-- Wall time of `AiurSystem.verify` over the fresh proof — the other side
+      of the same trade-off. `none` if verification failed (reported loudly). -/
+  verifySec : Option Float := none
+  /-- Peak resident-set size in bytes (tracing-texray tree sampler), sampled
+      right after the constant's prove returns; `none` if it was not proven. -/
+  peakRss : Option Nat := none
+  /-- Phase-1 (execute) RSS high-water mark, sampled before any proving
+      allocations. Present in BOTH modes so an execute-only run compares
+      apples-to-apples against a prove run's baseline — `peak-rss` in a prove
+      run is dominated by the prover and would dwarf an execute-only peak. -/
+  executePeakRss : Option Nat := none
   deriving Inhabited
 
-/-- Round a Float to `d` decimal places, to keep the emitted JSON readable. -/
-def roundTo (d : Nat) (f : Float) : Float :=
+/-- A `Json` number with at most `d` decimal places, rendered decimally.
+    `Float`'s own `ToJson` prints the full binary representation
+    (`0.02602000000000000146…`), so build the `JsonNumber` (mantissa ·
+    10⁻ᵈ) directly from the rounded value instead. -/
+def jsonRound (d : Nat) (f : Float) : Json :=
   let scale := (10.0 : Float) ^ d.toFloat
-  (f * scale).round / scale
+  let scaled := f * scale
+  let m : Int :=
+    if scaled < 0 then -Int.ofNat (-scaled).round.toUInt64.toNat
+    else Int.ofNat scaled.round.toUInt64.toNat
+  Json.num ⟨m, d⟩
 
 /-- Neutral, flat results object: `name → { constants, fft-cost, execute-time,
     prove-time?, throughput? }`. No bencher-specific shaping. -/
 def Result.toJsonEntry (r : Result) : String × Json :=
+  if r.failed then (r.name, Json.mkObj [("failed", Json.bool true)]) else
   let base : List (String × Json) :=
     [ ("constants", Lean.toJson r.constants)
-    , ("fft-cost", Lean.toJson (roundTo 0 r.fftCost))
-    , ("execute-time", Lean.toJson (roundTo 6 r.executeSec)) ]
+    , ("fft-cost", jsonRound 0 r.fftCost)
+    , ("execute-time", jsonRound 6 r.executeSec) ]
+  let base := match r.peakRss with
+    | some n => base ++ [ ("peak-rss", Lean.toJson n) ]
+    | none => base
+  let base := match r.executePeakRss with
+    | some n => base ++ [ ("execute-peak-rss", Lean.toJson n) ]
+    | none => base
   -- prove-time and the derived proving throughput (constants/prove-time, the
   -- proving analog of compile's constants/sec) are present only once proven.
   let fields := match r.proveSec with
-    | some p => base ++ [ ("prove-time", Lean.toJson (roundTo 6 p))
-                        , ("throughput", Lean.toJson (roundTo 2 (r.constants.toFloat / p))) ]
+    | some p => base ++ [ ("prove-time", jsonRound 6 p)
+                        , ("throughput", jsonRound 2 (r.constants.toFloat / p)) ]
     | none => base
+  let fields := match r.proofSize with
+    | some n => fields ++ [ ("proof-size", Lean.toJson n) ]
+    | none => fields
+  let fields := match r.verifySec with
+    | some v => fields ++ [ ("verify-time", jsonRound 6 v) ]
+    | none => fields
   (r.name, Json.mkObj fields)
 
 /-- Time a thunk, returning its value and the elapsed seconds. The result is
@@ -130,37 +176,39 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
   let some ixeArg := p.flag? "ixe"
     | IO.eprintln "error: --ixe <path> is required"; return 1
   let ixePath := ixeArg.as! String
-  -- Names come from the variadic positional args and/or a `--manifest` file.
-  let cliNames := p.variableArgsAs! String
-  let fileNames ← match p.flag? "manifest" with
-    | some f => pure (parseManifest (← IO.FS.readFile (f.as! String)))
-    | none => pure #[]
-  -- Union, preserving first-seen order, so the same const isn't proven twice.
-  let nameArgs := Id.run do
-    let mut seen : Std.HashSet String := {}
-    let mut acc : Array String := #[]
-    for n in cliNames ++ fileNames do
-      if !seen.contains n then seen := seen.insert n; acc := acc.push n
-    return acc
+  -- `--consts` comma-list ∪ `--consts-file`, shared grammar + dedup
+  -- (Ix.Cli.ConstsFile — same parser as `ix check-rs`). Raw strings:
+  -- resolution against the env happens later (so the `toString` fallback can
+  -- see the displayed form the user wrote).
+  let nameArgs ← Ix.Cli.ConstsFile.gather p
   if nameArgs.isEmpty then
-    IO.eprintln "error: provide one or more constant names and/or --manifest <path>"
+    IO.eprintln "error: provide at least one constant via --consts <n1,n2,…> and/or --consts-file <path>"
     return 1
   let jsonOut : Option String := (p.flag? "json").map (·.as! String)
-  -- subject-only: check just the target (`verify_const`, trusting its deps)
+  -- skip-deps: check just the target (`verify_const`, trusting its deps)
   -- instead of re-checking the whole transitive closure (`verify_claim`).
-  let subjectOnly := p.hasFlag "subject-only"
-  -- Default: trace iff we're not in JSON/bencher mode.
-  let useTexray :=
-    if p.hasFlag "texray" then true
-    else if p.hasFlag "no-texray" then false
-    else jsonOut.isNone
+  let skipDeps := p.hasFlag "skip-deps"
+  -- Execute-only: run just Phase 1 (constants / fft-cost / execute-time) and
+  -- skip the Phase 2 prove loop.
+  let executeOnly := p.hasFlag "execute-only"
+  -- Off by default; CI passes --texray explicitly.
+  let useTexray := p.hasFlag "texray"
+  -- Start the process-tree RSS sampler so each Result's peak-rss reflects the
+  -- true high-water mark. When --texray + --json are both on, also install the
+  -- streaming subscriber and point the per-span sink at `<json>.spans`, so the
+  -- prover's aiur/* and stark/* phase timings land as JSON Lines for the CI
+  -- comparison.
+  TracingTexray.startSampler
+  match useTexray, jsonOut with
+  | true, some path => TracingTexray.init {}; TracingTexray.jsonSink s!"{path}.spans"
+  | _, _ => pure ()
 
   -- Compile the IxVM kernel once; build the prover system once.
   let .ok toplevel := IxVM.ixVM
     | throw (IO.userError "Merging IxVM kernel failed")
   let .ok compiled := toplevel.compile
     | throw (IO.userError "Compilation of IxVM kernel failed")
-  let entrypoint := if subjectOnly then `verify_const else `verify_claim
+  let entrypoint := if skipDeps then `verify_const else `verify_claim
   let some funIdx := compiled.getFuncIdx entrypoint
     | throw (IO.userError s!"{entrypoint} entrypoint missing")
   let aiurSystem := Aiur.AiurSystem.build compiled.bytecode commitmentParameters
@@ -190,7 +238,7 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
 
   -- Phase 1: execute every constant (cheap, deterministic structural metrics).
   -- For full-closure check claims, use `checkAddrWithEnv` against the
-  -- shared `envHandle`. For `--subject-only` (`buildVerifyConst`), the
+  -- shared `envHandle`. For `--skip-deps` (`buildVerifyConst`), the
   -- witness is a small subject-only blob — keep Lean witness +
   -- `executeIxVM`.
   IO.println "── Phase 1: execute (witness generation) ──"
@@ -198,13 +246,17 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
   for (label, addr) in targets do
     try
       let (res, execSec) ← timed fun _ =>
-        if subjectOnly then
+        if skipDeps then
           let witness := IxVM.ClaimHarness.buildVerifyConst ixonEnv addr
           compiled.bytecode.executeIxVM funIdx witness.input witness.inputIOBuffer
         else
           compiled.bytecode.checkAddrWithEnv funIdx envHandle addr.hash
       match res with
-      | .error e => IO.eprintln s!"  execute {label} failed: {e}"
+      | .error e =>
+        IO.eprintln s!"  ❌ {label} FAILED TO TYPECHECK: {e}"
+        execed := execed.push
+          ({ name := label, constants := 0, fftCost := 0, executeSec := 0,
+             failed := true }, addr)
       | .ok (_, _, queryCounts) =>
         let stats := Aiur.computeStats compiled queryCounts
         let constants := (IxVM.ClaimHarness.closureFrom ixonEnv addr).size
@@ -223,6 +275,21 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
       IO.FS.writeFile path (Json.mkObj (results.map Result.toJsonEntry).toList).pretty
     | none => pure ()
 
+  -- Phase-1 RSS high-water, sampled BEFORE any proving allocations so the
+  -- measure is comparable between execute-only and prove runs (`peak-rss`
+  -- from a prove run is the prover's high-water and would dwarf it).
+  let executePeak ← TracingTexray.peakTreeRssBytes
+  execed := execed.map (fun (r, a) => ({ r with executePeakRss := some executePeak }, a))
+
+  -- `--execute-only`: stop after Phase 1; the results JSON (if requested) is
+  -- already complete with the execute metrics.
+  if executeOnly then
+    writeJson (execed.map (·.1))
+    match jsonOut with
+    | some path => IO.println s!"wrote {execed.size} execute-only benchmarks to {path}"
+    | none => IO.println s!"executed {execed.size} constants (--execute-only); pass --json <path> to emit results"
+    return 0
+
   -- Phase 2: prove cheap→expensive. Refine each entry with its prove-time as it
   -- lands. Install texray first so the prove spans (timeline + RAM Δ/peak) render.
   if useTexray then TracingTexray.init {}
@@ -232,9 +299,10 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
   let mut spent : Float := 0.0
   for i in [:ordered.size] do
     let (r, addr) := ordered[i]!
+    if r.failed then continue
     try
       let (proveRes, proveSec) ← timed fun _ =>
-        if subjectOnly then
+        if skipDeps then
           let witness := IxVM.ClaimHarness.buildVerifyConst ixonEnv addr
           let (claim, proof, ioBuf) :=
             aiurSystem.proveIxVM friParameters funIdx witness.input witness.inputIOBuffer
@@ -243,19 +311,34 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do
         else
           match aiurSystem.proveAddrWithEnv friParameters funIdx envHandle addr.hash with
           | .error e => .error e
-          | .ok (_claimBytes, proof, ioBuf) =>
-            -- The shared envHandle path doesn't return an `Array G`
-            -- claim — adapt to the existing benchmark return shape
-            -- by recomputing the claim digest from the witness's
-            -- input (Phase 2 doesn't read it).
-            .ok (#[], proof, ioBuf)
+          | .ok (claimBytes, proof, ioBuf) =>
+            -- The envHandle path returns the SERIALIZED `Ix.Claim`; rebuild
+            -- the Array-G claim `verify` takes — `verify_claim`'s input is
+            -- the 32-G blake3 digest of those bytes (same recipe as
+            -- `ix verify`).
+            let digest := Address.blake3 claimBytes
+            let claim :=
+              Aiur.buildClaim funIdx (digest.hash.data.map .ofUInt8) #[]
+            .ok (claim, proof, ioBuf)
       match (proveRes : Except String (Array Aiur.G × Aiur.Proof × Aiur.IOBuffer)) with
       | .error e => IO.eprintln s!"  prove {r.name} failed: {e}"; continue
-      | .ok _ => pure ()
-      spent := spent + proveSec
-      IO.println s!"  {r.name}: prove={proveSec}s (cumulative {spent}s)"
-      ordered := ordered.set! i ({ r with proveSec := some proveSec }, addr)
-      writeJson (ordered.map (·.1))
+      | .ok (claim, proof, _ioBuf) =>
+        spent := spent + proveSec
+        let peak ← TracingTexray.peakTreeRssBytes
+        let proofSize := (Aiur.Proof.toBytes proof).size
+        let (verifyRes, verifySec) ← timed fun _ =>
+          aiurSystem.verify friParameters claim proof
+        let verifySec? ← match verifyRes with
+          | .ok () => pure (some verifySec)
+          | .error e =>
+            IO.eprintln s!"  verify {r.name} FAILED: {e}"
+            pure none
+        IO.println s!"  {r.name}: prove={proveSec}s verify={verifySec}s \
+          proof={proofSize} bytes (cumulative {spent}s)"
+        ordered := ordered.set! i
+          ({ r with proveSec := some proveSec, peakRss := some peak
+                  , proofSize := some proofSize, verifySec := verifySec? }, addr)
+        writeJson (ordered.map (·.1))
     catch e =>
       IO.eprintln s!"  prove {r.name} threw: {e}"
 
@@ -269,15 +352,14 @@ def typecheckCmd : Cli.Cmd := `[Cli|
   "Benchmark IxVM-kernel execution + proving of `Ix.Claim.check` over `.ixe` constants"
 
   FLAGS:
-    "ixe"       : String; "Path to a serialized `Ixon.Env` (e.g. produced by `ix compile`). Required."
-    "manifest"  : String; "Additionally read constant names from a file (one per line; `#` comments and blank lines ignored). Unions with the positional names."
+    "ixe"          : String; "Path to a serialized `Ixon.Env` (e.g. produced by `ix compile`). Required."
+    "consts"       : String; "Comma-separated fully-qualified constant names to benchmark (e.g. `Nat.add_comm,String.append`). Same flag/shape as `ix check --consts`, `zisk-host --consts`, and `sp1-host --consts`."
+    "consts-file"  : String; "Additionally read constant names from a file (one per line; `#` comments and blank lines ignored). Unions with --consts."
     "json"      : String; "Write per-constant results JSON to this path. Off by default; normal CLI usage prints only the human-readable summary."
-    "subject-only";       "Check only each target itself (verify_const, trusting its deps) instead of re-checking its whole transitive closure (verify_claim)."
-    texray;               "Force the tracing-texray timeline + RAM breakdown on (per-prove spans on stderr)."
-    "no-texray";          "Force the breakdown off. Default: on iff --json was not given."
+    "skip-deps";          "Check only each target itself (verify_const, trusting its deps) instead of re-checking its whole transitive closure (verify_claim). Same flag as `zisk-host --skip-deps`."
+    "execute-only";       "Execute only (Phase 1: constants / fft-cost / execute-time) and skip proving. The fast per-PR `execute`-mode signal."
+    texray;               "Enable the tracing-texray timeline + RAM breakdown (per-prove spans on stderr). Combined with --json, per-phase span timings are additionally written to `<json>.spans` as JSON Lines for the CI drill-down. Off by default."
 
-  ARGS:
-    ...names : String;   "Fully-qualified constant name(s) to benchmark (e.g. `Nat.add_comm String.append`). Optional if `--manifest` is given."
 ]
 
 def main (args : List String) : IO UInt32 :=
diff --git a/Benchmarks/Vectors.csv b/Benchmarks/Vectors.csv
new file mode 100644
index 00000000..7a5d772b
--- /dev/null
+++ b/Benchmarks/Vectors.csv
@@ -0,0 +1,92 @@
+# Benchmark constant vectors -- single shared source of truth for which
+# constants to run. Consumed identically by Aiur (bench-typecheck --consts-file),
+# the Zisk/SP1 hosts (--consts loop), and CI shell (awk). One row per curated
+# constant. Measurements (fft, cycles, prove-time, …) live in the neutral
+# results JSON each tool emits and in bencher.dev — never in this file.
+#
+# Columns (the trailing shard_target and primary columns default to 0 when
+# omitted; most rows only carry the first three):
+#   name         fully-qualified Lean name (resolves via NameResolve.resolveIxeAddr).
+#   env          compile target / .ixe it resolves in: initStd | lean | mathlib.
+#   tier         cheap = prove-feasible on a CI runner; heavy = a single-shard
+#                prove exceeds the RAM watchdog ceiling (expect an OOM row).
+#                Consumed by bench.py manifest only (BENCH_TIER filter; the
+#                non-primary prove set defaults to cheap) — run.sh attempts a
+#                full prove of every selected constant regardless of tier.
+#   shard_target 1 = heavy constant designated as a multi-shard prove target.
+#   primary      1 = part of the primary subset spanning shape + the cheap->heavy
+#                cost range. Default for the !benchmark PR comment and the
+#                bencher prove / zkVM jobs (full set via BENCH_FULL=1). Heavy
+#                reps run only in execute/native; prove mode keeps the cheap
+#                primaries (tier filter).
+#
+# CI filters on the env column (not line number), so these '#' lines and the
+# header are skipped by: awk -F, '$1!~/^#/ && $1!="name" && $2==env ...'
+name,env,tier,shard_target,primary
+HEq,initStd,cheap
+Nat,initStd,cheap
+Eq.rec,initStd,cheap
+HEq.rec,initStd,cheap
+Trans.mk,initStd,cheap
+Array.toList,initStd,cheap
+Acc.rec,initStd,cheap
+Sum.elim,initStd,cheap
+Prod.map,initStd,cheap
+Option.bind,initStd,cheap
+Except.bind,initStd,cheap
+WellFounded.fix,initStd,cheap
+Nat.add,initStd,cheap
+List.filterMap,initStd,cheap
+Int.add,initStd,cheap
+BitVec.toFin,initStd,cheap
+Nat.add_comm,initStd,cheap,0,1
+USize.toNat,initStd,cheap
+Nat.decEq,initStd,cheap
+ByteSlice.ofByteArray,initStd,cheap
+Nat.decLe,initStd,cheap
+Nat.strongRecOn,initStd,cheap
+Int.emod,initStd,cheap
+Array.foldlM,initStd,cheap
+Array.filter,initStd,cheap
+Nat.sub_le_of_le_add,initStd,cheap,0,1
+BitVec.add,initStd,cheap
+Int.gcd,initStd,cheap,0,1
+Nat.toDigits,initStd,cheap
+Array.map,initStd,cheap
+Lean.Name.hash,initStd,cheap
+BitVec.umod,initStd,cheap
+Nat.repr,initStd,cheap
+String.intercalate,initStd,heavy
+_private.Init.Prelude.0.Lean.extractMainModule._unsafe_rec,initStd,heavy
+Char.toLower,initStd,heavy
+Nat.gcd_comm,initStd,heavy,0,1
+Int.emod_emod_of_dvd,initStd,heavy
+Array.append_assoc,initStd,heavy
+Vector.append,initStd,heavy,0,1
+Fin.foldl,initStd,heavy
+List.mergeSort,initStd,heavy,1,1
+Array.binSearch,initStd,heavy,1
+Array.qsortOrd,initStd,heavy
+String.split,initStd,heavy,0,1
+Std.Time.Week.Offset.ofMilliseconds,initStd,heavy
+Vector.extract_append._proof_2,initStd,heavy,1,1
+ByteArray.utf8DecodeChar?_utf8EncodeChar_append,initStd,heavy,0,1
+String.append,initStd,cheap,0,1
+_private.Init.Data.Range.Polymorphic.SInt.0.Int8.instRxcHasSize_eq,initStd,heavy,0,1
+_private.Init.Data.Range.Polymorphic.SInt.0.Int16.instRxcHasSize_eq,initStd,heavy,0,1
+_private.Init.Data.Range.Polymorphic.SInt.0.Int32.instRxcHasSize_eq,initStd,heavy,0,1
+_private.Init.Data.Range.Polymorphic.SInt.0.Int64.instRxcHasSize_eq,initStd,heavy,0,1
+Char.ofOrdinal_le_of_le,initStd,heavy,0,1
+Array.extract_append,initStd,heavy,0,1
+Std.Tactic.BVDecide.BVExpr.bitblast.goCache_Inv_of_Inv._mutual,initStd,heavy,0,1
+Lean.Expr.replace,lean,cheap
+List.Sorted,mathlib,cheap
+Nat.choose,mathlib,cheap
+Nat.factorial,mathlib,cheap,0,1
+Nat.fib,mathlib,cheap
+GCDMonoid.gcd,mathlib,heavy
+Nat.Prime.two_le,mathlib,heavy
+Finset.prod,mathlib,heavy
+Finset.sum,mathlib,heavy
+Polynomial.eval,mathlib,heavy
+Multiset.sort,mathlib,heavy,1,1
diff --git a/Cargo.lock b/Cargo.lock
index 9db6079f..2c5bc0f2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -831,7 +831,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.61.2",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -2335,7 +2335,7 @@ version = "0.50.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
 dependencies = [
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -3085,7 +3085,7 @@ dependencies = [
  "once_cell",
  "socket2",
  "tracing",
- "windows-sys 0.60.2",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -3368,7 +3368,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b55fb86dfd3a2f5f76ea78310a88f96c4ea21a3031f8d212443d56123fd0521"
 dependencies = [
  "libc",
- "windows-sys 0.61.2",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -4022,7 +4022,7 @@ dependencies = [
 [[package]]
 name = "tracing-texray"
 version = "0.2.0"
-source = "git+https://github.com/argumentcomputer/tracing-texray?rev=31d194dd1bc50458d26f77c89bb68f67e5d1c149#31d194dd1bc50458d26f77c89bb68f67e5d1c149"
+source = "git+https://github.com/argumentcomputer/tracing-texray?rev=bd4faa08a4fa4edb46bde393b4d20c6bd49591d0#bd4faa08a4fa4edb46bde393b4d20c6bd49591d0"
 dependencies = [
  "loom",
  "parking_lot",
@@ -4499,7 +4499,7 @@ version = "0.52.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
 dependencies = [
- "windows-targets 0.52.6",
+ "windows-targets",
 ]
 
 [[package]]
@@ -4508,16 +4508,7 @@ version = "0.59.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
 dependencies = [
- "windows-targets 0.52.6",
-]
-
-[[package]]
-name = "windows-sys"
-version = "0.60.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
-dependencies = [
- "windows-targets 0.53.5",
+ "windows-targets",
 ]
 
 [[package]]
@@ -4535,31 +4526,14 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
 dependencies = [
- "windows_aarch64_gnullvm 0.52.6",
- "windows_aarch64_msvc 0.52.6",
- "windows_i686_gnu 0.52.6",
- "windows_i686_gnullvm 0.52.6",
- "windows_i686_msvc 0.52.6",
- "windows_x86_64_gnu 0.52.6",
- "windows_x86_64_gnullvm 0.52.6",
- "windows_x86_64_msvc 0.52.6",
-]
-
-[[package]]
-name = "windows-targets"
-version = "0.53.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3"
-dependencies = [
- "windows-link",
- "windows_aarch64_gnullvm 0.53.1",
- "windows_aarch64_msvc 0.53.1",
- "windows_i686_gnu 0.53.1",
- "windows_i686_gnullvm 0.53.1",
- "windows_i686_msvc 0.53.1",
- "windows_x86_64_gnu 0.53.1",
- "windows_x86_64_gnullvm 0.53.1",
- "windows_x86_64_msvc 0.53.1",
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
 ]
 
 [[package]]
@@ -4577,96 +4551,48 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
 
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
-
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
 
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
-
 [[package]]
 name = "windows_i686_gnu"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
 
-[[package]]
-name = "windows_i686_gnu"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3"
-
 [[package]]
 name = "windows_i686_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
 
-[[package]]
-name = "windows_i686_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
-
 [[package]]
 name = "windows_i686_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
 
-[[package]]
-name = "windows_i686_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
-
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
 
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
-
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
 
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
-
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
-
 [[package]]
 name = "winnow"
 version = "1.0.2"
diff --git a/Cargo.toml b/Cargo.toml
index 456f0b7a..b8afcfd6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -58,7 +58,7 @@ sha2 = "0.10"
 tiny-keccak = { version = "2", features = ["keccak"] }
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
-tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "31d194dd1bc50458d26f77c89bb68f67e5d1c149" }
+tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "bd4faa08a4fa4edb46bde393b4d20c6bd49591d0" }
 
 [workspace.lints.rust]
 invalid_reference_casting = "warn"
diff --git a/Ix/Cli/CheckRsCmd.lean b/Ix/Cli/CheckRsCmd.lean
index 1cb0fbf7..190f08ee 100644
--- a/Ix/Cli/CheckRsCmd.lean
+++ b/Ix/Cli/CheckRsCmd.lean
@@ -6,18 +6,22 @@
 
   - Default (Meta): kernel runs with metadata fields populated (Lean.Name,
     binder info, mdata). Supports `--ns` / `--consts` / `--consts-file`
-    for seed filtering and `--fail-out` for bisect-loop workflows.
+    for seed filtering and `--fail-out` for bisect-loop workflows. Seeded
+    meta checks are SUBJECT-ONLY: each seed is checked with its deps
+    lazily ingressed but trusted, not re-checked.
   - `--anon` (metadata-free): the env is loaded via `Env::get_anon` —
     `named`/`names`/`comms` sections are discarded at load time, never
-    reaching the kernel. Every kernel-checkable address (every constant
-    except Muts blocks and projections — projections are covered by
-    their parent block) is checked. The kernel's typechecking logic
-    structurally cannot read metadata (`M::MField<T>` is `()` in Anon
-    mode); progress labels are `@<hex>` addresses, not names.
-
-    `--anon` is incompatible with `--ns` / `--consts` / `--consts-file`:
-    the anon path checks everything in the env. Add `--addrs <hex,…>`
-    in the future if address-based filtering is needed.
+    reaching the kernel. The kernel's typechecking logic structurally
+    cannot read metadata (`M::MField<T>` is `()` in Anon mode); progress
+    labels are `@<hex>` addresses, not names.
+
+    Without a filter, every kernel-checkable address is checked (whole
+    env). With `--consts` / `--consts-file`, the named constants are
+    checked together with their FULL dependency closures — the same mode
+    and scope as the zkVM hosts' `--consts` execute path, so an
+    out-of-circuit run is directly comparable to the in-circuit one. Add
+    `--skip-deps` for a subject-only check (deps trusted), mirroring
+    `zisk-host --skip-deps`. `--ns` prefix filtering stays meta-only.
 
   Direct Lean → kernel typechecking (compile-and-check from source) is
   available via the `rsCheckConstsFFI` API for tests
@@ -29,6 +33,8 @@ public import Cli
 public import Ix.Common
 public import Ix.KernelCheck
 public import Ix.Meta
+public import Ix.TracingTexray
+public import Ix.Cli.ConstsFile
 public import Ix.Cli.ValidateCmd
 public import Std.Internal.UV.System
 
@@ -48,18 +54,6 @@ private structure SeedSpec where
 private def SeedSpec.isEmpty (s : SeedSpec) : Bool :=
   s.prefixes.isEmpty && s.exacts.isEmpty
 
-/-- Read one constant name per line from `path`. Blank lines and lines
-    starting with `#` (after trimming) are ignored. -/
-private def readNamesFile (path : String) : IO (List Lean.Name) := do
-  let content ← IO.FS.readFile path
-  let lines := content.splitOn "\n"
-  let names : List Lean.Name := lines.filterMap fun raw =>
-    let cs := raw.toList.dropWhile Char.isWhitespace
-    let trimmed := String.ofList (cs.reverse.dropWhile Char.isWhitespace).reverse
-    if trimmed.isEmpty || trimmed.startsWith "#" then none
-    else some trimmed.toName
-  pure names
-
 /-- Build a `SeedSpec` from `--ns`, `--consts`, and `--consts-file`. -/
 private def resolveSeedSpec (p : Cli.Parsed) : IO (Option SeedSpec) := do
   let nsFlag     := p.flag? "ns"
@@ -82,7 +76,8 @@ private def resolveSeedSpec (p : Cli.Parsed) : IO (Option SeedSpec) := do
     exacts := exacts ++ parsed
   if let some flag := fileFlag then
     let path := flag.as! String
-    let parsed ← readNamesFile path
+    -- Shared grammar (Ix.Cli.ConstsFile); meta seeds resolve via `toName`.
+    let parsed := (← ConstsFile.read path).toList.map (·.toName)
     if parsed.isEmpty then
       IO.println s!"[check] warning: --consts-file '{path}' yielded zero names"
     else
@@ -168,7 +163,50 @@ private def runCheckAnon (envPath : String) (p : Cli.Parsed) : IO UInt32 := do
   if !failOutPath.isEmpty then
     IO.println s!"[check] streamed {failures.size} failure(s) to {failOutPath}"
 
-  IO.println s!"##check## {elapsed} {passed} {failures.size} {results.size}"
+  let peakRss ← TracingTexray.peakTreeRssBytes
+  IO.println s!"##check## {elapsed} {passed} {failures.size} {results.size} {peakRss}"
+  return if failures.isEmpty then 0 else 1
+
+/-- Anon-mode per-constant runner: dispatch to `rsCheckAnonConstsFFI`. Checks
+    the named constants and (by default) their full dependency closures — the
+    zkVM hosts' semantics — or subject-only under `--skip-deps`. -/
+private def runCheckAnonConsts (envPath : String) (p : Cli.Parsed) : IO UInt32 := do
+  let verbose := p.flag? "verbose" |>.isSome
+  let skipDeps := p.hasFlag "skip-deps"
+  let failOutPath : String :=
+    match p.flag? "fail-out" with
+    | some flag => flag.as! String
+    | none      => ""
+  -- Raw strings (no toName round-trip): the FFI resolves displayed forms
+  -- against the env's `named` map, matching the zkVM hosts' resolution.
+  let names ← ConstsFile.gather p
+  if names.isEmpty then
+    IO.println "[check] error: --consts/--consts-file resolved to zero names"
+    return 1
+
+  let scope := if skipDeps then "subject-only" else "full-closure"
+  IO.println s!"Running Ix kernel check (anon mode, {scope}) on {envPath}"
+  IO.println s!"[check] {names.size} seed constant(s): {", ".intercalate names.toList}"
+  let start ← IO.monoMsNow
+  let results ← rsCheckAnonConstsFFI envPath names skipDeps (!verbose) failOutPath
+  let elapsed := (← IO.monoMsNow) - start
+
+  let mut passed := 0
+  let mut failures : Array (String × String) := #[]
+  for (hex, res) in results do
+    match res with
+    | none => passed := passed + 1
+    | some err => failures := failures.push (s!"#{hex}", err.message)
+
+  IO.println s!"[check] checked {results.size} constants in {elapsed.formatMs}"
+  IO.println s!"[check] {passed}/{results.size} passed"
+  reportFailures failures
+
+  if !failOutPath.isEmpty then
+    IO.println s!"[check] streamed {failures.size} failure(s) to {failOutPath}"
+
+  let peakRss ← TracingTexray.peakTreeRssBytes
+  IO.println s!"##check## {elapsed} {passed} {failures.size} {results.size} {peakRss}"
   return if failures.isEmpty then 0 else 1
 
 /-- Meta-mode runner: dispatch to `rsCheckIxonFFI` with seed filtering. -/
@@ -219,7 +257,8 @@ private def runCheckMeta (envPath : String) (p : Cli.Parsed) : IO UInt32 := do
   if !failOutPath.isEmpty then
     IO.println s!"[check] streamed {failures.size} failure(s) to {failOutPath}"
 
-  IO.println s!"##check## {elapsed} {passed} {failures.size} {seedNames.size}"
+  let peakRss ← TracingTexray.peakTreeRssBytes
+  IO.println s!"##check## {elapsed} {passed} {failures.size} {seedNames.size} {peakRss}"
   return if failures.isEmpty then 0 else 1
 
 def runCheckRsCmd (p : Cli.Parsed) : IO UInt32 := do
@@ -228,6 +267,10 @@ def runCheckRsCmd (p : Cli.Parsed) : IO UInt32 := do
       return 1
   let envPath := pathArg.as! String
 
+  -- Start the process-tree RSS sampler so the `##check##` line can report an
+  -- accurate peak-rss (the parallel kernel check's high-water mark).
+  TracingTexray.startSampler
+
   -- `--workers N` is plumbed through the existing
   -- `IX_KERNEL_CHECK_WORKERS` env var that `resolve_kernel_check_workers`
   -- (`src/ffi/kernel.rs`) reads. Setting `1` forces a single-threaded
@@ -240,14 +283,19 @@ def runCheckRsCmd (p : Cli.Parsed) : IO UInt32 := do
     Std.Internal.UV.System.osSetenv "IX_KERNEL_CHECK_WORKERS" (toString n)
 
   let anon := p.flag? "anon" |>.isSome
+  let hasConsts := (p.flag? "consts").isSome || (p.flag? "consts-file").isSome
+  if p.hasFlag "skip-deps" && !(anon && hasConsts) then
+    p.printError "error: --skip-deps only applies to `--anon --consts/--consts-file` \
+      (meta-mode seeded checks are always subject-only)"
+    return 1
   if anon then
-    let hasConsts := p.flag? "consts" |>.isSome
-    let hasNs := p.flag? "ns" |>.isSome
-    let hasConstsFile := p.flag? "consts-file" |>.isSome
-    if hasConsts || hasNs || hasConstsFile then
-      p.printError "error: --anon checks the entire env; --consts/--ns/--consts-file are unsupported"
+    if p.flag? "ns" |>.isSome then
+      p.printError "error: --ns prefix filtering is meta-only; --anon supports --consts/--consts-file"
       return 1
-    runCheckAnon envPath p
+    if hasConsts then
+      runCheckAnonConsts envPath p
+    else
+      runCheckAnon envPath p
   else
     runCheckMeta envPath p
 
@@ -261,8 +309,9 @@ def checkRsCmd : Cli.Cmd := `[Cli|
   FLAGS:
     anon;                   "Run the kernel in anon mode (no metadata read from .ixe)"
     ns            : String; "Comma-separated Lean.Name prefixes to filter on (meta mode only)"
-    consts        : String; "Comma-separated EXACT constant names to seed (meta mode only)"
-    "consts-file" : String; "Path to a file with one constant name per line (meta mode only)"
+    consts        : String; "Comma-separated EXACT constant names. Meta mode: subject-only seed check. Anon mode: full-closure check of each name (the zkVM hosts' semantics; --skip-deps for subject-only)."
+    "consts-file" : String; "Path to a file with one constant name per line (`#` comments); unions with --consts."
+    "skip-deps";            "With --anon --consts: check each named constant subject-only, trusting its deps (same flag as zisk-host/sp1-host/bench-typecheck)."
     "fail-out"    : String; "Write failing constants to this path (consumable by --consts-file)"
     workers       : Nat;    "Number of parallel kernel-check workers; 1 disables parallelism (default: available_parallelism). Plumbs via IX_KERNEL_CHECK_WORKERS env var."
     verbose;                "Log every constant on its own line (default: quiet)"
diff --git a/Ix/Cli/CompileCmd.lean b/Ix/Cli/CompileCmd.lean
index a00f95e5..19a75862 100644
--- a/Ix/Cli/CompileCmd.lean
+++ b/Ix/Cli/CompileCmd.lean
@@ -3,6 +3,7 @@ public import Cli
 public import Ix.Common
 public import Ix.CompileM
 public import Ix.Meta
+public import Ix.Cli.ConstsFile
 public import Ix.Cli.ValidateCmd
 
 public section
@@ -14,19 +15,6 @@ private def defaultOutPathFor (pathStr : String) : String :=
   let stem := path.fileStem.getD (path.fileName.getD pathStr)
   stem.toLower ++ ".ixe"
 
-/-- Read one constant name per line from `path`. Blank lines and lines
-    starting with `#` (after trimming) are ignored. Mirrors
-    `Ix.Cli.CheckCmd.readNamesFile`. -/
-private def readNamesFile (path : String) : IO (List Lean.Name) := do
-  let content ← IO.FS.readFile path
-  let lines := content.splitOn "\n"
-  let names : List Lean.Name := lines.filterMap fun raw =>
-    let cs := raw.toList.dropWhile Char.isWhitespace
-    let trimmed := String.ofList (cs.reverse.dropWhile Char.isWhitespace).reverse
-    if trimmed.isEmpty || trimmed.startsWith "#" then none
-    else some trimmed.toName
-  pure names
-
 def runCompileCmd (p : Cli.Parsed) : IO UInt32 := do
   let some path := p.positionalArg? "path"
     | p.printError "error: must specify <path> to a Lean source file"
@@ -51,7 +39,10 @@ def runCompileCmd (p : Cli.Parsed) : IO UInt32 := do
     if let some flag := p.flag? "exclude" then
       for n in parsePrefixes (flag.as! String) do s := s.insert n
     if let some flag := p.flag? "exclude-file" then
-      for n in ← readNamesFile (flag.as! String) do s := s.insert n
+      -- Shared names-file grammar (Ix.Cli.ConstsFile); names resolve here
+      -- via `toName` like the `--exclude` comma-list.
+      for n in ← Ix.Cli.ConstsFile.read (flag.as! String) do
+        s := s.insert n.toName
     pure s
   if !excludeSet.isEmpty then
     IO.println s!"[compile] exclude: {excludeSet.size} name(s) will be dropped from seed set"
@@ -67,7 +58,36 @@ def runCompileCmd (p : Cli.Parsed) : IO UInt32 := do
   -- Seeds pass through `collectDeps` for the transitive-dep closure.
   -- Flag name is `--module` (not `--ns`) because the match is against
   -- the source module name, not the decl's own namespace.
-  let constList ← match p.flag? "module" with
+  -- `--consts` / `--consts-file`: seed by EXACT constant name, transitive
+  -- deps via `collectDeps` — a closure-only env (e.g. one benchmark constant
+  -- + deps) instead of the whole import env. Resolution tries `String.toName`
+  -- first, then a displayed-form scan so `_private`/numeric components
+  -- round-trip. Mutually exclusive with `--module`; `--exclude` doesn't
+  -- apply (the seed list is already explicit).
+  let constsSeeds ← Ix.Cli.ConstsFile.gather p
+  if !constsSeeds.isEmpty && (p.flag? "module").isSome then
+    p.printError "error: --consts/--consts-file and --module are mutually exclusive"
+    return 1
+  let constList ←
+    if !constsSeeds.isEmpty then do
+      let mut seeds : List Lean.Name := []
+      let mut missing : List String := []
+      for n in constsSeeds do
+        let name := n.toName
+        if leanEnv.constants.contains name then
+          seeds := name :: seeds
+        else
+          match leanEnv.constants.toList.find? (fun (m, _) => toString m == n) with
+          | some (m, _) => seeds := m :: seeds
+          | none => missing := n :: missing
+      if !missing.isEmpty then
+        p.printError s!"error: no constant(s) named {missing} in the environment"
+        return 1
+      IO.println s!"[compile] consts: {seeds.length} seed constant(s)"
+      let closed := collectDeps leanEnv seeds
+      IO.println s!"[compile] consts: {closed.length} constants after transitive-dep closure"
+      pure closed
+    else match p.flag? "module" with
     | none =>
       if excludeSet.isEmpty then pure leanEnv.constants.toList
       else
@@ -124,6 +144,8 @@ def compileCmd : Cli.Cmd := `[Cli|
 
   FLAGS:
     out            : String; "Output path for serialized Ixon.Env bytes; defaults to the lowercased input file stem with `.ixe` (e.g. CompileMathlib.lean -> compilemathlib.ixe)"
+    consts         : String; "Comma-separated EXACT constant names to compile (transitive deps pulled in automatically) instead of the whole import env — e.g. `Nat.add_comm`. Same flag/shape as `ix check --consts`. Mutually exclusive with --module; --exclude does not apply."
+    "consts-file"  : String; "Additionally read seed constant names from a file (one per line; `#` comments and blank lines ignored). Unions with --consts."
     module         : String; "Comma-separated module-name prefixes to filter on (e.g. 'Tests.Ix.Kernel.TutorialDefs,Tests.Ix.Kernel.NatReduction'). Match is against the SOURCE MODULE a constant came from (via `Lean.Environment.getModuleIdxFor?`), not the constant's own name — so macro-emitted decls that register under unqualified names still get caught when their host module's name matches. Transitive deps are pulled in automatically."
     exclude        : String; "Comma-separated exact Lean.Name(s) to strip from the seed set. Excluded names that are still referenced by another seed will reappear via the transitive-dep closure."
     "exclude-file" : String; "Path to a file with one Lean.Name per line to strip from the seed set. Same semantics as --exclude; same line format as `ix check --consts-file`."
diff --git a/Ix/Cli/ConstsFile.lean b/Ix/Cli/ConstsFile.lean
new file mode 100644
index 00000000..5083348b
--- /dev/null
+++ b/Ix/Cli/ConstsFile.lean
@@ -0,0 +1,61 @@
+/-
+  Shared parsing for constant-name inputs (`--consts` comma-lists and
+  `--consts-file` files) across every CLI that takes them: `ix check-rs`,
+  `ix compile` (seeds and `--exclude-file`), `ix shard extract`, and `bench-typecheck`.
+
+  One grammar everywhere: one name per line, everything from a `#` to end of
+  line is a comment (whole-line or inline), blank lines dropped. `#` never
+  appears in a Lean name, so splitting on it is safe. The zkVM hosts'
+  `--consts-file` (Rust `collect_consts`) parses the same grammar, so a single
+  names file drives all backends identically.
+
+  Names stay RAW strings here — resolution differs per caller (`toName` for
+  meta-mode seeds, string-match against the env's `named` map for the anon /
+  zkVM-style paths, where a `toName` round-trip could mangle numeric or
+  private components).
+-/
+module
+public import Cli
+
+public section
+
+namespace Ix.Cli.ConstsFile
+
+/-- Parse names-file contents: one name per line, `#`-to-EOL comments,
+    blank lines dropped. -/
+def parseLines (contents : String) : Array String :=
+  (contents.splitOn "\n").filterMap (fun line =>
+    let s := ((line.splitOn "#").head?.getD "").trimAscii
+    if s.isEmpty then none else some s.toString) |>.toArray
+
+/-- Read and parse a names file. -/
+def read (path : String) : IO (Array String) :=
+  parseLines <$> IO.FS.readFile path
+
+/-- Split a `--consts`-style comma-list into trimmed, non-empty names. -/
+def parseCommaList (arg : String) : Array String :=
+  (arg.splitOn ",").filterMap (fun s =>
+    let t := s.trimAscii
+    if t.isEmpty then none else some t.toString) |>.toArray
+
+/-- Union of a parsed `--consts` comma-list flag and a `--consts-file` file
+    (both optional), deduped in first-seen order. -/
+def gather (p : Cli.Parsed)
+    (constsFlag : String := "consts") (fileFlag : String := "consts-file") :
+    IO (Array String) := do
+  let fromFlag : Array String :=
+    match p.flag? constsFlag with
+    | some f => parseCommaList (f.as! String)
+    | none => #[]
+  let fromFile : Array String ←
+    match p.flag? fileFlag with
+    | some f => read (f.as! String)
+    | none => pure #[]
+  -- Linear-scan dedupe: name lists are tens of entries, not thousands.
+  let mut acc : Array String := #[]
+  for n in fromFlag ++ fromFile do
+    if !acc.contains n then
+      acc := acc.push n
+  return acc
+
+end Ix.Cli.ConstsFile
diff --git a/Ix/Cli/ShardCmd.lean b/Ix/Cli/ShardCmd.lean
index 002e9cd7..8bdee963 100644
--- a/Ix/Cli/ShardCmd.lean
+++ b/Ix/Cli/ShardCmd.lean
@@ -14,10 +14,19 @@
   manifest and prints a what-if report (per-shard cost + total cross-shard
   ingress). The partitioner is self-contained — no external graph-library
   dependency.
+
+  `ix shard extract <path.ixe> --consts <n1,n2,…>`: the pipeline's scoping
+  step — extract the named constants' dependency closure from a serialized
+  env into a standalone `.ixe`, without recompiling from source. The output
+  carries the closure's genuine constant bytes, blobs, and reducibility
+  hints, plus each closure constant's name→address entry, so it composes
+  with everything that consumes a `.ixe` (`ix profile` → `ix shard`,
+  `ix check-rs --consts`, the zkVM hosts, `bench-typecheck`).
 -/
 module
 public import Cli
 public import Ix.KernelCheck
+public import Ix.Cli.ConstsFile
 
 public section
 
@@ -25,6 +34,43 @@ open Ix.KernelCheck
 
 namespace Ix.Cli.ShardCmd
 
+def runShardExtractCmd (p : Cli.Parsed) : IO UInt32 := do
+  let some pathArg := p.positionalArg? "path"
+    | p.printError "error: must specify <path> to a .ixe file"
+      return 1
+  let envPath := pathArg.as! String
+  let names ← Ix.Cli.ConstsFile.gather p
+  if names.isEmpty then
+    p.printError "error: pass at least one name via --consts or --consts-file"
+    return 1
+  let outPath : String :=
+    match p.flag? "out" with
+    | some flag => flag.as! String
+    -- Default output is a bare `<slug>.ixe` filename built from the first name
+    -- (not placed beside `path`): `--consts Nat.add_comm` → `nat_add_comm.ixe`.
+    | none =>
+      let slug := names[0]!.map fun c =>
+        if c.isAlphanum then c.toLower else '_'
+      s!"{slug}.ixe"
+  let quiet := !(p.flag? "verbose" |>.isSome)
+  rsEnvExtractFFI envPath names outPath quiet
+  IO.println s!"[extract] wrote {outPath} ({names.size} root name(s))"
+  return 0
+
+def shardExtractCmd : Cli.Cmd := `[Cli|
+  "extract" VIA runShardExtractCmd;
+  "Extract named constants + their dependency closure from a `.ixe` into a standalone `.ixe`"
+
+  FLAGS:
+    consts        : String; "Comma-separated EXACT constant names (displayed form) to extract, e.g. `Nat.add_comm,String.append`. Same flag/shape as `ix check-rs --consts`. A mutual-block member extracts its whole block."
+    "consts-file" : String; "Additionally read names from a file (one per line; `#` comments and blank lines ignored). Unions with --consts."
+    out           : String; "Output `.ixe` path. Defaults to a slug of the first name (e.g. `nat_add_comm.ixe`)."
+    verbose;                "Print extraction details to stderr."
+
+  ARGS:
+    path : String; "Path to the source `.ixe` (e.g. from `ix compile`)."
+]
+
 def runShardCmd (p : Cli.Parsed) : IO UInt32 := do
   let some pathArg := p.positionalArg? "path"
     | p.printError "error: must specify <path> to a .ixprof file"
@@ -87,6 +133,9 @@ def shardCmd : Cli.Cmd := `[Cli|
 
   ARGS:
     path : String; "Path to a .ixprof produced by `ix profile`"
+
+  SUBCOMMANDS:
+    shardExtractCmd
 ]
 
 end
diff --git a/Ix/KernelCheck.lean b/Ix/KernelCheck.lean
index f32a6fab..38c5037b 100644
--- a/Ix/KernelCheck.lean
+++ b/Ix/KernelCheck.lean
@@ -142,6 +142,46 @@ opaque rsCheckAnonFFI :
     @& String →                          -- fail-out path ("" = none)
     IO (Array (String × Option CheckError))
 
+/-- FFI: anon-mode type-check of named constants with (by default) their full
+    dependency closures — the same mode and scope as the zkVM hosts' `--consts`
+    execute path, so an out-of-circuit run is directly comparable to the
+    in-circuit one. The `Bool` after the names is `skip-deps`: `true` checks
+    only each name's own work item (subject-only; deps trusted), mirroring
+    `zisk-host --skip-deps`.
+
+    Names are the constants' displayed forms (e.g. `"Nat.add_comm"`,
+    `"_private.Init.….instRxcHasSize_eq"`), resolved through the env's `named`
+    metadata by string match — the same resolution the zkVM hosts use — after
+    which the check runs on the anon view (the kernel never sees names). A
+    member of a mutual block selects the whole block's work item. Multiple
+    names union their closures into one check set.
+
+    Returns `(hex_address, Option CheckError)` pairs, one per checked target,
+    exactly like `rsCheckAnonFFI`. Errors (rather than returning) when a name
+    doesn't resolve, so a typo can't silently produce an empty check. -/
+@[extern "rs_kernel_check_anon_consts"]
+opaque rsCheckAnonConstsFFI :
+    @& String →                          -- .ixe path
+    @& Array String →                    -- constant names (displayed form)
+    @& Bool →                            -- skip-deps (subject-only)
+    @& Bool →                            -- quiet
+    @& String →                          -- fail-out path ("" = none)
+    IO (Array (String × Option CheckError))
+
+/-- FFI: extract the named constants' dependency closure from a serialized
+    env into a standalone `.ixe` — genuine constant bytes, blobs, and
+    reducibility hints, plus the closure constants' Named entries so names
+    still resolve — without recompiling from source. Names resolve like
+    `rsCheckAnonConstsFFI` (displayed form); a mutual-block member pulls its
+    whole block. Errors on an unresolvable name. -/
+@[extern "rs_env_extract"]
+opaque rsEnvExtractFFI :
+    @& String →                          -- source .ixe path
+    @& Array String →                    -- constant names (displayed form)
+    @& String →                          -- output .ixe path
+    @& Bool →                            -- quiet
+    IO Unit
+
 /-- FFI: profile a `.ixe` out of circuit, writing a `.ixprof` sidecar with
     per-block heartbeats + the delta-unfold graph (the sharding cost model,
     see `plans/sharding.md`). Runs the anon kernel over every checkable target.
diff --git a/Ix/TracingTexray.lean b/Ix/TracingTexray.lean
index c0c4c064..2aeac07f 100644
--- a/Ix/TracingTexray.lean
+++ b/Ix/TracingTexray.lean
@@ -41,6 +41,32 @@ private opaque initWith
 def init (s : Settings := {}) : IO Unit :=
   initWith s.namePrefixes s.trackRam s.streaming
 
+@[extern "rs_texray_start_sampler"]
+private opaque startSamplerFFI (intervalMs : UInt64) : IO Unit
+
+/-- Start the process-tree RSS sampler (idempotent). Unlike the per-span
+    `/proc/self/status` reads, this sums RSS across this process and all its
+    children, so [`peakTreeRssBytes`] captures memory resident in helper
+    processes (e.g. a zkVM host's services). `intervalMs` is the sample period. -/
+def startSampler (intervalMs : UInt64 := 50) : IO Unit :=
+  startSamplerFFI intervalMs
+
+@[extern "rs_texray_peak_tree_rss_bytes"]
+private opaque peakTreeRssBytesFFI : IO UInt64
+
+/-- Peak resident-set size in bytes across this process and its children. `0`
+    until [`startSampler`] has run or on non-Linux platforms. -/
+def peakTreeRssBytes : IO Nat := do
+  return (← peakTreeRssBytesFFI).toNat
+
+@[extern "rs_texray_json_sink"]
+private opaque jsonSinkFFI (path : @& String) : IO Unit
+
+/-- Direct the per-span timing sink to `path` (JSON Lines). Pair with `init`
+    (streaming) so the examined `aiur/*` / `stark/*` spans are recorded. -/
+def jsonSink (path : String) : IO Unit :=
+  jsonSinkFFI path
+
 end TracingTexray
 
 end
diff --git a/README.md b/README.md
index d17190b0..a975ad0d 100644
--- a/README.md
+++ b/README.md
@@ -231,7 +231,7 @@ Non-Nix users: install the SP1 toolchain manually per the
    ```
 
    For a larger, realistic env compile one of the `Benchmarks/Compile`
-   targets, then scope proving to a single constant with `--constant`
+   targets, then scope proving to one or more constants with `--consts`
    (step 2):
 
    ```
@@ -250,7 +250,7 @@ Non-Nix users: install the SP1 toolchain manually per the
    # Prove a single constant out of a larger env (Anon-only): the host resolves
    # the name and ships only that constant's closure sub-env. Full-closure by
    # default; add --skip-deps for a subject-only check (deps trusted).
-   WITHOUT_VK_VERIFICATION=1 RUST_LOG=info cargo run --release -- --ixe ../../init.ixe --constant Nat.add_comm
+   WITHOUT_VK_VERIFICATION=1 RUST_LOG=info cargo run --release -- --ixe ../../init.ixe --consts Nat.add_comm
    ```
 
    With no `--ixe`, the host runs against an empty `Ixon.Env`.
@@ -354,19 +354,20 @@ Non-Nix users: install Zisk manually per the
    RUST_LOG=info cargo run --release -- --verify-constraints --ixe ../minimal.ixe
    # Generate and verify a VadcopFinal proof of the same typecheck (CPU)
    RUST_LOG=info cargo run --release -- --ixe ../minimal.ixe
-   # Check a single named constant out of a larger env. The host resolves the
+   # Check one or more named constants out of a larger env. The host resolves each
    # name and ships only its closure sub-env (lazy fault-in, no whole-env load).
    # By default this is the FULL-CLOSURE typecheck — the constant and its whole
-   # dependency closure (matching the Aiur `bench-typecheck --constant`).
+   # dependency closure (matching the Aiur `bench-typecheck --consts <names>`).
    # Composes with --execute (cycles only) and plain prove.
-   RUST_LOG=info cargo run --release -- --execute --ixe ../init.ixe --constant Nat.add_comm
-   RUST_LOG=info cargo run --release -- --ixe ../init.ixe --constant Nat.add_comm
+   RUST_LOG=info cargo run --release -- --execute --ixe ../init.ixe --consts Nat.add_comm
+   RUST_LOG=info cargo run --release -- --ixe ../init.ixe --consts Nat.add_comm,Nat.succ
    # Add --skip-deps for a subject-only check (deps trusted, not re-checked):
-   RUST_LOG=info cargo run --release -- --execute --ixe ../init.ixe --constant Vector.extract_append --skip-deps
+   RUST_LOG=info cargo run --release -- --execute --ixe ../init.ixe --consts Vector.extract_append --skip-deps
    ```
 
-   `--constant` / `--skip-deps` are the same flags the Aiur `bench-typecheck`
-   uses, so the two backends share one vocabulary. `--skip-deps` trusts
+   `--consts` / `--skip-deps` are the same flags `ix check`, `sp1-host`, and the
+   Aiur `bench-typecheck` use, so all four share one vocabulary. `--skip-deps`
+   trusts
    dependencies rather than re-checking them, so it is far cheaper than the
    full-closure default — reserve it for constants too expensive to
    full-closure-check that also can't be sharded (e.g. `Vector.extract_append`
@@ -477,41 +478,24 @@ Non-Nix users: install Zisk manually per the
    [`DEFAULT_MEMORY_LIMIT`](https://github.com/succinctlabs/sp1/blob/v6.2.0/crates/core/executor/src/opts.rs#L25),
    configurable via `MEMORY_LIMIT` env var up to a ~1 TB JIT ceiling
    [`MAX_JIT_LOG_ADDR`](https://github.com/succinctlabs/sp1/blob/v6.2.0/crates/primitives/src/consts.rs#L11)),
-   or scope to a single constant with `--constant <name>` (all backends),
-   which resolves the name and ships only that constant's closure sub-env to the
-   guest. By default it re-checks the full dependency closure; add `--skip-deps`
-   to check it **subject-only** (dependencies trusted and lazily faulted in, not
+   or scope to one or more constants with `--consts <n1,n2,…>` (all backends),
+   which resolves each name and ships only that constant's closure sub-env to the
+   guest. By default each one's full dependency closure is re-checked; add `--skip-deps`
+   to check them **subject-only** (dependencies trusted and lazily faulted in, not
    re-typechecked) — so individual constants of a large env still fit the cap,
    even ones whose full-closure typecheck would not. To prove a large env in
    full under Zisk, shard it (see
    *Sharding large environments* below): each shard ships only its own closure
    sub-env, so the pieces fit the cap even when the whole env does not.
 
-   **Host RAM cap (`--max-witness-stored`).** Distinct from the in-guest
-   heap cap above, the prover side (Zisk's `proofman`) holds in-flight
-   witness traces in host RAM during `CALCULATING_CONTRIBUTIONS`. Peak
-   host RAM per shard ≈ `fixed-overhead + N × avg-witness-size`, where
-   `N` is the `max_witness_stored` setting. With the Blake3f precompile the
-   Ix kernel typecheck workload measures roughly `40 GB + N × 16 GB` on
-   typical 200–300 kB anon-byte shards — e.g. `N = 10` peaks near 200 GB
-   (a `--shard-bytes 250000 --max-witness-stored 10` mergesort run completes
-   under a 200 GiB guard without tripping it). An earlier pre-Blake3f figure
-   of ~25 GB per witness is stale; the precompile shrank the witness.
-
-   The `zisk-host` CLI defaults to `--max-witness-stored 5` (Zisk's
-   built-in default is 10, tuned for larger-memory boxes). Override per
-   machine:
-
-   | Host RAM | `--max-witness-stored` | Notes                                                  |
-   | -------- | ---------------------- | ------------------------------------------------------ |
-   | ≤ 128 GB | `3`                    | Override down; consider smaller shards too             |
-   | 256 GB   | `5` (project default)  | Comfortable margin on the typical setup                |
-   | 512 GB   | `10` (Zisk default)    | Override up for maximum prover parallelism             |
-   | ≥ 1 TB   | `10` (Zisk default)    | Override up; default is conservative for this workload |
-
-   Lowering the cap roughly linearly bounds peak RAM but throttles
-   prover parallelism (~10–30 % slower in practice). Raise it if your
-   machine has more RAM headroom; lower it if you OOM during
+   **Host RAM during proving.** Distinct from the in-guest heap cap above,
+   the prover side (Zisk's `proofman`) holds in-flight witness traces in host
+   RAM during `CALCULATING_CONTRIBUTIONS`. The number of resident witnesses
+   (Zisk's `max_witness_stored`) is left at Zisk's built-in default of 10:
+   we measured that lowering it does not materially reduce peak host RAM or
+   prove time for the Ix kernel typecheck workload, so it is not exposed as a
+   knob. Peak host RAM per shard is instead governed by shard size — prove
+   smaller shards (`--shard-bytes`) if you OOM during
    `CALCULATING_CONTRIBUTIONS`. Not relevant for `--execute` or
    `--verify-constraints` modes.
 
diff --git a/crates/aiur/src/execute.rs b/crates/aiur/src/execute.rs
index 21f24f23..561f2ee3 100644
--- a/crates/aiur/src/execute.rs
+++ b/crates/aiur/src/execute.rs
@@ -846,6 +846,61 @@ pub fn unconstrained_big_uint_div_mod_helper(
   Ok((q_ptr, r_ptr))
 }
 
+/// Read-only twin of `unconstrained_big_uint_div_mod_helper` for trace
+/// population: recompute `(q, r)` and resolve the list-head pointers the
+/// execution already recorded in `memory[10]` — every node was built there
+/// during execution, so each key must be present.
+pub fn find_unconstrained_big_uint_div_mod(
+  a_ptr: G,
+  b_ptr: G,
+  memory: &FxIndexMap<usize, QueryMap>,
+) -> Result<(G, G), String> {
+  let a_limbs = read_klimbs_u64(memory, a_ptr)?;
+  let b_limbs = read_klimbs_u64(memory, b_ptr)?;
+  let a_big = klimbs_u64_to_biguint(&a_limbs);
+  let b_big = klimbs_u64_to_biguint(&b_limbs);
+  let (q_big, r_big) = if b_big == num_bigint::BigUint::ZERO {
+    (num_bigint::BigUint::ZERO, a_big.clone())
+  } else {
+    (&a_big / &b_big, &a_big % &b_big)
+  };
+  let q_ptr = find_klimbs_u64(memory, &biguint_to_klimbs_u64(&q_big))?;
+  let r_ptr = find_klimbs_u64(memory, &biguint_to_klimbs_u64(&r_big))?;
+  Ok((q_ptr, r_ptr))
+}
+
+/// Read-only twin of `build_klimbs_u64`: resolve the pointer of each
+/// (already-recorded) list node without inserting.
+fn find_klimbs_u64(
+  memory: &FxIndexMap<usize, QueryMap>,
+  limbs: &[u64],
+) -> Result<G, String> {
+  let queries = memory.get(&10).ok_or_else(|| {
+    "memory[10] channel not registered (no List<U64> in program?)".to_string()
+  })?;
+  let nil_key: Vec<G> =
+    std::iter::once(G::ONE).chain((0..9).map(|_| G::ZERO)).collect();
+  let mut tail_ptr = queries
+    .get(&nil_key)
+    .ok_or_else(|| "List<U64> Nil node not recorded".to_string())?
+    .output[0];
+  for limb in limbs.iter().rev() {
+    let mut key: Vec<G> = Vec::with_capacity(10);
+    key.push(G::ZERO); // Cons tag (first variant of ListNode‹U64›)
+    for b in &limb.to_le_bytes() {
+      key.push(G::from_u8(*b));
+    }
+    key.push(tail_ptr);
+    tail_ptr = queries
+      .get(&key)
+      .ok_or_else(|| {
+        format!("List<U64> Cons node for limb {limb} not recorded")
+      })?
+      .output[0];
+  }
+  Ok(tail_ptr)
+}
+
 /// Walk a `List<U64>` chain from `head_ptr` in `memory[10]`, returning the
 /// u64 limbs in head-first order. Each memory[10] entry is the standard Aiur
 /// tagged-enum layout: `[tag, byte0..byte7, next_ptr]`. `tag == 0` = Nil
diff --git a/crates/aiur/src/trace.rs b/crates/aiur/src/trace.rs
index e696e688..5b45d06c 100644
--- a/crates/aiur/src/trace.rs
+++ b/crates/aiur/src/trace.rs
@@ -13,7 +13,9 @@ use rayon::{
 use crate::{
   FxIndexMap, G,
   bytecode::{Block, Ctrl, Function, Op, Toplevel},
-  execute::{IOBuffer, IOKeyInfo, QueryRecord},
+  execute::{
+    IOBuffer, IOKeyInfo, QueryRecord, find_unconstrained_big_uint_div_mod,
+  },
   function_channel,
   gadgets::{bytes1::Bytes1, bytes2::Bytes2},
   memory::Memory,
@@ -558,10 +560,26 @@ impl Op {
           ),
         );
       },
+      Op::UnconstrainedBigUintDivMod(a, b) => {
+        // Mirrors the execute arm and the two auxiliary columns the
+        // constraints allocate: recompute `(q, r)` and resolve the head
+        // pointers execution recorded in memory[10]. Skipping the two map
+        // pushes would shift every later `ValIdx` (and witness column) in
+        // the block.
+        let (q_ptr, r_ptr) = find_unconstrained_big_uint_div_mod(
+          map[*a].0,
+          map[*b].0,
+          &context.query_record.memory_queries,
+        )
+        .expect("BigUint div-mod result not recorded");
+        for f in [q_ptr, r_ptr] {
+          map.push((f, 1));
+          slice.push_auxiliary(index, f);
+        }
+      },
       Op::AssertEq(..)
       | Op::IOSetInfo(..)
       | Op::IOWrite(..)
-      | Op::UnconstrainedBigUintDivMod(..)
       | Op::Debug(..) => {},
     }
   }
diff --git a/crates/ffi/src/kernel.rs b/crates/ffi/src/kernel.rs
index a386d6a2..d4791e46 100644
--- a/crates/ffi/src/kernel.rs
+++ b/crates/ffi/src/kernel.rs
@@ -1318,12 +1318,14 @@ enum AnonWorkItem {
 /// [`ix_kernel::anon_work::build_anon_work`] (shared with the
 /// SP1/Zisk guests) and layers the FFI's per-target result-slot
 /// bookkeeping on top.
-fn build_anon_work(
-  env: &IxonEnv,
-) -> Result<(Vec<AnonWorkItem>, Vec<Address>), String> {
+///
+/// Assigns result slots to kernel work items: the indexing step shared by the
+/// whole-env check (every item) and the per-constant closure check (a subset).
+fn index_anon_work(
+  kernel_work: Vec<ix_kernel::anon_work::AnonWorkItem>,
+) -> (Vec<AnonWorkItem>, Vec<Address>) {
   use ix_kernel::anon_work::AnonWorkItem as KItem;
 
-  let kernel_work = ix_kernel::anon_work::build_anon_work(env)?;
   let mut work: Vec<AnonWorkItem> = Vec::with_capacity(kernel_work.len());
   let mut addrs: Vec<Address> = Vec::new();
   for item in kernel_work {
@@ -1341,7 +1343,13 @@ fn build_anon_work(
       },
     }
   }
-  Ok((work, addrs))
+  (work, addrs)
+}
+
+fn build_anon_work(
+  env: &IxonEnv,
+) -> Result<(Vec<AnonWorkItem>, Vec<Address>), String> {
+  Ok(index_anon_work(ix_kernel::anon_work::build_anon_work(env)?))
 }
 
 #[allow(clippy::needless_pass_by_value)]
@@ -1623,6 +1631,362 @@ pub extern "C" fn rs_kernel_check_anon(
   build_anon_result_array(&addrs_for_return, &results)
 }
 
+/// FFI: anon-mode type-check of named constants with (by default) their full
+/// dependency closures — the same mode and scope as the zkVM hosts' `--consts`
+/// execute path, so an out-of-circuit run is directly comparable to the
+/// in-circuit one. `skip_deps` restricts the check to each name's own work
+/// item (subject-only; deps trusted), mirroring `zisk-host --skip-deps`.
+///
+/// Names resolve through the env's `named` metadata by displayed form (the
+/// same string match the zkVM hosts use), then the metadata is dropped and
+/// the check runs on the anon view — the kernel never sees names. A member
+/// of a mutual block selects the whole block's work item (blocks check
+/// atomically). Multiple names union their closures into one check set.
+///
+/// Returns `(hex_address, Option CheckError)` pairs, one per checked target,
+/// exactly like `rs_kernel_check_anon`.
+#[unsafe(no_mangle)]
+pub extern "C" fn rs_kernel_check_anon_consts(
+  env_path: LeanString<LeanBorrowed<'_>>,
+  names: LeanArray<LeanBorrowed<'_>>,
+  skip_deps: LeanBool<LeanBorrowed<'_>>,
+  quiet: LeanBool<LeanBorrowed<'_>>,
+  fail_out: LeanString<LeanBorrowed<'_>>,
+) -> LeanIOResult<LeanOwned> {
+  use ix_kernel::anon_work::{
+    AnonWorkItem as KItem, block_of_addr, closure_addrs, work_block_addr,
+  };
+
+  let total_start = Instant::now();
+  let quiet = quiet.to_bool();
+  let skip_deps = skip_deps.to_bool();
+  let path = env_path.to_string();
+  let fail_out_path = fail_out.to_string();
+  let fail_out_path =
+    if fail_out_path.is_empty() { None } else { Some(fail_out_path) };
+  let names_vec: Vec<String> = names.map(|obj| obj.as_string().to_string());
+  if names_vec.is_empty() {
+    return LeanIOResult::error_string(
+      "rs_kernel_check_anon_consts: no constant names given",
+    );
+  }
+
+  let t0 = Instant::now();
+  let bytes = match std::fs::read(&path) {
+    Ok(bytes) => bytes,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_kernel_check_anon_consts: failed to read {path}: {e}"
+      ));
+    },
+  };
+
+  // Resolve displayed names → addresses through the full env's `named`
+  // metadata (the anon view discards it); the full env drops right after.
+  let resolved: Vec<Address> = {
+    let mut slice: &[u8] = &bytes;
+    let full = match IxonEnv::get(&mut slice) {
+      Ok(env) => env,
+      Err(e) => {
+        return LeanIOResult::error_string(&format!(
+          "rs_kernel_check_anon_consts: failed to deserialize {path}: {e}"
+        ));
+      },
+    };
+    let by_name: FxHashMap<String, Address> = full
+      .named
+      .iter()
+      .map(|e| (e.key().to_string(), e.value().addr.clone()))
+      .collect();
+    let mut addrs = Vec::with_capacity(names_vec.len());
+    let mut missing: Vec<&str> = Vec::new();
+    for n in &names_vec {
+      match by_name.get(n) {
+        Some(a) => addrs.push(a.clone()),
+        None => missing.push(n),
+      }
+    }
+    if !missing.is_empty() {
+      return LeanIOResult::error_string(&format!(
+        "rs_kernel_check_anon_consts: no constant(s) named [{}] in {path}",
+        missing.join(", ")
+      ));
+    }
+    addrs
+  };
+  eprintln!(
+    "[rs_kernel_check_anon_consts] resolve:    {:>8.1?} ({} name(s))",
+    t0.elapsed(),
+    resolved.len()
+  );
+
+  let t1 = Instant::now();
+  let mut slice: &[u8] = &bytes;
+  let ixon_env = match IxonEnv::get_anon(&mut slice) {
+    Ok(env) => env,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_kernel_check_anon_consts: failed to deserialize (anon) {path}: {e}"
+      ));
+    },
+  };
+  drop(bytes);
+  eprintln!(
+    "[rs_kernel_check_anon_consts] anon parse: {:>8.1?} ({} consts)",
+    t1.elapsed(),
+    ixon_env.const_count(),
+  );
+
+  // Map each seed to the work item whose ingress block owns it (standalone →
+  // itself; a mutual-block member → the whole block, checked atomically).
+  let t2 = Instant::now();
+  let kernel_work = match ix_kernel::anon_work::build_anon_work(&ixon_env) {
+    Ok(work) => work,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_kernel_check_anon_consts: build_anon_work: {e}"
+      ));
+    },
+  };
+  let by_block: FxHashMap<Address, usize> = kernel_work
+    .iter()
+    .enumerate()
+    .map(|(i, w)| (work_block_addr(&ixon_env, w), i))
+    .collect();
+  let mut seed_items: Vec<usize> = Vec::new();
+  for addr in &resolved {
+    let block = block_of_addr(&ixon_env, addr);
+    match by_block.get(&block) {
+      Some(i) => {
+        if !seed_items.contains(i) {
+          seed_items.push(*i);
+        }
+      },
+      None => {
+        return LeanIOResult::error_string(&format!(
+          "rs_kernel_check_anon_consts: no work item covers {} (block {})",
+          addr.hex(),
+          block.hex()
+        ));
+      },
+    }
+  }
+
+  // The check set. Subject-only: the seeds' own items. Full-closure: every
+  // work item inside the seeds' dependency closure — the same set a zkVM
+  // guest enumerates from its closure sub-env (`build_sub_env` +
+  // `build_anon_work`), computed here directly from `closure_addrs` without
+  // serializing a sub-env.
+  let selected: Vec<KItem> = if skip_deps {
+    seed_items.iter().map(|&i| kernel_work[i].clone()).collect()
+  } else {
+    let roots: Vec<Address> = seed_items
+      .iter()
+      .flat_map(|&i| kernel_work[i].proven_targets())
+      .collect();
+    let closure = closure_addrs(&ixon_env, &roots);
+    kernel_work
+      .into_iter()
+      .filter(|w| match w {
+        KItem::Standalone { addr } => closure.contains(addr),
+        KItem::Block { block_addr, .. } => closure.contains(block_addr),
+      })
+      .collect()
+  };
+  let (work, addrs) = index_anon_work(selected);
+  eprintln!(
+    "[rs_kernel_check_anon_consts] build work: {:>8.1?} ({} items, {} targets, {})",
+    t2.elapsed(),
+    work.len(),
+    addrs.len(),
+    if skip_deps { "subject-only" } else { "full-closure" },
+  );
+
+  let failure_log: Option<Arc<FailureLog>> = match fail_out_path.as_deref() {
+    None => None,
+    Some(out_path) => match FailureLog::open(out_path, &path, addrs.len()) {
+      Ok(log) => {
+        eprintln!(
+          "[rs_kernel_check_anon_consts] streaming failures to {out_path}"
+        );
+        Some(Arc::new(log))
+      },
+      Err(e) => {
+        return LeanIOResult::error_string(&format!(
+          "rs_kernel_check_anon_consts: failed to open fail-out file {out_path}: {e}"
+        ));
+      },
+    },
+  };
+
+  let total = addrs.len();
+  let addrs_for_return = addrs.clone();
+  let t3 = Instant::now();
+  let ixon_env_arc = Arc::new(ixon_env);
+  let results = match run_anon_checks_parallel(
+    ixon_env_arc,
+    work,
+    addrs,
+    quiet,
+    failure_log.clone(),
+  ) {
+    Ok(r) => r,
+    Err(msg) => {
+      if let Some(log) = failure_log.as_ref() {
+        log.finalize();
+      }
+      return build_uniform_error(total, &format!("[thread] {msg}"));
+    },
+  };
+
+  let passed = results.iter().filter(|r| r.is_ok()).count();
+  let failed = results.iter().filter(|r| r.is_err()).count();
+  eprintln!(
+    "[rs_kernel_check_anon_consts] {passed}/{total} passed, {failed} failed ({:.1?})",
+    t3.elapsed()
+  );
+  eprintln!(
+    "[rs_kernel_check_anon_consts] total:      {:>8.1?}",
+    total_start.elapsed()
+  );
+  if let Some(log) = failure_log.as_ref() {
+    log.finalize();
+    eprintln!(
+      "[rs_kernel_check_anon_consts] streamed {} failure(s) to fail-out",
+      log.count()
+    );
+  }
+
+  build_anon_result_array(&addrs_for_return, &results)
+}
+
+/// FFI: extract the named constants' dependency closure from a serialized
+/// env into a standalone `.ixe` — genuine constant bytes, blobs, and
+/// reducibility hints (via the anon view), plus every closure constant's
+/// Named entry (via the full view) so names still resolve downstream — all
+/// without recompiling from source. Each name extracts its covering work
+/// item (a mutual-block member pulls the whole block).
+#[unsafe(no_mangle)]
+pub extern "C" fn rs_env_extract(
+  env_path: LeanString<LeanBorrowed<'_>>,
+  names: LeanArray<LeanBorrowed<'_>>,
+  out_path: LeanString<LeanBorrowed<'_>>,
+  quiet: LeanBool<LeanBorrowed<'_>>,
+) -> LeanIOResult<LeanOwned> {
+  use ix_kernel::anon_work::{
+    block_of_addr, build_anon_work, build_sub_env_named, work_block_addr,
+  };
+
+  let quiet = quiet.to_bool();
+  let path = env_path.to_string();
+  let out = out_path.to_string();
+  let names_vec: Vec<String> = names.map(|obj| obj.as_string().to_string());
+  if names_vec.is_empty() {
+    return LeanIOResult::error_string(
+      "rs_env_extract: no constant names given",
+    );
+  }
+
+  let bytes = match std::fs::read(&path) {
+    Ok(bytes) => bytes,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_env_extract: failed to read {path}: {e}"
+      ));
+    },
+  };
+  let mut slice: &[u8] = &bytes;
+  let full = match IxonEnv::get(&mut slice) {
+    Ok(env) => env,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_env_extract: failed to deserialize {path}: {e}"
+      ));
+    },
+  };
+  // Resolve displayed names → addresses through the full env's `named`
+  // metadata (the anon view discards it).
+  let by_name: FxHashMap<String, Address> = full
+    .named
+    .iter()
+    .map(|e| (e.key().to_string(), e.value().addr.clone()))
+    .collect();
+  let mut resolved: Vec<Address> = Vec::with_capacity(names_vec.len());
+  let mut missing: Vec<&str> = Vec::new();
+  for n in &names_vec {
+    match by_name.get(n.as_str()) {
+      Some(a) => resolved.push(a.clone()),
+      None => missing.push(n),
+    }
+  }
+  if !missing.is_empty() {
+    return LeanIOResult::error_string(&format!(
+      "rs_env_extract: no constant(s) named [{}] in {path}",
+      missing.join(", ")
+    ));
+  }
+
+  let mut slice: &[u8] = &bytes;
+  let anon = match IxonEnv::get_anon(&mut slice) {
+    Ok(env) => env,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_env_extract: failed to deserialize (anon) {path}: {e}"
+      ));
+    },
+  };
+
+  // Roots: each name's covering work item's proven targets (standalone →
+  // itself; a mutual-block member → every sibling, checked atomically).
+  let work = match build_anon_work(&anon) {
+    Ok(work) => work,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!(
+        "rs_env_extract: build_anon_work: {e}"
+      ));
+    },
+  };
+  let by_block: FxHashMap<Address, usize> = work
+    .iter()
+    .enumerate()
+    .map(|(i, w)| (work_block_addr(&anon, w), i))
+    .collect();
+  let mut roots: Vec<Address> = Vec::new();
+  for addr in &resolved {
+    let block = block_of_addr(&anon, addr);
+    match by_block.get(&block) {
+      Some(&i) => roots.extend(work[i].proven_targets()),
+      None => {
+        return LeanIOResult::error_string(&format!(
+          "rs_env_extract: no work item covers block {}…",
+          &block.hex()[..16]
+        ));
+      },
+    }
+  }
+
+  let sub_bytes = match build_sub_env_named(&anon, &full, &roots) {
+    Ok(b) => b,
+    Err(e) => {
+      return LeanIOResult::error_string(&format!("rs_env_extract: {e}"));
+    },
+  };
+  if let Err(e) = std::fs::write(&out, &sub_bytes) {
+    return LeanIOResult::error_string(&format!(
+      "rs_env_extract: failed to write {out}: {e}"
+    ));
+  }
+  if !quiet {
+    eprintln!(
+      "[rs_env_extract] {} name(s) → {} ({} bytes) from {path}",
+      names_vec.len(),
+      out,
+      sub_bytes.len(),
+    );
+  }
+  LeanIOResult::ok(LeanOwned::box_usize(0))
+}
+
 // ===========================================================================
 // Sharding profiler: run the anon kernel out of circuit over a `.ixe`,
 // recording per-block heartbeats + the delta-unfold graph into a `.ixprof`.
diff --git a/crates/ffi/src/texray.rs b/crates/ffi/src/texray.rs
index 77d91b5b..326f23a0 100644
--- a/crates/ffi/src/texray.rs
+++ b/crates/ffi/src/texray.rs
@@ -59,3 +59,38 @@ extern "C" fn rs_texray_init(
   let _ = Registry::default().with(layer.with_filter(filter)).try_init();
   LeanIOResult::ok(LeanOwned::box_usize(0))
 }
+
+/// Start tracing-texray's process-tree RSS sampler (idempotent). `interval_ms`
+/// is the sampling period in milliseconds. Runs on a background daemon thread;
+/// [`rs_texray_peak_tree_rss_bytes`] reads back the high-water mark. Captures
+/// child-process memory (e.g. a zkVM host's helper processes) that a bare
+/// `/proc/self/status` read misses.
+#[unsafe(no_mangle)]
+extern "C" fn rs_texray_start_sampler(
+  interval_ms: u64,
+) -> LeanIOResult<LeanOwned> {
+  tracing_texray::rss_sampler::start(std::time::Duration::from_millis(
+    interval_ms,
+  ));
+  LeanIOResult::ok(LeanOwned::box_usize(0))
+}
+
+/// Peak resident-set size (bytes) across this process and its children, per
+/// the tree sampler. `0` off Linux or before [`rs_texray_start_sampler`] runs.
+#[unsafe(no_mangle)]
+extern "C" fn rs_texray_peak_tree_rss_bytes() -> LeanIOResult<LeanOwned> {
+  let bytes = tracing_texray::rss_sampler::peak_tree_rss_bytes();
+  LeanIOResult::ok(LeanOwned::box_u64(bytes))
+}
+
+/// Direct tracing-texray's per-span timing sink to `path` (one
+/// `{"span","seconds"}` JSON line per closed examined span). Combine with a
+/// `streaming`/examined subscriber so the prover's `aiur/*` + `stark/*` spans
+/// are recorded for the CI drill-down.
+#[unsafe(no_mangle)]
+extern "C" fn rs_texray_json_sink(
+  path: LeanString<LeanBorrowed<'_>>,
+) -> LeanIOResult<LeanOwned> {
+  let _ = tracing_texray::json_sink::to_file(&path.to_string());
+  LeanIOResult::ok(LeanOwned::box_usize(0))
+}
diff --git a/crates/kernel/src/anon_work.rs b/crates/kernel/src/anon_work.rs
index be0aae24..1f5be74d 100644
--- a/crates/kernel/src/anon_work.rs
+++ b/crates/kernel/src/anon_work.rs
@@ -279,6 +279,17 @@ pub fn build_sub_env(
   source: &IxonEnv,
   roots: &[Address],
 ) -> Result<Vec<u8>, String> {
+  let sub = sub_env_of(source, roots);
+  let mut buf = Vec::new();
+  sub.put(&mut buf).map_err(|e| format!("sub-env serialize: {e}"))?;
+  Ok(buf)
+}
+
+/// The in-memory closure sub-env behind [`build_sub_env`]: copy the BFS
+/// dependency closure of `roots` (genuine bytes + blobs + reducibility
+/// hints), no Named section.
+#[cfg(not(target_arch = "riscv64"))]
+fn sub_env_of(source: &IxonEnv, roots: &[Address]) -> IxonEnv {
   let closure = closure_addrs(source, roots);
   let mut sub = IxonEnv::new();
   for addr in &closure {
@@ -296,6 +307,55 @@ pub fn build_sub_env(
       sub.anon_hints.insert(addr.clone(), *h);
     }
   }
+  sub
+}
+
+/// [`build_sub_env`] plus a name→address entry for every closure constant
+/// named in the FULL view of the same env — a standalone `.ixe` whose names
+/// still resolve (for `--consts`-style tools), extracted without recompiling
+/// from source.
+///
+/// METADATA IS DROPPED: each copied entry is `Named::with_addr` (empty
+/// `ConstantMeta`), because real metadata references name addresses
+/// throughout its tree and carrying it would require the full env's
+/// hash-consed name index. The extract serves the ANON pipeline
+/// (`check-rs --anon`, the zkVM hosts, `ix profile`/`ix shard`,
+/// `bench-typecheck`), where metadata is never read and reducibility hints
+/// travel in the `anon_hints` section instead. Meta-mode tools need the
+/// source env.
+#[cfg(not(target_arch = "riscv64"))]
+pub fn build_sub_env_named(
+  source: &IxonEnv,
+  full: &IxonEnv,
+  roots: &[Address],
+) -> Result<Vec<u8>, String> {
+  use ix_common::env::NameData;
+
+  let sub = sub_env_of(source, roots);
+  // The Named section serializes keys as name HASHES resolved through the
+  // names section, so each key's component chain must be interned too.
+  fn intern_chain(sub: &IxonEnv, name: &ix_common::env::Name) {
+    let addr = Address::from_blake3_hash(*name.get_hash());
+    if sub.get_name(&addr).is_some() {
+      return;
+    }
+    match name.as_data() {
+      NameData::Anonymous(_) => {},
+      NameData::Str(parent, _, _) | NameData::Num(parent, _, _) => {
+        intern_chain(sub, parent);
+      },
+    }
+    sub.store_name(addr, name.clone());
+  }
+  for e in full.named.iter() {
+    if sub.get_const(&e.value().addr).is_some() {
+      intern_chain(&sub, e.key());
+      sub.register_name(
+        e.key().clone(),
+        ixon::env::Named::with_addr(e.value().addr.clone()),
+      );
+    }
+  }
   let mut buf = Vec::new();
   sub.put(&mut buf).map_err(|e| format!("sub-env serialize: {e}"))?;
   Ok(buf)
diff --git a/crates/kernel/src/shard.rs b/crates/kernel/src/shard.rs
index 8e312f6c..e594ff09 100644
--- a/crates/kernel/src/shard.rs
+++ b/crates/kernel/src/shard.rs
@@ -1653,8 +1653,8 @@ pub fn block_step_cost(b: &BlockEntry) -> u64 {
 /// size shards straight from `MemTotal` without ever picking a budget. Inverts
 /// the measured single-leaf prover model on this setup
 /// (`peak_RAM_GiB ≈ 50 + 33 × steps_billions`, measured by a guarded 7-shard GPU
-/// prove sweep over 0.27–3.79e9-step Init shards, R²=0.99, `--max-witness-stored
-/// 5`) at [`RAM_USABLE_FRAC`] of RAM (reserving the rest for the OS, cross-shard
+/// prove sweep over 0.27–3.79e9-step Init shards, R²=0.99) at
+/// [`RAM_USABLE_FRAC`] of RAM (reserving the rest for the OS, cross-shard
 /// re-ingress, and run-to-run variance). Returns 0 when the box can't even hold the ~50 GiB
 /// prover base (nothing will prove). Approximate by design — pair with
 /// [`partition_for_cycle_cap`] to get N. The earlier `45 + 32` model was
@@ -1662,8 +1662,7 @@ pub fn block_step_cost(b: &BlockEntry) -> u64 {
 /// target actually used ~225 GB).
 /// Measured prover-RAM model (the single source of truth, used by both
 /// [`cycle_cap_for_ram`] and [`ram_gib_for_steps`]): peak host RAM ≈
-/// `RAM_BASE_GIB + RAM_GIB_PER_BCYCLE × steps_billions` (at
-/// `--max-witness-stored 5`).
+/// `RAM_BASE_GIB + RAM_GIB_PER_BCYCLE × steps_billions`.
 pub const RAM_BASE_GIB: f64 = 50.0;
 pub const RAM_GIB_PER_BCYCLE: f64 = 33.0;
 /// Usable fraction of a host-RAM budget (headroom for OS + variance) — applied
@@ -1685,8 +1684,8 @@ pub fn cycle_cap_for_ram(ram_gb: f64) -> u64 {
 }
 
 /// Measured single-GPU **leaf prove time**: `≈ PROVE_SETUP_SECS +
-/// PROVE_SECS_PER_BCYCLE × steps_billions` per shard (RTX PRO 6000,
-/// `--max-witness-stored 5`). Aggregation adds a smaller per-fold term this model
+/// PROVE_SECS_PER_BCYCLE × steps_billions` per shard (RTX PRO 6000).
+/// Aggregation adds a smaller per-fold term this model
 /// omits — minutes next to hours of leaf proving at large shard counts.
 pub const PROVE_SETUP_SECS: f64 = 54.0;
 pub const PROVE_SECS_PER_BCYCLE: f64 = 158.0;
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
new file mode 100644
index 00000000..868e824e
--- /dev/null
+++ b/docs/benchmarking.md
@@ -0,0 +1,212 @@
+# Benchmarking
+
+Ix is benchmarked on two surfaces, both driven by one curated constant set and
+the same backend drivers:
+
+- **`!benchmark` PR comment** (`.github/workflows/bench-pr.yml`) — on demand,
+  posts a **main-vs-PR** comparison table on the pull request. main's numbers
+  are pulled from bencher.dev via its public reports API (`bench.py fetch-main`);
+  the PR side is measured fresh. If bencher hasn't ingested the base SHA yet
+  (freshly-pushed main whose push CI is still running), the workflow falls
+  back to re-running the base SHA locally.
+- **Bencher.dev** (`.github/workflows/bench-main.yml`) — on every push to `main`,
+  tracks each measure over time at <https://bencher.dev> (project `ix`). This is
+  the canonical store for main-branch measurements; the `!benchmark` PR path
+  reads from it.
+
+## Backends
+
+| backend | what it measures | metrics |
+|---|---|---|
+| `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove; each fresh proof is also verified) | `fft-cost`, `execute-time`, `execute-peak-rss`, `prove-time`, `verify-time`, `proof-size`, `peak-rss` |
+| `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `execute-peak-rss`; zisk's closure-sharded heavy rows add `shards`, `max-shard-cycles`, `shard-{cycles,time,peak-rss}:<k>` |
+| `ooc` | the same kernel run **out-of-circuit and in parallel** (`ix check-rs`) — far faster | `throughput`, `check-time`, `peak-rss` |
+| `compile` | `ix compile <env>.lean → <env>.ixe` on the current PR — measures the compile step itself, keyed by CamelCase env slug (`InitStd`, `Lean`, `Mathlib`, `FLT`) | `compile-time`, `throughput`, `file-size`, `constants` |
+
+In **prove** mode, `run.sh` attempts a full prove for every primary — cheap
+*and* heavy. A RAM watchdog (`watch_ram_kill`) samples the process tree's RSS
+every ~3 s and `SIGKILL`s the tree if it exceeds `AIUR_PROVE_MAX_RSS_GB`
+(default 120 GB — 8 GB headroom under a 128 GB runner). When killed, the
+constant records the neutral `{"oom": true}` sentinel and `bench.py compare`
+renders `OOM` cells (with `n/a` Δ%) in the table for that row.
+
+A **typecheck failure** is a correctness regression, not a benchmark blip,
+and surfaces loudly everywhere: the tool fails fast (a zisk shard failure
+aborts the constant's remaining shards, like an OOM kill), the constant
+records the `{"failed": true}` sentinel instead of numbers, `compare`
+renders ❌ cells plus a bold "FAILED TO TYPECHECK" note under the table,
+the failing rows never reach bencher, and the workflow job exits nonzero —
+after the clean rows have been uploaded (bench-main) or the table posted
+(bench-pr) — so the red X lands on the commit/PR.
+
+The `ooc` backend reports two views: the **whole env** (`ix check-rs --anon`,
+keyed by env) and a **per-primary full-closure check** (`ix check-rs --anon
+--consts <name>`, keyed by constant). The per-primary view runs the constant's
+full dependency closure in anon mode — the same mode and scope as the zkVM
+execute — so the delta isolates in-circuit vs out-of-circuit overhead rather
+than mixing in closure-size or metadata effects. (`--skip-deps` exists on both
+sides for a subject-only variant, but the benchmarks use full-closure.)
+
+All are driven by `.github/scripts/run.sh` (compile the env `.ixe`, run the
+backend, emit a neutral `{ "<name>": { "<metric>": n } }` JSON). The PR workflow
+compares two such JSONs; the bencher workflow wraps one in Bencher Metric
+Format.
+
+### Peak RAM and the per-phase drill-down (tracing-texray)
+
+Every tool sources `peak-rss` from **tracing-texray's process-tree sampler** — a
+background thread that sums `VmRSS` across the process *and its children* and
+tracks the high-water mark. This captures memory that a bare `/proc/self/status`
+read misses, most importantly Zisk's ASM microservices (separate PIDs).
+
+Each tool also writes its per-phase span timings (tracing-texray's JSON-Lines
+sink, one `{"span","seconds"}` per closed span) to a side file, which `run.sh`
+aggregates into a `phases` object on the constant's entry. `aiur` yields a rich
+breakdown (`aiur/execute`, `aiur/witness`, `stark/fri_open`, …) since the prover
+instruments those spans; `zisk`/`sp1` record a single `execute`/`prove` phase;
+`ooc` records none. `bench-main.yml` flattens the `phases` object into
+`phase:<span>` measures on the way to bencher, so each span is tracked over
+time. (**TODO**: `bench.py compare` used to emit a per-constant collapsible
+drill-down in the PR comment; that renderer was removed while the primary
+table's flag / threshold semantics were being stabilised — see the TODO in
+`bench.py` at the previous `_phase_details` location. The `phases` data is
+still populated in the neutral JSON, ready to consume.)
+
+## Constant set — `Benchmarks/Vectors.csv`
+
+One CSV is the single source of truth for *which* constants to run:
+`name,env,tier,shard_target,primary`. Rows may omit trailing zero fields — the
+parser tolerates 3, 4, or 5 columns, defaulting `shard_target` and `primary`
+to `0`. Measurements never live here; they live in each tool's neutral results
+JSON and in bencher.dev.
+
+- `env` — compile target the constant resolves in (`initStd` / `lean` / `mathlib`).
+- `tier` — `cheap` (prove-feasible on a CI runner under Aiur's ~128 GB RAM
+  ceiling) or `heavy` (a single-shard prove exceeds the RAM watchdog ceiling
+  and records an OOM row). Consumed only by `bench.py manifest` for selection
+  (the `BENCH_TIER` filter, and the non-primary prove set defaults to cheap);
+  `run.sh` itself never branches on tier — it attempts a full prove of every
+  selected constant under the watchdog.
+- `primary` — the curated **primary subset** (currently ~20 constants across
+  initStd + mathlib), spanning shape and cost range. Default for the
+  `!benchmark` PR comment and the bencher jobs. Set `BENCH_FULL=1` to include
+  everything (~68 total).
+- `shard_target` — marks a heavy constant designated for the manifest-sharded
+  prove path (currently 4 rows).
+
+`bench.py manifest` selects names by env + `--primary` (plus optional
+`--tier`, `--shard`). The `compile` backend short-circuits this — its
+"benchmark" is the env slug itself, so `manifest` writes a one-line
+`names.txt` with the CamelCase env name (`InitStd`, etc.) and skips the CSV.
+`bench.py compare` renders the PR table from the two side JSONs.
+
+## `!benchmark` grammar
+
+Maintainer comment on a PR:
+
+```
+!benchmark ([aiur] [zisk] [sp1] [ooc] [compile] | all) [execute]
+BENCH_ENVS=initStd,mathlib     # which compiled envs (default initStd)
+BENCH_FULL=1                   # run the full curated set, not just primary
+BENCH_TIER=cheap|heavy|all     # tier override (default: all)
+BENCH_SHARD=1                  # restrict to the multi-shard target constants
+RUST_LOG=info                  # passthrough env (allowlisted)
+```
+
+Each backend has a default mode: `aiur` runs `prove` (its report also
+carries the execute-side columns `fft-cost` / `execute-time` /
+`execute-peak-rss` alongside `prove-time` / `peak-rss` — `execute-peak-rss`
+is sampled at the Phase 1/2 boundary, before proving allocations, precisely
+so execute-mode comparisons stay apples-to-apples against prove-run
+baselines); `zisk` / `sp1` / `ooc` run `execute`; `compile` runs `ix
+compile`. The optional bare `execute` token flips `aiur` to execute-only
+(`bench-typecheck --execute-only`, skips Phase 2); on other backends it's a
+no-op. Defaults: `aiur`, `initStd`, primary subset. Backends fan out as a
+matrix; each cell is one `(backend, env, mode)` job. main's numbers are
+pulled from bencher.dev.
+
+## Bencher jobs (`bench-main.yml`)
+
+`build → compile → { prove, zkvm-execute, ooc-check }`, each reporting to its
+own testbed + **workload** (`aiur-check`, `zisk-check`, `sp1-check`,
+`ooc-check`, `ix-compile`). The four typecheck testbeds share the shape
+`<backend>-check-x64-32x`; the compile job uses `ix-compile-x64-32x`. Every
+bench job runs on the same runner (`warp-ubuntu-latest-x64-32x`).
+
+sp1 benchmarks are temporarily disabled (its execute run is too slow for
+CI); the host still builds + unit-tests on every PR via ci.yml. To
+re-enable, uncomment sp1 in two places: the zkvm-execute matrix cell in
+`bench-main.yml` and the Install SP1 step in `bench-pr.yml`.
+
+**Heavy primaries run closure-sharded on zisk.** A heavy constant's full
+closure blows the runner's RAM as a single guest leaf, so it executes as its
+shard-manifest partition instead: `ix shard extract <Env>.ixe --consts <name>`
+cuts a standalone closure-only env (no recompile; anon-faithful — identical
+addresses and fft-cost as the full env), `ix profile` → `ix shard --max-ram
+120` cut the manifest (the canonical partitioner: profiled heartbeats +
+min-cut, capped by predicted RAM), and one `zisk-host --shard-plan` run
+executes the shards sequentially. The constant's row then carries total
+`cycles`, `shards`, `max-shard-cycles`, `execute-time`, `throughput`
+(cycles/s), `execute-peak-rss` (max over the per-shard windows), plus the
+per-shard breakdown uploaded as `shard-cycles:<k>` / `shard-time:<k>` /
+`shard-peak-rss:<k>` measures. Cheap primaries keep the plain single-leaf
+`--consts` run.
+
+The artifacts live in `zkshards-<Env>/` and are cut once per tree:
+bench-main's compile job pre-cuts them (run.sh's `cutshards` backend — it
+has `ix`, the toolchain, and the fresh `.ixe`; the zkvm job stays Lean-free)
+and ships them in the `bench-ixe-*` cache; on the `!benchmark` PR path,
+run.sh cuts them fresh in-place with the side's own `ix` (cheap: seconds per
+closure). Each side partitions its **own** tree because a PR can change the
+cost profile; the profiler counts heartbeats rather than wall time, so an
+unchanged tree re-partitions deterministically. The
+heavy set comes from Vectors.csv's tier column via `bench.py manifest
+--heavy-out` (`ZISK_HEAVY_NAMES`).
+
+A constant whose partition still can't fit — an atomic mutual block above
+the cap (`ix shard` flags it INFEASIBLE) — OOMs its shard under the RAM
+watchdog like any other over-ceiling run: the constant gets the honest OOM
+row, its remaining shards are skipped, and the loop proceeds to the next
+constant. If cutting fails entirely (no `ix` on PATH, extract error), the
+constant falls back to the single-leaf run.
+
+Threshold semantics per measure kind:
+- **`constants`** — pinned exactly (0/0). A definitional count; either
+  direction is worth flagging (someone added/removed a def).
+- **`fft-cost`, `cycles`, `shards`, `max-shard-cycles`** — deterministic but
+  directional: `upper 0` (any increase is a real regression), `lower _`
+  (drops are legitimate wins — algorithmic improvements, better packing).
+- **`execute-time`, `prove-time`, `verify-time`, `check-time`, `compile-time`, `peak-rss`,
+  `execute-peak-rss`, `file-size`, `proof-size`** — noisy wall-clock or size measures:
+  `upper 0.05–0.10`, `lower _`. `execute-peak-rss` is the execute phase's RSS
+  high-water on every backend that has one (bench-typecheck samples it at the
+  Phase 1/2 boundary; the zkVM hosts' execute peak carries the same name);
+  bare `peak-rss` is a prove-phase (or, for ooc, whole-check) peak.
+- **`throughput`** — higher-is-better: `upper _`, `lower 0.05–0.10`.
+- **`phase:<span>`, `shard-{cycles,time,peak-rss}:<k>`** — uploaded for
+  trend visibility, intentionally left un-thresholded (dynamic names; a
+  re-partition renames the shard keys). The thresholded aggregates
+  (`shards`, `max-shard-cycles`, total `cycles`) do the alerting; the
+  PR-comment drill-down is where per-phase / per-shard attention goes when
+  that view lands.
+
+All thresholds are windowed to the per-workload
+`bencher-thresholds-reset-<workload>` tag.
+
+To re-baseline a workload after an intended step change, comment
+`!bencher-thresholds-reset <workload|all>` on the merging PR, or run the
+`bencher-thresholds-reset` workflow
+(`.github/workflows/bencher-thresholds-reset.yml`).
+
+## Not yet covered
+
+- **zkVM proving** (Zisk/SP1 `prove`) is not wired up — needs a self-hosted
+  GPU runner. `bench.py`'s parse layer treats `zisk`/`sp1` as `execute`-only.
+- **Per-constant phase drill-down** in the PR comment (was removed while the
+  primary table's semantics were stabilised; TODO in `bench.py` marks the
+  reinstatement point — the `phases` data is still populated in the
+  neutral JSON and flattened to `phase:<span>` on bencher).
+- **Non-`main` base branches** — `bench.py fetch-main` hardcodes
+  `branch=main`; a PR against a non-main base always falls through to the
+  local base-run path. TODO in `bench.py` lays out the three-step plan
+  (producer / consumer / fallback).
diff --git a/docs/zisk-cycle-cost-model.md b/docs/zisk-cycle-cost-model.md
index 04e7a192..3c961ccf 100644
--- a/docs/zisk-cycle-cost-model.md
+++ b/docs/zisk-cycle-cost-model.md
@@ -68,7 +68,7 @@ objective for a cost spanning 50M–9B cycles.
 **Full-closure typecheck** — a self-contained closure: a small program, or a
 constant checked with all its dependencies. Calibrated **cross-library** on
 n=76: 12 small programs + 64 diverse constants checked full-closure via
-`--constant`, spanning **Init (51), Std (3), and Mathlib (10)**:
+`--consts`, spanning **Init (51), Std (3), and Mathlib (10)**:
 ```
 cycles ≈ 0.68M + 96,989·hb + 4,151·subst        R² 0.987, MAPE 6%
 ```
@@ -99,23 +99,23 @@ cycles ≈ 0.39M + 6,740·block_bytes + 14·subenv_bytes + 4,070·subst   R² 0.
 Reserved for constants too expensive to full-closure-check that can't be sharded
 (Finding 4).
 
-**Measuring one constant.** `zisk-host --execute --ixe initstd.ixe --constant
+**Measuring one constant.** `zisk-host --execute --ixe initstd.ixe --consts
 <NAME>` resolves the name, builds just its closure sub-env (deps lazily faulted
 in — no separate `.ixe`, no whole-env ingress), and checks it. Full-closure by
 default (`Ix.Claim.check addr none`); add `--skip-deps` for subject-only. Same
-flags as the Aiur `bench-typecheck`.
+`--consts` / `--skip-deps` vocabulary as `ix check`, `sp1-host`, and the Aiur
+`bench-typecheck`.
 
 ---
 
 ## Prover RAM and prove time
 
-Measured proving 7 Init shards on GPU at `--max-witness-stored 5`, STEPS
-0.27–3.79e9:
+Measured proving 7 Init shards on GPU, STEPS 0.27–3.79e9:
 ```
 peak host RAM (GiB) ≈ 50 + 33·STEPS_billions     R² 0.99
 GPU prove time (s)  ≈ 54 + 158·STEPS_billions     R² 0.98   (~6.3M steps/s)
 ```
-RAM scales with `--max-witness-stored N` (this is N=5). Inverting the RAM model
+Inverting the RAM model
 at `RAM_USABLE_FRAC` = 0.85 gives a safe per-shard cap of **~3.6e9 steps** for a
 200 GiB target (`shard.rs:cycle_cap_for_ram`).
 
@@ -148,12 +148,18 @@ side.
    the cap still executes under it. Each leaf pays the ~180M fixed floor and adds
    an aggregation node, so cheap constants are batched rather than proven one at
    a time. (`--shards N` still does balanced bisection, for manual control.)
-4. **A few constants can't be full-closure-proven on a 250 GiB box and aren't
-   shardable** (single atomic constants): `Vector/Array.extract_append._proof_1`,
-   the `instRxcHasSize_eq` family. The escape hatch is `--skip-deps`:
-   `Vector.extract_append` is the 143 GiB OOM case under full-closure but checks
-   subject-only in 74M cycles. The planner flags these via
-   `infeasible_atomic_floor`.
+4. **A few constants can't be proven as a single full-closure leaf on a 250 GiB
+   box**: `Vector/Array.extract_append._proof_1`, the `instRxcHasSize_eq` family.
+   This is a per-leaf ingress/RAM ceiling on the `--consts <name>` (full-closure,
+   `Ix.Claim.check addr none`) mode — not a global unshardability. In
+   env-sharding mode (`--shard-plan`) these same constants are fine: their
+   subject checks fit in one work item and their deps are proved in other
+   shards, folded in through the assumptions root. The only work unit the
+   env-sharding planner truly can't split is a **mutual block** (`build_anon_work`
+   emits one item per Muts block, checked atomically). The escape hatch in
+   single-constant mode is `--skip-deps`: `Vector.extract_append` is the 143 GiB
+   OOM case under full-closure but checks subject-only in 74M cycles. The
+   planner flags these via `infeasible_atomic_floor`.
 5. **The packing order comes from min-cut.** Whole-env profiling of
    Init/Std/**Mathlib** (mathlib = 631k blocks) shows Lean typecheck is uniformly
    reduction-dominated: own-bytes is only **2.6–7% of member cost** (mathlib
@@ -171,7 +177,7 @@ side.
 
 Applying the cost model to every Init constant's native profile (51,003 blocks /
 51,678 constants) vs a single-leaf cap: **~99.98% of Init typechecks on Zisk.**
-Exactly **12 constants** exceed a single 250 GiB leaf (`--max-witness-stored 5`)
+Exactly **12 constants** exceed a single 250 GiB leaf
 — all single atomic constants (un-shardable), `hb`-dominated. Listed by name
 (all are `_private` proof terms; private prefix elided):
 
@@ -190,7 +196,7 @@ Exactly **12 constants** exceed a single 250 GiB leaf (`--max-witness-stored 5`)
 | `Array.extract_append_extract._proof_1_1` | 13,226 | 4.7e9 |
 | `Char.ofOrdinal_ordinal` | 14,136 | 4.5e9 |
 
-Each has a workaround — lower `--max-witness-stored`, a bigger box, `--skip-deps`
+Each has a workaround — a bigger box, `--skip-deps`
 (subject-only), or upstream proof restructuring — so 12 is an upper bound on
 truly-stuck constants. At a 200 GiB cap the list grows to 16. Estimate uses the
 planner's `block_step_cost` model (`162,339·hb + 4,276·subst + 652·bytes`, the
@@ -222,7 +228,7 @@ Shards 630/634 are one atomic constant each (`Int*.instRxcHasSize_eq`): tiny
 `bytes`/`subst` but huge cycles, driven by `hb` (deep nat-range def-eq) — the
 "expensive atomic" case (Finding 4).
 
-### Full-closure single constants — `--constant`, diverse shapes
+### Full-closure single constants — `--consts`, diverse shapes
 
 One library constant each, checked full-closure (the constant and its whole
 dependency closure). The 35 Init constants below (over `initstd.ixe`) are shown
@@ -289,7 +295,7 @@ lake exe ix shard env.ixprof --max-ram 256        # or --max-cycles C / --shards
 cargo run --release --bin zisk-host -- --execute --ixe <env.ixe> \
   [--shard-plan <plan.ixes> --only-shard K]
 cargo run --release --bin zisk-host -- --execute --ixe initstd.ixe \
-  --constant "Nat.add_comm" [--skip-deps]      # full-closure / subject-only
+  --consts "Nat.add_comm" [--skip-deps]        # full-closure / subject-only
 
 # fits
 python3 ~/benchdata/prof/fit_xlib.py         # full-closure model (76 pts, Init+Std+Mathlib)
@@ -311,8 +317,7 @@ native profiling run and compile out on the zkvm target.
   5–8%); the shard model is still n=12 and Init-derived, so the ~190M shard
   intercept is the term most likely to
   shift on Std/Mathlib — worth re-checking there.
-- RAM/prove-time models are for one GPU at `--max-witness-stored 5`; both scale
-  with that setting.
+- RAM/prove-time models are for one GPU.
 - Cycle counts are deterministic for a fixed guest ELF; the profiler counters
   compile out on the zkvm target, so the proven ELF is unaffected.
 - **Regimes have distinct coefficients and don't transfer.**
diff --git a/sp1/Cargo.lock b/sp1/Cargo.lock
index a5f9903d..aa2a68a4 100644
--- a/sp1/Cargo.lock
+++ b/sp1/Cargo.lock
@@ -1029,7 +1029,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -1247,6 +1247,21 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
 
+[[package]]
+name = "generator"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b3b854b0e584ead1a33f18b2fcad7cf7be18b3875c78816b753639aa501513ae"
+dependencies = [
+ "cc",
+ "cfg-if",
+ "libc",
+ "log",
+ "rustversion",
+ "windows-link",
+ "windows-result",
+]
+
 [[package]]
 name = "generic-array"
 version = "0.14.9"
@@ -1894,6 +1909,19 @@ version = "0.4.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "616ec5685824bcc94416c6d4a7a446eea774a31efd7062c8480ba6fd06d7a6e5"
 
+[[package]]
+name = "loom"
+version = "0.7.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca"
+dependencies = [
+ "cfg-if",
+ "generator",
+ "scoped-tls",
+ "tracing",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "lru"
 version = "0.12.5"
@@ -2041,7 +2069,7 @@ version = "0.50.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
 dependencies = [
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -2831,7 +2859,7 @@ dependencies = [
  "once_cell",
  "socket2 0.6.3",
  "tracing",
- "windows-sys 0.60.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -3123,7 +3151,7 @@ dependencies = [
  "errno",
  "libc",
  "linux-raw-sys",
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -3225,6 +3253,12 @@ dependencies = [
  "sdd",
 ]
 
+[[package]]
+name = "scoped-tls"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294"
+
 [[package]]
 name = "scopeguard"
 version = "1.2.0"
@@ -4072,9 +4106,11 @@ dependencies = [
  "ix-common",
  "ix-kernel",
  "ixon",
+ "serde_json",
  "sp1-build",
  "sp1-sdk",
  "tokio",
+ "tracing-texray",
 ]
 
 [[package]]
@@ -4633,7 +4669,17 @@ dependencies = [
  "getrandom 0.4.2",
  "once_cell",
  "rustix",
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "terminal_size"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874"
+dependencies = [
+ "rustix",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -5062,6 +5108,18 @@ dependencies = [
  "tracing-log",
 ]
 
+[[package]]
+name = "tracing-texray"
+version = "0.2.0"
+source = "git+https://github.com/argumentcomputer/tracing-texray?rev=bd4faa08a4fa4edb46bde393b4d20c6bd49591d0#bd4faa08a4fa4edb46bde393b4d20c6bd49591d0"
+dependencies = [
+ "loom",
+ "parking_lot",
+ "terminal_size",
+ "tracing",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "transpose"
 version = "0.2.3"
@@ -5479,15 +5537,6 @@ dependencies = [
  "windows-targets 0.52.6",
 ]
 
-[[package]]
-name = "windows-sys"
-version = "0.60.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
-dependencies = [
- "windows-targets 0.53.5",
-]
-
 [[package]]
 name = "windows-sys"
 version = "0.61.2"
@@ -5521,30 +5570,13 @@ dependencies = [
  "windows_aarch64_gnullvm 0.52.6",
  "windows_aarch64_msvc 0.52.6",
  "windows_i686_gnu 0.52.6",
- "windows_i686_gnullvm 0.52.6",
+ "windows_i686_gnullvm",
  "windows_i686_msvc 0.52.6",
  "windows_x86_64_gnu 0.52.6",
  "windows_x86_64_gnullvm 0.52.6",
  "windows_x86_64_msvc 0.52.6",
 ]
 
-[[package]]
-name = "windows-targets"
-version = "0.53.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3"
-dependencies = [
- "windows-link",
- "windows_aarch64_gnullvm 0.53.1",
- "windows_aarch64_msvc 0.53.1",
- "windows_i686_gnu 0.53.1",
- "windows_i686_gnullvm 0.53.1",
- "windows_i686_msvc 0.53.1",
- "windows_x86_64_gnu 0.53.1",
- "windows_x86_64_gnullvm 0.53.1",
- "windows_x86_64_msvc 0.53.1",
-]
-
 [[package]]
 name = "windows_aarch64_gnullvm"
 version = "0.48.5"
@@ -5557,12 +5589,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
 
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
-
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.48.5"
@@ -5575,12 +5601,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
 
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
-
 [[package]]
 name = "windows_i686_gnu"
 version = "0.48.5"
@@ -5593,24 +5613,12 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
 
-[[package]]
-name = "windows_i686_gnu"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3"
-
 [[package]]
 name = "windows_i686_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
 
-[[package]]
-name = "windows_i686_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
-
 [[package]]
 name = "windows_i686_msvc"
 version = "0.48.5"
@@ -5623,12 +5631,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
 
-[[package]]
-name = "windows_i686_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
-
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.48.5"
@@ -5641,12 +5643,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
 
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
-
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.48.5"
@@ -5659,12 +5655,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
 
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
-
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.48.5"
@@ -5677,12 +5667,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
-
 [[package]]
 name = "winnow"
 version = "0.5.40"
diff --git a/sp1/host/Cargo.toml b/sp1/host/Cargo.toml
index f45c1d3c..d03b0724 100644
--- a/sp1/host/Cargo.toml
+++ b/sp1/host/Cargo.toml
@@ -30,6 +30,11 @@ sp1-sdk = { git = "https://github.com/argumentcomputer/sp1", branch = "blake3-pr
 tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
 clap = { version = "4.0", features = ["derive"] }
 anyhow = "1"
+# Neutral per-constant results JSON (`--json`), merged by the CI bench driver.
+serde_json = "1"
+# Process-tree RSS sampler (accurate peak RAM) + per-phase timing sink for the
+# CI drill-down.
+tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "bd4faa08a4fa4edb46bde393b4d20c6bd49591d0" }
 # Throughput formatting (e.g. `42.0 consts/s`).
 human-repr = "1"
 # Proof-size measurement: SP1's `SP1ProofWithPublicValues::bytes()` returns
diff --git a/sp1/host/src/main.rs b/sp1/host/src/main.rs
index 5c69741e..6b548514 100644
--- a/sp1/host/src/main.rs
+++ b/sp1/host/src/main.rs
@@ -6,7 +6,7 @@
 //! RUST_LOG=info cargo run --release -- --execute --ixe ../../minimal.ixe
 //! WITHOUT_VK_VERIFICATION=1 RUST_LOG=info cargo run --release  # prove (compressed)
 //! # prove a single constant out of a large env (Anon-only):
-//! WITHOUT_VK_VERIFICATION=1 RUST_LOG=info cargo run --release -- --ixe ../../init.ixe --constant Nat.add_comm
+//! WITHOUT_VK_VERIFICATION=1 RUST_LOG=info cargo run --release -- --ixe ../../init.ixe --consts Nat.add_comm
 //! ```
 //!
 //! Proving (any non-`--execute` run) requires `WITHOUT_VK_VERIFICATION=1` in
@@ -37,40 +37,94 @@ pub const GUEST_ELF: Elf = include_elf!("sp1-guest");
 #[derive(Parser, Debug)]
 #[command(author, version, about, long_about = None)]
 struct Args {
-  /// Run the program in the VM only - no proof.
+  /// Execute in the VM only — no proof.
   #[arg(long)]
   execute: bool,
 
-  /// Run the kernel in Meta mode (preserves names + dup-level-param-name
-  /// check). Default is Anon mode, which matches Aiur's `kernel_check_test`
-  /// semantics. Both modes prove the same structural typecheck; Meta is
-  /// strictly more constrained but slightly more expensive.
+  /// Run the kernel in Meta mode (default: Anon). Meta preserves names.
   #[arg(long)]
   meta: bool,
 
-  /// Path to a `.ixe` file produced by `lake exe ix compile`. If omitted, an
-  /// empty `IxonEnv` is used.
+  /// Path to a `.ixe` (default: empty env).
   #[arg(long)]
   ixe: Option<PathBuf>,
 
-  /// Check a single constant selected by its Lean NAME (e.g. "Nat.add_comm").
-  /// The name resolves through the env's `named` metadata to its ingress
-  /// block; the guest receives only that block's closure sub-env, so one
-  /// constant can be proved out of a large env without shipping (or
-  /// typechecking) the whole thing. By default this is the **full-closure**
-  /// typecheck (the constant and its whole dependency closure, matching
-  /// `Ix.Claim.check addr none` / the Aiur `bench-typecheck --constant`); pass
-  /// `--skip-deps` for a subject-only check (deps trusted). Anon-only
-  /// (incompatible with `--meta`). Requires `--ixe`.
+  /// Comma-separated Lean names to check (Anon-only; each is one guest run).
+  #[arg(long, value_delimiter = ',')]
+  consts: Vec<String>,
+
+  /// Additional names from a file (one per line, `#` comments); unions with --consts.
   #[arg(long)]
-  constant: Option<String>,
+  consts_file: Option<PathBuf>,
 
-  /// Modifies `--constant`: check only the named constant itself, trusting its
-  /// dependencies (subject-only), instead of re-checking its whole transitive
-  /// closure. Same flag/semantics as `zisk-host --skip-deps` and the Aiur
-  /// `bench-typecheck --skip-deps`.
-  #[arg(long, requires = "constant")]
+  /// With --consts/--consts-file: check each subject only, trusting its deps.
+  // Validated in main (not clap `requires = "consts"`): names may come from
+  // --consts-file alone, which a clap-level `requires` would wrongly reject.
+  #[arg(long)]
   skip_deps: bool,
+
+  /// Write per-constant results JSON `{ "<name>": { … } }` (accumulated across --consts).
+  #[arg(long)]
+  json: Option<PathBuf>,
+
+  /// Enable tracing-texray; with --json, per-phase spans are written to <json>.spans.
+  #[arg(long)]
+  texray: bool,
+}
+
+/// Peak resident set size (bytes) across this process *and its children*, from
+/// tracing-texray's tree sampler. `None` while the sampler reports `0` (i.e.
+/// before it has started, or off Linux).
+fn peak_rss_bytes() -> Option<u64> {
+  match tracing_texray::rss_sampler::peak_tree_rss_bytes() {
+    0 => None,
+    n => Some(n),
+  }
+}
+
+/// Insert `{ "<name>": <metrics> }` into the results JSON object at `path`
+/// (replacing a same-name entry; unparsable existing content is discarded),
+/// so a multi-const run (`--consts a,b,c`) accumulates one entry per name.
+fn write_json_entry(
+  path: &PathBuf,
+  name: &str,
+  metrics: serde_json::Value,
+) -> Result<()> {
+  let mut map: serde_json::Map<String, serde_json::Value> = match fs::read(path)
+  {
+    Ok(bytes) => serde_json::from_slice(&bytes).unwrap_or_default(),
+    Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+      serde_json::Map::new()
+    },
+    Err(e) => return Err(anyhow::anyhow!("read {}: {e}", path.display())),
+  };
+  map.insert(name.to_string(), metrics);
+  fs::write(path, serde_json::to_string(&serde_json::Value::Object(map))?)
+    .map_err(|e| anyhow::anyhow!("write {}: {e}", path.display()))
+}
+
+/// Union `--consts` with names from `--consts-file`, preserving first-seen order.
+fn collect_consts(args: &Args) -> Result<Vec<String>> {
+  let mut seen: std::collections::HashSet<String> =
+    std::collections::HashSet::new();
+  let mut out: Vec<String> = Vec::new();
+  for name in &args.consts {
+    let trimmed = name.trim();
+    if !trimmed.is_empty() && seen.insert(trimmed.to_string()) {
+      out.push(trimmed.to_string());
+    }
+  }
+  if let Some(path) = &args.consts_file {
+    let contents = fs::read_to_string(path)
+      .map_err(|e| anyhow::anyhow!("read {}: {e}", path.display()))?;
+    for line in contents.lines() {
+      let name = line.split('#').next().unwrap_or("").trim();
+      if !name.is_empty() && seen.insert(name.to_string()) {
+        out.push(name.to_string());
+      }
+    }
+  }
+  Ok(out)
 }
 
 fn load_env_bytes(ixe: Option<&PathBuf>) -> Vec<u8> {
@@ -112,7 +166,7 @@ fn count_checkable(env_bytes: &[u8], meta_mode: bool) -> usize {
   }
 }
 
-/// Resolve `--constant <name>` to the guest inputs for that one constant: its
+/// Resolve one `--consts` name to the guest inputs for that constant: its
 /// closure sub-env and a check-list. The name resolves through the full env's
 /// `named` metadata to a constant address, which maps to the `build_anon_work`
 /// item whose ingress block owns it (standalone → itself; a mutual-block member
@@ -185,40 +239,79 @@ async fn main() -> Result<()> {
   sp1_sdk::utils::setup_logger();
 
   let args = Args::parse();
+
+  // Start the process-tree RSS sampler (accurate peak RAM) and point the
+  // per-phase timing sink at the drill-down file if requested — both
+  // independent of the SDK's global tracing logger.
+  //
+  // TODO(spans): the sink only receives the coarse `sp1/execute` / `sp1/prove`
+  // phases we `record_manual` below. For a finer drill-down, install a TeXRay
+  // subscriber and examine the sp1-sdk's own tracing spans — which requires
+  // composing it with the SDK's global logger (`sp1_sdk::utils::setup_logger`),
+  // currently the sole subscriber.
+  tracing_texray::rss_sampler::start(std::time::Duration::from_millis(50));
+  // With --texray + --json, per-phase span timings land at `<json>.spans` as
+  // JSON Lines — the CI drill-down input.
+  if args.texray {
+    if let Some(json) = args.json.as_ref().and_then(|p| p.to_str()) {
+      let _ = tracing_texray::json_sink::to_file(&format!("{json}.spans"));
+    }
+  }
+
   let whole_env_bytes = load_env_bytes(args.ixe.as_ref());
+  let client = ProverClient::from_env().await;
+  let consts = collect_consts(&args)?;
+  if !consts.is_empty() && args.meta {
+    bail!("--consts is Anon-only and cannot be combined with --meta");
+  }
+  if consts.is_empty() && args.skip_deps {
+    bail!("--skip-deps requires constants via --consts or --consts-file");
+  }
 
-  // `--constant` ships a closure sub-env + a check-list (Anon only); otherwise
-  // the whole env ships with an empty check-list (= check everything).
-  let (env_bytes, check_list, const_count) =
-    if let Some(name) = &args.constant {
-      if args.meta {
-        bail!("--constant is Anon-only and cannot be combined with --meta");
-      }
-      constant_inputs(&whole_env_bytes, name, args.skip_deps)?
-    } else {
-      let cc = count_checkable(&whole_env_bytes, args.meta);
-      (whole_env_bytes, Vec::new(), cc)
-    };
+  if consts.is_empty() {
+    run_one(&client, &args, &whole_env_bytes, None).await?;
+  } else {
+    for name in &consts {
+      run_one(&client, &args, &whole_env_bytes, Some(name)).await?;
+    }
+  }
+  Ok(())
+}
+
+async fn run_one<C: Prover + Sync>(
+  client: &C,
+  args: &Args,
+  whole_env_bytes: &[u8],
+  name: Option<&str>,
+) -> Result<()> {
+  // A name ships a closure sub-env + a check-list (Anon only); otherwise the
+  // whole env ships with an empty check-list (= check everything).
+  let (env_bytes, check_list, const_count) = match name {
+    Some(n) => constant_inputs(whole_env_bytes, n, args.skip_deps)?,
+    None => {
+      let cc = count_checkable(whole_env_bytes, args.meta);
+      (whole_env_bytes.to_vec(), Vec::new(), cc)
+    },
+  };
 
   // Three guest inputs, in order:
   //   1. 1-byte mode flag (0 = Anon / 1 = Meta).
-  //   2. Serialized Ixon env (whole env, or a closure sub-env under
-  //      `--constant`). Anon enumerates work in-guest via
-  //      `ix_kernel::anon_work::build_anon_work`; Meta walks `env.named`.
-  //   3. Check-list of packed primary addresses (`--constant`), or empty
-  //      to check every work item.
+  //   2. Serialized Ixon env (whole env, or a closure sub-env under --consts).
+  //   3. Check-list of packed primary addresses (--consts), or empty for all.
   let mut stdin = SP1Stdin::new();
   stdin.write::<u8>(&u8::from(args.meta));
   stdin.write_vec(env_bytes);
   stdin.write_vec(check_list);
 
-  let client = ProverClient::from_env().await;
-
   if args.execute {
     let exec_start = Instant::now();
     let (output, report) =
       client.execute(GUEST_ELF, stdin).await.expect("execute");
     let exec_duration = exec_start.elapsed();
+    tracing_texray::json_sink::record_manual(
+      "sp1/execute",
+      exec_duration.as_secs_f64(),
+    );
     let failures = u32::from_le_bytes(
       output.as_slice()[..4].try_into().expect("output too short"),
     );
@@ -258,6 +351,26 @@ async fn main() -> Result<()> {
     if failures > 0 {
       bail!("kernel typecheck produced {failures} failure(s)");
     }
+    if let Some(path) = &args.json {
+      let cycles = report.total_instruction_count();
+      let secs = exec_duration.as_secs_f64();
+      let tput = if secs > 0.0 { cycles as f64 / secs } else { 0.0 };
+      let key =
+        name.map(|s| s.to_string()).unwrap_or_else(|| "env".to_string());
+      write_json_entry(
+        path,
+        &key,
+        serde_json::json!({
+          "cycles": cycles,
+          "execute-time": (secs * 1e6).round() / 1e6,
+          "throughput": tput.round(),
+          // Named for what it measures (the execute phase's RSS high-water),
+          // matching bench-typecheck's execute-peak-rss; bare `peak-rss` is
+          // reserved for prove-phase peaks.
+          "execute-peak-rss": peak_rss_bytes(),
+        }),
+      )?;
+    }
     return Ok(());
   }
 
@@ -269,6 +382,10 @@ async fn main() -> Result<()> {
   // var (see the module doc header and `Cargo.toml`). `--execute` doesn't.
   let proof = client.prove(&pk, stdin).compressed().await.expect("prove");
   let prove_duration = start.elapsed();
+  tracing_texray::json_sink::record_manual(
+    "sp1/prove",
+    prove_duration.as_secs_f64(),
+  );
   let throughput =
     const_count as f64 / prove_duration.as_secs_f64().max(f64::EPSILON);
   // `SP1ProofWithPublicValues::bytes()` is the onchain-verifier encoding
@@ -285,5 +402,77 @@ async fn main() -> Result<()> {
   client.verify(&proof, pk.verifying_key(), None).expect("verify");
   let verify_duration = verify_start.elapsed();
   println!("proof verified in {:.3}s", verify_duration.as_secs_f64());
+  if let Some(path) = &args.json {
+    let key = name.map(|s| s.to_string()).unwrap_or_else(|| "env".to_string());
+    write_json_entry(
+      path,
+      &key,
+      serde_json::json!({
+        "prove-time": (prove_duration.as_secs_f64() * 1e6).round() / 1e6,
+        "peak-rss": peak_rss_bytes(),
+      }),
+    )?;
+  }
   Ok(())
 }
+
+#[cfg(test)]
+mod cli_tests {
+  use clap::Parser;
+
+  use super::{Args, collect_consts};
+
+  /// Parse `argv` as if it were passed to `sp1-host`; panics on a clap error.
+  fn parse(argv: &[&str]) -> Args {
+    let full_argv = std::iter::once("sp1-host").chain(argv.iter().copied());
+    Args::try_parse_from(full_argv).expect("parse ok")
+  }
+
+  #[test]
+  fn consts_splits_on_comma() {
+    let a = parse(&["--consts", "Nat.add_comm,Nat.succ"]);
+    assert_eq!(a.consts, vec!["Nat.add_comm", "Nat.succ"]);
+  }
+
+  #[test]
+  fn consts_repeatable_and_comma_lists_stack() {
+    let a = parse(&["--consts", "a", "--consts", "b,c"]);
+    assert_eq!(a.consts, vec!["a", "b", "c"]);
+  }
+
+  #[test]
+  fn skip_deps_parses_with_consts_file_only() {
+    // Names may come from --consts-file alone; clap must accept the parse
+    // (main validates after collect_consts).
+    let a = parse(&["--consts-file", "names.txt", "--skip-deps"]);
+    assert!(a.skip_deps);
+  }
+
+  #[test]
+  fn json_alone_ok() {
+    // sp1-host's --json is not gated on --consts (keys by "env" when no name).
+    let a = parse(&["--json", "out.json"]);
+    assert_eq!(a.json.as_deref(), Some(std::path::Path::new("out.json")));
+  }
+
+  #[test]
+  fn consts_file_alone_ok() {
+    let a = parse(&["--consts-file", "names.txt"]);
+    assert_eq!(
+      a.consts_file.as_deref(),
+      Some(std::path::Path::new("names.txt"))
+    );
+  }
+
+  #[test]
+  fn collect_unions_and_dedups() {
+    let path = std::env::temp_dir().join("sp1_host_cli_test_consts.txt");
+    std::fs::write(&path, "a\nb\n# comment\n  c  \n\na\n").expect("write");
+    let a =
+      parse(&["--consts", "a,d", "--consts-file", path.to_str().unwrap()]);
+    let got = collect_consts(&a);
+    // Remove the fixture before asserting so a failing run can't leak it.
+    let _ = std::fs::remove_file(&path);
+    assert_eq!(got.expect("collect"), vec!["a", "d", "b", "c"]);
+  }
+}
diff --git a/zisk/Cargo.lock b/zisk/Cargo.lock
index 57b66b1c..18ac716b 100644
--- a/zisk/Cargo.lock
+++ b/zisk/Cargo.lock
@@ -4188,7 +4188,7 @@ dependencies = [
  "once_cell",
  "socket2",
  "tracing",
- "windows-sys 0.60.2",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -5410,6 +5410,16 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
+[[package]]
+name = "terminal_size"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874"
+dependencies = [
+ "rustix",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "thiserror"
 version = "1.0.69"
@@ -5904,6 +5914,18 @@ dependencies = [
  "tracing-serde",
 ]
 
+[[package]]
+name = "tracing-texray"
+version = "0.2.0"
+source = "git+https://github.com/argumentcomputer/tracing-texray?rev=bd4faa08a4fa4edb46bde393b4d20c6bd49591d0#bd4faa08a4fa4edb46bde393b4d20c6bd49591d0"
+dependencies = [
+ "loom",
+ "parking_lot",
+ "terminal_size",
+ "tracing",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "try-lock"
 version = "0.2.5"
@@ -6504,15 +6526,6 @@ dependencies = [
  "windows-targets 0.52.6",
 ]
 
-[[package]]
-name = "windows-sys"
-version = "0.60.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
-dependencies = [
- "windows-targets 0.53.5",
-]
-
 [[package]]
 name = "windows-sys"
 version = "0.61.2"
@@ -6546,30 +6559,13 @@ dependencies = [
  "windows_aarch64_gnullvm 0.52.6",
  "windows_aarch64_msvc 0.52.6",
  "windows_i686_gnu 0.52.6",
- "windows_i686_gnullvm 0.52.6",
+ "windows_i686_gnullvm",
  "windows_i686_msvc 0.52.6",
  "windows_x86_64_gnu 0.52.6",
  "windows_x86_64_gnullvm 0.52.6",
  "windows_x86_64_msvc 0.52.6",
 ]
 
-[[package]]
-name = "windows-targets"
-version = "0.53.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3"
-dependencies = [
- "windows-link 0.2.1",
- "windows_aarch64_gnullvm 0.53.1",
- "windows_aarch64_msvc 0.53.1",
- "windows_i686_gnu 0.53.1",
- "windows_i686_gnullvm 0.53.1",
- "windows_i686_msvc 0.53.1",
- "windows_x86_64_gnu 0.53.1",
- "windows_x86_64_gnullvm 0.53.1",
- "windows_x86_64_msvc 0.53.1",
-]
-
 [[package]]
 name = "windows-threading"
 version = "0.1.0"
@@ -6600,12 +6596,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
 
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
-
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.42.2"
@@ -6618,12 +6608,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
 
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
-
 [[package]]
 name = "windows_i686_gnu"
 version = "0.42.2"
@@ -6636,24 +6620,12 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
 
-[[package]]
-name = "windows_i686_gnu"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3"
-
 [[package]]
 name = "windows_i686_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
 
-[[package]]
-name = "windows_i686_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
-
 [[package]]
 name = "windows_i686_msvc"
 version = "0.42.2"
@@ -6666,12 +6638,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
 
-[[package]]
-name = "windows_i686_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
-
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.42.2"
@@ -6684,12 +6650,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
 
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
-
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.42.2"
@@ -6702,12 +6662,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
 
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
-
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.42.2"
@@ -6720,12 +6674,6 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.53.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
-
 [[package]]
 name = "winnow"
 version = "0.7.15"
@@ -7257,7 +7205,9 @@ dependencies = [
  "ix-common",
  "ix-kernel",
  "ixon",
+ "serde_json",
  "tokio",
+ "tracing-texray",
  "zisk-sdk",
 ]
 
diff --git a/zisk/Cargo.toml b/zisk/Cargo.toml
index 308731d2..4f6c64ea 100644
--- a/zisk/Cargo.toml
+++ b/zisk/Cargo.toml
@@ -21,4 +21,4 @@ panic = "abort"
 # `blake3-precompile` branch above (now that v0.18.0 + the blake3 shim are
 # pushed). To iterate against a local checkout instead, re-add:
 #   [patch."https://github.com/argumentcomputer/zisk.git"]
-#   zisk-sdk = { path = "/home/ubuntu/zisk/sdk" }
+#   zisk-sdk = { path = "/path/to/zisk/sdk" }
diff --git a/zisk/host/Cargo.toml b/zisk/host/Cargo.toml
index d77c0e7a..ecc874b0 100644
--- a/zisk/host/Cargo.toml
+++ b/zisk/host/Cargo.toml
@@ -17,6 +17,12 @@ ix-kernel = { path = "../../crates/kernel" }
 zisk-sdk = { workspace = true }
 anyhow = "1"
 clap = { version = "4.0", features = ["derive"] }
+# Neutral per-constant results JSON (`--json`), merged by the CI bench driver.
+serde_json = "1"
+# Accurate peak RAM via the process-tree sampler (captures the ASM
+# microservices' child-process memory that `/proc/self/status` misses) and the
+# per-phase timing sink feeding the CI drill-down.
+tracing-texray = { git = "https://github.com/argumentcomputer/tracing-texray", rev = "bd4faa08a4fa4edb46bde393b4d20c6bd49591d0" }
 tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
 # Throughput formatting (e.g. `42.0 consts/s`).
 human-repr = "1"
diff --git a/zisk/host/src/main.rs b/zisk/host/src/main.rs
index 1eb35094..9243fc73 100644
--- a/zisk/host/src/main.rs
+++ b/zisk/host/src/main.rs
@@ -186,40 +186,118 @@ struct Args {
   #[arg(long)]
   dump_input: Option<PathBuf>,
 
-  /// Check a single constant selected by its Lean NAME (e.g.
-  /// "ByteArray.utf8DecodeChar?_utf8EncodeChar_append"), with no manifest or
-  /// range plumbing: the name is resolved through the env's `named` metadata to
-  /// its ingress block, and the guest receives only its closure sub-env. By
-  /// default this is the **full-closure** typecheck — the constant *and* its
-  /// whole dependency closure are re-checked (matching `Ix.Claim.check addr
-  /// none`, the default of the Aiur `bench-typecheck --constant`). Pass
-  /// `--skip-deps` for a subject-only check (deps trusted). Composes with
-  /// `--execute` (cycles), plain prove (single leaf, subject-bound + verified),
-  /// and `--dump-input` (write the stdin for ziskemu profiling). Requires
-  /// exactly one `--ixe`. Note: a member of a mutual block selects the whole
-  /// block's work item (the kernel checks blocks atomically).
+  /// Comma-separated Lean names to check (each: closure sub-env → one leaf).
+  #[arg(
+    long,
+    value_delimiter = ',',
+    conflicts_with_all = ["shard_plan", "only_shard", "store_dir"]
+  )]
+  consts: Vec<String>,
+
+  /// Additional names from a file (one per line, `#` comments); unions with --consts.
   #[arg(long, conflicts_with_all = ["shard_plan", "only_shard", "store_dir"])]
-  constant: Option<String>,
-
-  /// Modifies `--constant`: check only the named constant itself, trusting its
-  /// dependencies (subject-only), instead of re-checking its whole transitive
-  /// closure. Reserved for constants too expensive to full-closure-check that
-  /// also can't be sharded. Same flag/semantics as the Aiur
-  /// `bench-typecheck --skip-deps`.
-  #[arg(long, requires = "constant")]
+  consts_file: Option<PathBuf>,
+
+  /// With --consts/--consts-file: check each subject only, trusting its deps.
+  // Validated in main (not clap `requires = "consts"`): names may come from
+  // --consts-file alone, which a clap-level `requires` would wrongly reject.
+  #[arg(long)]
   skip_deps: bool,
 
-  /// Cap on resident witness traces during the prove phase, bounding
-  /// peak host RAM per shard. Zisk's prover queues witnesses up to this
-  /// count before committing them; peak RAM ≈ N × avg-witness-size +
-  /// fixed overheads. Zisk's built-in default is 10 (tuned for
-  /// large-memory boxes); we default to 5 here as a safer fit for
-  /// ~256 GB machines. Override up to 10 on bigger boxes for maximum
-  /// parallelism, or down to 3 on smaller ones. See the Zisk section
-  /// of the top-level README for a per-RAM recommendation table. Has
-  /// no effect on `--execute` / `--verify-constraints` modes.
-  #[arg(long, default_value_t = 5)]
-  max_witness_stored: usize,
+  /// Write per-constant results JSON `{ "<name>": { … } }` (accumulated across names).
+  /// With `--shard-plan --execute` it instead gets one env-level row (totals +
+  /// per-shard cycles breakdown).
+  #[arg(long)]
+  json: Option<PathBuf>,
+
+  /// Benchmark key for the env-level row `--shard-plan --execute --json`
+  /// writes (e.g. the CamelCase env slug CI uses). Defaults to the manifest
+  /// file stem.
+  #[arg(long, requires = "shard_plan")]
+  json_name: Option<String>,
+
+  /// Enable tracing-texray; with --json, per-phase spans are written to <json>.spans.
+  #[arg(long)]
+  texray: bool,
+}
+
+/// Peak resident set size (bytes) across this process *and its children*, from
+/// tracing-texray's tree sampler; `None` while it still reports `0` (before
+/// `rss_sampler::start` has run, or off Linux). Unlike `/proc/self/status`
+/// this includes Zisk's ASM microservices (large ROMs mmapped in other PIDs).
+fn peak_rss_bytes() -> Option<u64> {
+  // A zero reading is the sampler's "nothing measured" value, not a real peak.
+  let sampled = tracing_texray::rss_sampler::peak_tree_rss_bytes();
+  // Report it as `None` so callers can tell "unknown" from a measurement.
+  (sampled != 0).then_some(sampled)
+}
+
+/// Abort as soon as a shard's guest reports typecheck failures. One rejected
+/// constant rejects the whole workload, so no further cycles (or proofs) are
+/// spent on the remaining shards — the same outcome as an OOM kill, which
+/// also cancels the rest. `ctx` names the shard in the error message. Callers
+/// write no `--json` row for a rejected workload; the CI harness matches on
+/// the "kernel typecheck produced" text to record the `failed` sentinel.
+fn reject_failures(publics: &ShardPublics, ctx: &str) -> Result<()> {
+  match publics.failures {
+    0 => Ok(()),
+    count => bail!(
+      "kernel typecheck produced {} failure(s) in {ctx}; \
+       aborting remaining shards",
+      count
+    ),
+  }
+}
+
+/// Merge the per-constant entry `{ "<name>": <metrics> }` into the neutral
+/// results JSON at `path`. A missing file starts an empty object; an existing
+/// object gains `name` (overwriting on collision), so `--consts a,b,c` builds
+/// up one entry per name, rewritten after every constant. Existing content
+/// that is not a JSON object is discarded with a warning on stderr. Errors on
+/// a read failure other than "not found", or when the write fails.
+fn write_json_entry(
+  path: &std::path::Path,
+  name: &str,
+  metrics: serde_json::Value,
+) -> Result<()> {
+  let mut map = match std::fs::read(path) {
+    Ok(bytes) => serde_json::from_slice(&bytes).unwrap_or_else(|e| {
+      eprintln!("warning: discarding unparseable {}: {e}", path.display());
+      serde_json::Map::new()
+    }),
+    Err(e) if e.kind() == std::io::ErrorKind::NotFound => Default::default(),
+    Err(e) => bail!("read {}: {e}", path.display()),
+  };
+  map.insert(name.to_string(), metrics);
+  let text = serde_json::to_string(&serde_json::Value::Object(map))?;
+  std::fs::write(path, text)
+    .map_err(|e| anyhow::anyhow!("write {}: {e}", path.display()))
+}
+
+/// Gather the constant names to check: the `--consts` values first, then the
+/// lines of `--consts-file` (text from `#` onward and blank lines ignored).
+/// Names are trimmed and de-duplicated, keeping first-seen order.
+fn collect_consts(args: &Args) -> Result<Vec<String>> {
+  let mut seen = std::collections::HashSet::<String>::new();
+  let mut names = Vec::<String>::new();
+  // Record a candidate unless it is blank or was already recorded.
+  let mut keep = |raw: &str| {
+    let name = raw.trim();
+    if !name.is_empty() && seen.insert(name.to_string()) {
+      names.push(name.to_string());
+    }
+  };
+  for name in &args.consts {
+    keep(name.as_str());
+  }
+  if let Some(path) = &args.consts_file {
+    let contents = std::fs::read_to_string(path)
+      .map_err(|e| anyhow::anyhow!("read {}: {e}", path.display()))?;
+    for line in contents.lines() {
+      keep(line.split('#').next().unwrap_or(""));
+    }
+  }
+  Ok(names)
 }
 
 /// 112-byte public output of one shard-guest proof.
@@ -761,11 +839,7 @@ fn check_input_coherence(
   Ok(failures)
 }
 
-fn build_client(
-  gpu: bool,
-  asm: bool,
-  max_witness_stored: Option<usize>,
-) -> Result<EmbeddedClient> {
+fn build_client(gpu: bool, asm: bool) -> Result<EmbeddedClient> {
   // Executor choice. The default is the Assembly executor (`asm = true`,
   // i.e. no `--emulator`): it is markedly faster at trace generation and is
   // the prerequisite for the hints stream. It historically broke under our
@@ -785,10 +859,8 @@ fn build_client(
   // docs ("Reduce memory footprint during proving at the cost of
   // speed"). We have ~94 GB of free GPU memory, so the speed
   // trade-off is the wrong direction for this hardware.
-  let mut opts = EmbeddedOpts::default();
-  if let Some(n) = max_witness_stored {
-    opts = opts.max_witness_stored(n);
-  }
+  // Zisk's default embedded opts (witness cap 10).
+  let opts = EmbeddedOpts::default();
   let mut builder: EmbeddedClientBuilder =
     ProverClient::embedded().with_embedded_opts(opts);
   if asm {
@@ -800,7 +872,7 @@ fn build_client(
   builder.build()
 }
 
-/// Check a single constant chosen by Lean NAME (the `--constant` path).
+/// Check a single constant chosen by Lean NAME (one iteration of `--consts`).
 /// Resolve name → constant address via the env's `named` metadata, map to its
 /// ingress block's work item, and ship its closure sub-env. By default the
 /// check-list is the ENTIRE closure (full-closure typecheck); with
@@ -895,18 +967,35 @@ async fn run_constant(
 
   // ---- Execute mode: cycles only, no proof. ----
   if args.execute {
+    let t0 = Instant::now();
     let result = client.execute(&SHARD_PROGRAM, stdin).run()?.await?;
+    let execute_secs = t0.elapsed().as_secs_f64();
+    tracing_texray::json_sink::record_manual("zisk/execute", execute_secs);
     let mut buf = [0u8; SHARD_PUBLICS_LEN];
     result.get_public_values_slice(&mut buf);
     let publics = ShardPublics::decode(&buf);
-    println!(
-      "cycles: {}, failures: {}",
-      result.get_execution_steps(),
-      publics.failures
-    );
+    let cycles = result.get_execution_steps();
+    println!("cycles: {cycles}, failures: {}", publics.failures);
     if publics.failures > 0 {
       bail!("kernel typecheck produced {} failure(s)", publics.failures);
     }
+    if let Some(path) = &args.json {
+      let tput =
+        if execute_secs > 0.0 { cycles as f64 / execute_secs } else { 0.0 };
+      write_json_entry(
+        path,
+        name,
+        serde_json::json!({
+          "cycles": cycles,
+          "execute-time": (execute_secs * 1e6).round() / 1e6,
+          "throughput": tput.round(),
+          // Named for what it measures (the execute phase's RSS high-water),
+          // matching bench-typecheck's execute-peak-rss; bare `peak-rss` is
+          // reserved for prove-phase peaks.
+          "execute-peak-rss": peak_rss_bytes(),
+        }),
+      )?;
+    }
     return Ok(());
   }
 
@@ -916,6 +1005,10 @@ async fn run_constant(
   result.get_public_values_slice(&mut buf);
   let publics = ShardPublics::decode(&buf);
   let leaf_ms = result.get_proving_time();
+  tracing_texray::json_sink::record_manual(
+    "zisk/prove",
+    leaf_ms as f64 / 1000.0,
+  );
   let expected = subject_of_cover(&cover);
   if *expected.as_bytes() != publics.subject_root {
     bail!(
@@ -944,6 +1037,17 @@ async fn run_constant(
       publics.failures
     );
   }
+  if let Some(path) = &args.json {
+    write_json_entry(
+      path,
+      name,
+      serde_json::json!({
+        "prove-time": (leaf_ms as f64).round() / 1000.0,
+        "steps": result.get_execution_steps(),
+        "peak-rss": peak_rss_bytes(),
+      }),
+    )?;
+  }
   Ok(())
 }
 
@@ -1239,29 +1343,86 @@ async fn run_shard_plan(
   }
 
   // ---- Execute mode: run each novel shard in the VM for cycles (no proof);
-  // store-covered shards have nothing to execute. ----
+  // store-covered shards have nothing to execute. With `--json`, one
+  // env-level row (keyed by `--json-name`, default the manifest stem)
+  // carries the totals plus a per-shard cycles breakdown under
+  // `shard-cycles` — the CI benchmark's per-shard tracking. ----
   if args.execute {
+    let t0 = Instant::now();
     let mut total_steps = 0u64;
-    let mut total_failures = 0u32;
+    let mut max_shard_cycles = 0u64;
+    let mut max_shard_peak: Option<u64> = None;
+    let mut shard_cycles = serde_json::Map::new();
+    let mut shard_time = serde_json::Map::new();
+    let mut shard_peak_rss = serde_json::Map::new();
     for &(idx, g) in &novel {
       let (check_list, sub_env, _cover) = build_inputs(g)?;
       let stdin = leaf_stdin(0, 0, &sub_env, &check_list);
+      // Windowed RAM high-water: reset before each shard so the per-shard
+      // peaks are independent; the env row's execute-peak-rss is their max.
+      tracing_texray::rss_sampler::reset_peak_tree_rss();
       let result = client.execute(&SHARD_PROGRAM, stdin).run()?.await?;
       let mut buf = [0u8; SHARD_PUBLICS_LEN];
       result.get_public_values_slice(&mut buf);
       let publics = ShardPublics::decode(&buf);
       let cycles = result.get_execution_steps();
+      let exec_secs = result.get_execution_time() as f64 / 1000.0;
+      let peak = peak_rss_bytes();
       total_steps += cycles;
-      total_failures = total_failures.saturating_add(publics.failures);
+      max_shard_cycles = max_shard_cycles.max(cycles);
+      max_shard_peak = max_shard_peak.max(peak);
+      // 1-based zero-padded keys: matches --only-shard's numbering and keeps
+      // the flattened bencher measure list (`shard-cycles:<k>`, …) sorted.
+      let key = format!("{:02}", idx + 1);
+      shard_cycles.insert(key.clone(), serde_json::json!(cycles));
+      shard_time.insert(key.clone(), serde_json::json!(exec_secs));
+      if let Some(p) = peak {
+        shard_peak_rss.insert(key, serde_json::json!(p));
+      }
       println!(
-        "  [shard {idx}] {} work items, failures={}, cycles={cycles}",
+        "  [shard {idx}] {} work items, failures={}, cycles={cycles}, \
+         {exec_secs:.1}s, peak {}",
         g.len(),
         publics.failures,
+        peak.map_or("?".to_string(), |p| format!(
+          "{:.2} GiB",
+          p as f64 / (1 << 30) as f64
+        )),
       );
+      reject_failures(&publics, &format!("shard {idx}"))?;
     }
-    println!("total cycles: {total_steps}, failures: {total_failures}");
-    if total_failures > 0 {
-      bail!("kernel typecheck produced {total_failures} failure(s)");
+    let execute_secs = t0.elapsed().as_secs_f64();
+    tracing_texray::json_sink::record_manual("zisk/execute", execute_secs);
+    println!("total cycles: {total_steps}, failures: 0");
+    if let Some(path) = &args.json {
+      let name = args.json_name.clone().unwrap_or_else(|| {
+        manifest_path
+          .file_stem()
+          .map(|s| s.to_string_lossy().into_owned())
+          .unwrap_or_else(|| "env".to_string())
+      });
+      let tput = if execute_secs > 0.0 {
+        total_steps as f64 / execute_secs
+      } else {
+        0.0
+      };
+      write_json_entry(
+        path,
+        &name,
+        serde_json::json!({
+          "cycles": total_steps,
+          "shards": novel.len(),
+          "max-shard-cycles": max_shard_cycles,
+          "execute-time": (execute_secs * 1e6).round() / 1e6,
+          "throughput": tput.round(),
+          // Max over the per-shard windows == the run's execution-phase
+          // high-water (setup RAM excluded by the resets above).
+          "execute-peak-rss": max_shard_peak,
+          "shard-cycles": shard_cycles,
+          "shard-time": shard_time,
+          "shard-peak-rss": shard_peak_rss,
+        }),
+      )?;
     }
     return Ok(());
   }
@@ -1307,6 +1468,7 @@ async fn run_shard_plan(
       leaf_ms as f64 / 1000.0,
       result.get_execution_steps(),
     );
+    reject_failures(&publics, &format!("shard {idx}"))?;
     // Bind each leaf: its committed subject must equal the env-derived merkle
     // root over the constants it certified. A guest that proved a different set
     // than the manifest assigned would commit a different root and fail here.
@@ -1611,6 +1773,24 @@ async fn main() -> Result<()> {
 
   let args = Args::parse();
 
+  // Start the process-tree RSS sampler so `peak_rss_bytes()` reflects the ASM
+  // microservices' memory, and point the per-phase timing sink at the drill-down
+  // file if requested. Both are independent of the SDK's global tracing logger.
+  //
+  // TODO(spans): the sink only receives the coarse `zisk/execute` / `zisk/prove`
+  // phases we `record_manual` below. For a finer drill-down (setup, trace-gen,
+  // per-microservice), install a TeXRay subscriber and examine the zisk-sdk's
+  // own tracing spans — which requires composing it with the SDK's global logger
+  // (`zisk_sdk::setup_logger`), currently the sole subscriber.
+  tracing_texray::rss_sampler::start(std::time::Duration::from_millis(50));
+  // With --texray + --json, per-phase span timings land at `<json>.spans` as
+  // JSON Lines — the CI drill-down input.
+  if args.texray {
+    if let Some(json) = args.json.as_ref().and_then(|p| p.to_str()) {
+      let _ = tracing_texray::json_sink::to_file(&format!("{json}.spans"));
+    }
+  }
+
   // Collect inputs. No `--ixe` → a single empty env (back-compat).
   let inputs: Vec<Option<PathBuf>> = if args.ixe.is_empty() {
     vec![None]
@@ -1633,9 +1813,18 @@ async fn main() -> Result<()> {
       "--shard-plan requires exactly one --ixe input (the env the manifest was built for)"
     );
   }
-  // `--constant` selects a named constant from one env.
-  if args.constant.is_some() && inputs.len() > 1 {
-    bail!("--constant requires exactly one --ixe input");
+  // Named constants (from --consts and/or --consts-file) select from one env.
+  let consts = collect_consts(&args)?;
+  if !consts.is_empty() && inputs.len() > 1 {
+    bail!("--consts/--consts-file requires exactly one --ixe input");
+  }
+  if consts.is_empty() && args.skip_deps {
+    bail!("--skip-deps requires constants via --consts or --consts-file");
+  }
+  if consts.is_empty() && args.json.is_some() && args.shard_plan.is_none() {
+    bail!(
+      "--json requires constants via --consts/--consts-file, or --shard-plan"
+    );
   }
 
   // ---- Plan every input up front (parse + shard). ----
@@ -1699,8 +1888,7 @@ async fn main() -> Result<()> {
   let grand_target_count: usize = plans.iter().map(|p| p.target_count).sum();
   let total_leaves: usize = plans.iter().map(|p| p.shards.len()).sum();
 
-  let client =
-    build_client(args.gpu, !args.emulator, Some(args.max_witness_stored))?;
+  let client = build_client(args.gpu, !args.emulator)?;
   client.setup(&SHARD_PROGRAM).run()?.await?;
   // Skip agg-guest setup unless we'll produce more than one leaf proof.
   // The shard-plan path sets up the agg program itself, after its leaves.
@@ -1729,9 +1917,11 @@ async fn main() -> Result<()> {
     return Ok(());
   }
 
-  // ---- Single named constant (no manifest/range). ----
-  if let Some(name) = &args.constant {
-    run_constant(&client, &plans[0], name, &args).await?;
+  // ---- Named constants (no manifest/range). Loops one leaf per name. ----
+  if !consts.is_empty() {
+    for name in &consts {
+      run_constant(&client, &plans[0], name, &args).await?;
+    }
     return Ok(());
   }
 
@@ -1739,7 +1929,6 @@ async fn main() -> Result<()> {
   if args.execute {
     let mut total_steps: u64 = 0;
     let mut total_exec_ms: u64 = 0;
-    let mut total_failures: u32 = 0;
     for plan in &plans {
       let num_shards = plan.shards.len();
       for (i, &(start, end)) in plan.shards.iter().enumerate() {
@@ -1751,19 +1940,22 @@ async fn main() -> Result<()> {
         let cycles = result.get_execution_steps();
         total_steps += cycles;
         total_exec_ms += result.get_execution_time();
-        total_failures = total_failures.saturating_add(publics.failures);
         println!(
           "  [{} shard {}/{num_shards}] range [{start}, {end}), failures={}, cycles={cycles}",
           plan.label,
           i + 1,
           publics.failures,
         );
+        reject_failures(
+          &publics,
+          &format!("{} shard {}/{num_shards}", plan.label, i + 1),
+        )?;
       }
     }
     let total_exec = Duration::from_millis(total_exec_ms);
     let throughput =
       grand_target_count as f64 / total_exec.as_secs_f64().max(f64::EPSILON);
-    println!("failures: {total_failures}");
+    println!("failures: 0");
     println!("cycles: {total_steps}");
     println!("inputs: {}", plans.len());
     println!("work items: {grand_total_items}");
@@ -1773,9 +1965,6 @@ async fn main() -> Result<()> {
       total_exec.as_secs_f64(),
       throughput.human_throughput("consts"),
     );
-    if total_failures > 0 {
-      bail!("kernel typecheck produced {total_failures} failure(s)");
-    }
     return Ok(());
   }
 
@@ -1828,6 +2017,10 @@ async fn main() -> Result<()> {
         (leaf_ms as f64) / 1000.0,
         result.get_execution_steps(),
       );
+      reject_failures(
+        &publics,
+        &format!("{} leaf {}/{num_shards}", plan.label, i + 1),
+      )?;
       leaf_proof_bytes.push(result.get_proof_bytes()?);
       input_publics.push(publics);
       last_leaf_result = Some(result);
@@ -2023,3 +2216,100 @@ mod closure_tests {
     );
   }
 }
+
+/// Parse-level tests for the host CLI: how `--consts`, `--consts-file`,
+/// `--skip-deps`, `--json` and their conflicts parse, and what
+/// `collect_consts` returns for combined inputs.
+#[cfg(test)]
+mod cli_tests {
+  use clap::Parser;
+
+  use super::{Args, collect_consts};
+
+  /// Runs clap on `argv` with `zisk-host` prepended as the program name.
+  fn try_parse(argv: &[&str]) -> Result<Args, clap::Error> {
+    Args::try_parse_from(
+      std::iter::once("zisk-host").chain(argv.iter().copied()),
+    )
+  }
+
+  /// Parses `argv`; panics if clap rejects it.
+  fn parse(argv: &[&str]) -> Args {
+    try_parse(argv).expect("parse ok")
+  }
+
+  /// Parses `argv`, which clap must reject; returns the rendered error text.
+  fn parse_err(argv: &[&str]) -> String {
+    match try_parse(argv) {
+      Ok(_) => panic!("expected clap to reject {argv:?}"),
+      Err(e) => e.to_string(),
+    }
+  }
+
+  #[test]
+  fn consts_splits_on_comma() {
+    let a = parse(&["--consts", "Nat.add_comm,Nat.succ,String.append"]);
+    assert_eq!(a.consts, vec!["Nat.add_comm", "Nat.succ", "String.append"]);
+  }
+
+  #[test]
+  fn consts_repeatable_and_comma_lists_stack() {
+    let a = parse(&["--consts", "a", "--consts", "b,c"]);
+    assert_eq!(a.consts, vec!["a", "b", "c"]);
+  }
+
+  #[test]
+  fn skip_deps_and_json_parse_with_consts_file_only() {
+    // --skip-deps/--json need names, but names may come from --consts-file
+    // alone — clap must accept the parse (main validates after
+    // collect_consts).
+    let a = parse(&[
+      "--consts-file",
+      "names.txt",
+      "--skip-deps",
+      "--json",
+      "out.json",
+    ]);
+    assert!(a.skip_deps);
+    assert_eq!(a.json.as_deref(), Some(std::path::Path::new("out.json")));
+  }
+
+  #[test]
+  fn consts_conflicts_with_shard_plan() {
+    let s = parse_err(&["--consts", "a", "--shard-plan", "p.ixes"]);
+    assert!(
+      s.contains("shard-plan") || s.contains("shard_plan"),
+      "unexpected clap error: {s}"
+    );
+  }
+
+  #[test]
+  fn consts_conflicts_with_only_shard() {
+    let s = parse_err(&["--consts", "a", "--only-shard", "1"]);
+    assert!(
+      s.contains("only-shard") || s.contains("only_shard"),
+      "unexpected clap error: {s}"
+    );
+  }
+
+  #[test]
+  fn collect_unions_and_dedups() {
+    // Per-process file name: concurrent test runs sharing a temp dir must
+    // not clobber each other's fixture.
+    let path = std::env::temp_dir()
+      .join(format!("zisk_host_cli_test_consts_{}.txt", std::process::id()));
+    std::fs::write(&path, "a\nb\n# comment\n  c  \n\na\n").expect("write");
+    let a = parse(&[
+      "--consts",
+      "a,d",
+      "--consts-file",
+      path.to_str().expect("temp path is valid UTF-8"),
+    ]);
+    let got = collect_consts(&a);
+    // Clean up before asserting so a failing check does not leak the file.
+    let _ = std::fs::remove_file(&path);
+    // Pinned result: --consts names first, then file names; the duplicate
+    // `a`, the `#` line and the blank line add nothing, and `c` is trimmed.
+    assert_eq!(got.expect("collect"), vec!["a", "d", "b", "c"]);
+  }
+}