diff --git a/.github/actions/bencher-track/action.yml b/.github/actions/bencher-track/action.yml index 498f19af..e77b2b9e 100644 --- a/.github/actions/bencher-track/action.yml +++ b/.github/actions/bencher-track/action.yml @@ -12,7 +12,7 @@ inputs: description: Bencher testbed slug. required: true workload: - description: Workload key for the `bencher-thresholds-reset-` tag (e.g. ix-compile, aiur). + description: Workload key for the `bencher-thresholds-reset-` tag (ix-compile, aiur-check, zisk-check, sp1-check, ooc-check). required: true file: description: Bencher Metric Format JSON file to upload. diff --git a/.github/actions/install-sp1/action.yml b/.github/actions/install-sp1/action.yml new file mode 100644 index 00000000..91ad4d6b --- /dev/null +++ b/.github/actions/install-sp1/action.yml @@ -0,0 +1,31 @@ +name: Install SP1 +description: >- + Install the system build deps and the SP1 zkVM toolchain (sp1up) needed to + build and run the SP1 host. Assumes a Rust toolchain is already set up. + +runs: + using: composite + steps: + # The shared zkVM apt superset (the ZisK book's full Ubuntu list — the + # prebuilt cargo tooling and proofman's C++ link OpenMPI/OpenMP/GMP/ + # nlohmann-json/nasm/secp256k1/… — plus pkg-config + libssl-dev for SP1's + # host crates). The Nix shells provided this; a bare runner doesn't. + - name: Install system build deps + shell: bash + run: | + # Some warpbuild images ship an unreachable azure mirror that hangs + # `apt-get update`; drop it first (no-op elsewhere). + sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true + sudo apt-get update + sudo apt-get install -y \ + xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \ + nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \ + libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \ + openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \ + pkg-config libssl-dev + - name: Install SP1 toolchain (sp1up, latest) + shell: bash + run: | + curl -L https://sp1up.succinct.xyz | bash + ~/.sp1/bin/sp1up + echo "$HOME/.sp1/bin" >> "$GITHUB_PATH" diff --git a/.github/actions/install-zisk/action.yml b/.github/actions/install-zisk/action.yml new file mode 100644 index 00000000..604fcec8 --- /dev/null +++ b/.github/actions/install-zisk/action.yml @@ -0,0 +1,95 @@ +name: Install Zisk +description: >- + Install the system build deps, the ZisK zkVM toolchain (ziskup, CPU build), + and — unless `proving-key: false` — the fork-matching proving key needed to + RUN the Zisk host. Execute needs the key too (zisk-host's `client.setup()` + loads the circuit's const-tree files before either the execute or the prove + branch), but BUILDING the host does not, so build-only callers skip the + ~3 GB download + const-tree regeneration. Assumes a Rust toolchain is + already set up. + +inputs: + proving-key: + description: >- + Install the fork-matching proving key (required to execute or prove; + not needed to build). Set false for build-only jobs. + required: false + default: "true" + +runs: + using: composite + steps: + # The shared zkVM apt superset — the ZisK book's full Ubuntu list (prebuilt + # cargo-zisk + proofman's C++ link OpenMPI/OpenMP/GMP/nlohmann-json/nasm/ + # secp256k1/…), kept identical to install-sp1 so a host that links both is + # covered. The Nix shells provided this; a bare runner doesn't. + - name: Install system build deps + shell: bash + run: | + # Some warpbuild images ship an unreachable azure mirror that hangs + # `apt-get update`; drop it first (no-op elsewhere). + sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true + sudo apt-get update + sudo apt-get install -y \ + xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \ + nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \ + libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \ + openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \ + pkg-config libssl-dev + # `--version 0.18.0` pins the toolchain to match our deps. Our host links the + # argumentcomputer/zisk `blake3-precompile` fork, which is based on v0.18.0 + # (its cargo-zisk has `check-setup`, used below to regenerate the key's + # const-trees). Without the pin, ziskup installs `releases/latest`, which + # resolves to upstream `v1.0.0-alpha` — a different circuit whose cargo-zisk + # dropped the `check-setup` subcommand, breaking the key step. `--cpu` picks + # the CPU build (no GPU on the runner) and `--nokey` skips ziskup's key + # install — both avoid its interactive /dev/tty prompts. We keep `--nokey` + # because the upstream `zisk-setup` bucket only carries the upstream circuit's + # key; our fork has a different circuit (extra Blake3f AIR), so we restore the + # fork-matching key from our own S3 in the next step. `--prefix $HOME/.zisk` + # pins the install where cargo-zisk's ZiskPaths fallback looks (the runner + # sets XDG_CONFIG_HOME, which would otherwise relocate it). + - name: Install Zisk toolchain (ziskup, pinned v0.18.0) + shell: bash + run: | + curl -L https://raw.githubusercontent.com/0xPolygonHermez/zisk/main/ziskup/install.sh \ + | bash -s -- --cpu --nokey -y --version 0.18.0 --prefix "$HOME/.zisk" + echo "$HOME/.zisk/bin" >> "$GITHUB_PATH" + # Pre-build the proofman C++ sys crate ALONE so its build script runs + # exactly once before any parallel zisk-host build. zisk-host pulls + # zisk-sdk as both a dependency and a build-dependency, so cargo compiles + # proofman-starks-lib-c as two units whose build scripts can run + # CONCURRENTLY — and both run `make` inside the SHARED + # ~/.cargo/git/checkouts/pil2-proofman-* source dir. On a cold runner the + # Makefile stamp is absent, so both units take the `make clean` + + # `make -j` path and race: one unit's clean deletes build/ while the + # other's g++ is mid-compile ("opening dependency file ….d: No such file + # or directory"). Building the crate solo writes the stamp; the second + # unit then skips the clean and its `make -j` is a no-op. + # (Proper fix is an flock in pil2-proofman's build.rs — upstream.) + - name: Pre-build proofman-starks-lib-c (serialize the shared make) + shell: bash + run: cargo build --release -p proofman-starks-lib-c + working-directory: zisk + # Execute still needs a proving key present: zisk-host calls `client.setup()` + # (which the SDK runs before the execute branch), and that loads the circuit's + # const-tree files. We host the fork-matching key in a public S3 bucket + # WITHOUT the const-trees — exactly like Zisk's released + # `zisk-provingkey-*.tar.gz` on `storage.googleapis.com/zisk-setup` — and + # regenerate them here with `cargo-zisk check-setup -a`, which is how `ziskup` + # itself populates them. That keeps the artifact ~3 GB (gzip) instead of + # ~48 GB. The object name carries the fork rev so a circuit change can't + # silently reuse a stale key. Public bucket → plain curl, no AWS creds. + - name: Restore Zisk proving key (fork circuit) from S3 + if: inputs.proving-key == 'true' + shell: bash + run: | + mkdir -p "$HOME/.zisk" + curl -fSL --retry 3 \ + https://argument-zisk-setup.s3.amazonaws.com/zisk-provingkey-blake3-8f9e24d5-cpu.tar.gz \ + -o /tmp/zisk-provingkey.tar.gz + tar -C "$HOME/.zisk" -xzf /tmp/zisk-provingkey.tar.gz + rm -f /tmp/zisk-provingkey.tar.gz + # Regenerate the const-tree files omitted from the artifact (CPU build, so + # no --gpu). This is the "may take a while" step ziskup prints. + cargo-zisk check-setup --proving-key "$HOME/.zisk/provingKey" -a diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py new file mode 100644 index 00000000..27e595ba --- /dev/null +++ b/.github/scripts/bench.py @@ -0,0 +1,704 @@ +#!/usr/bin/env python3 +"""All data-wrangling for the `!benchmark` PR workflow, as subcommands: + + parse COMMENT_BODY env → matrix + config (writes $GITHUB_OUTPUT) + manifest Benchmarks/Vectors.csv → the constant names for one cell + bmf neutral results JSON → Bencher Metric Format (bench-main.yml) + fetch-main base SHA + cell → main.json pulled from bencher.dev + compare main.json + pr.json → a Markdown main-vs-PR table + comment per-cell table files → the final PR comment body + +The neutral results JSON every backend normalises to (see run.sh) is +`{ "": { "": , ... }, ... }`. Most metrics are +lower-is-better (a positive Δ% is a regression); the exceptions live in +HIGHER_IS_BETTER (throughput), where the polarity is flipped. +""" +import argparse +import glob +import json +import os +import time +import urllib.parse +import urllib.request + + +# ─────────────────── backend identity table ──────────────────── +# Single source of truth for what each backend is: +# default_mode — what `!benchmark ` runs. The bare `execute` token +# switches to the "execute" metrics entry when one exists (only aiur has +# a real choice: `prove` is the full pipeline; `execute` skips Phase 2 via +# `--execute-only`). +# testbed — the bencher testbed bench-main.yml uploads main's numbers to. +# MUST match that workflow's `testbed:` strings; fetch-main fails a cell +# loudly (exit 2) when a (backend, mode) has no entry here, so drift shows +# up as a red cell instead of a silent local-rebuild fallback. +# metrics — compare-table columns per supported mode. aiur's execute entry +# reads the SAME testbed as prove (bencher stores only prove runs; the +# execute-side columns — incl. execute-peak-rss, sampled at the Phase 1/2 +# boundary — are extracted from that JSON, apples-to-apples). +# `compile` benchmarks `ix compile .lean → .ixe`; its benchmark name +# on bencher is the CamelCase env slug (ENV_CC below). +BACKEND_TABLE = { + "aiur": { + "default_mode": "prove", + "testbed": "aiur-check-x64-32x", + "metrics": { + "prove": ["fft-cost", "execute-time", "prove-time", "verify-time", + "proof-size", "peak-rss"], + "execute": ["fft-cost", "execute-time", "execute-peak-rss"], + }, + }, + "zisk": { + "default_mode": "execute", + "testbed": "zisk-check-x64-32x", + "metrics": { + "execute": ["cycles", "execute-time", "throughput", "execute-peak-rss"], + }, + }, + "sp1": { + "default_mode": "execute", + "testbed": "sp1-check-x64-32x", + "metrics": { + "execute": ["cycles", "execute-time", "throughput", "execute-peak-rss"], + }, + }, + "ooc": { + "default_mode": "execute", + "testbed": "ooc-check-x64-32x", + "metrics": { + "execute": ["throughput", "check-time", "peak-rss"], + }, + }, + "compile": { + "default_mode": "compile", + "testbed": "ix-compile-x64-32x", + "metrics": { + "compile": ["compile-time", "throughput", "file-size", "constants"], + }, + }, +} +BACKENDS = tuple(BACKEND_TABLE) +ENVS = ("initStd", "lean", "mathlib") +# CamelCase benchmark key per env — must match bench-main.yml's matrix.bench +# values (the names bencher stores env-keyed rows under: ooc whole-env, +# compile). One explicit table, not a first-letter-upper derivation, so an +# env whose CamelCase isn't mechanical (e.g. a future `flt` → `FLT`) can't +# silently diverge from the workflow. +ENV_CC = {"initStd": "InitStd", "lean": "Lean", "mathlib": "Mathlib"} +CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_FULL"} +PASSTHROUGH_KEYS = {"RUST_LOG", "WITHOUT_VK_VERIFICATION", "RUSTFLAGS"} + + +RUNNER = "warp-ubuntu-latest-x64-32x" + + +def cmd_parse(_a): + body = os.environ.get("COMMENT_BODY", "") + lines = [ln.replace("\r", "") for ln in body.splitlines()] + cmd = next((ln for ln in lines if "!benchmark" in ln), "") + toks = cmd.split("!benchmark", 1)[1].split() if "!benchmark" in cmd else [] + + backends, execute_flag = [], False + for t in (t.lower() for t in toks): + if t == "all": + backends = list(BACKENDS) + elif t in BACKENDS and t not in backends: + backends.append(t) + elif t == "execute": + execute_flag = True + if not backends: + backends = ["aiur"] + + cfg, passthrough = {}, [] + for ln in lines[(lines.index(cmd) + 1) if cmd in lines else 0:]: + s = ln.strip() + if not s or "=" not in s: + continue + key, val = (x.strip() for x in s.split("=", 1)) + if key in CONFIG_KEYS: + cfg[key] = val + elif key in PASSTHROUGH_KEYS: + passthrough.append(f"{key}={val}") + + envs = [e.strip() for e in cfg.get("BENCH_ENVS", "initStd").split(",") if e.strip()] + envs = [e for e in envs if e in ENVS] or ["initStd"] + tier = cfg.get("BENCH_TIER", "") + if tier not in ("cheap", "heavy", "all"): + tier = "" # empty ⇒ derived from mode at manifest time + shard = "1" if cfg.get("BENCH_SHARD") == "1" else "0" + full = "1" if cfg.get("BENCH_FULL") == "1" else "0" # full set vs primary subset + + def mode_for(b): + # The bare `execute` token selects a backend's execute entry when it + # has one — a real switch only for aiur (everything else already + # defaults to execute, or has no execute mode at all: compile). + if execute_flag and "execute" in BACKEND_TABLE[b]["metrics"]: + return "execute" + return BACKEND_TABLE[b]["default_mode"] + + cells = [] + for b in backends: + m = mode_for(b) + for e in envs: + cells.append({"backend": b, "env": e, "mode": m, + "runner": RUNNER, + "label": f"{b}-{e}-{m}"}) + + modes = " ".join(f"{b}={mode_for(b)}" for b in backends) + summary = (f"backends: `{modes}` · envs: `{','.join(envs)}` · " + f"set: `{'full' if full == '1' else 'primary'}` · " + f"tier: `{tier or 'auto'}` · shard: `{shard}`") + if passthrough: + summary += " · env: `" + " ".join(passthrough) + "`" + + with open(os.environ.get("GITHUB_OUTPUT", "/dev/stdout"), "a") as f: + f.write(f"matrix={json.dumps(cells)}\n") + f.write(f"tier={tier}\nshard={shard}\nfull={full}\n") + f.write(f"config-summary={summary}\n") + f.write("passthrough-env<= 4 else "0" + rep = cols[4] if len(cols) >= 5 else "0" + if env != a.env: + continue + if a.primary and rep != "1": + continue + if tier in ("cheap", "heavy") and ctier != tier: + continue + if a.shard == "1" and shard != "1": + continue + names.append(name) + if ctier == "heavy": + heavy.append(name) + with open(a.out, "w") as f: + f.write("\n".join(names) + ("\n" if names else "")) + # The selected names that are heavy-tier — the subset the zisk cells run + # through the closure-sharded pipeline (ix extract → profile → shard) + # instead of a single full-closure leaf. + if a.heavy_out: + with open(a.heavy_out, "w") as f: + f.write("\n".join(heavy) + ("\n" if heavy else "")) + print(f"count={len(names)}\ntier={tier}") + + +# ───────────────────────── compare ───────────────────────── +def _num(d, name, metric): + v = d.get(name, {}).get(metric) + return v if isinstance(v, (int, float)) else None + + +# Per-metric formatting kind. Metric names are the neutral-JSON keys the tools +# emit (see BACKEND_TABLE). Unknown metrics fall through to `_human_auto`. +_METRIC_KIND = { + # bytes + "peak-rss": "bytes", + "execute-peak-rss": "bytes", + "file-size": "bytes", + "proof-size": "bytes", + # seconds + "execute-time": "seconds", + "prove-time": "seconds", + "verify-time": "seconds", + "check-time": "seconds", + "compile-time": "seconds", + # large counts (10^6+ typical) + "fft-cost": "count", + "cycles": "count", + "steps": "count", + "max-shard-cycles": "count", + "throughput": "count", + # small integers + "constants": "int", + "shards": "int", +} + + +def _human_bytes(v): + v = float(v) + for unit in ("B", "KiB", "MiB", "GiB", "TiB"): + if abs(v) < 1024: + return f"{int(v):,} {unit}" if unit == "B" else f"{v:,.2f} {unit}" + v /= 1024 + return f"{v:,.2f} PiB" + + +def _human_seconds(v): + v = float(v) + if abs(v) < 1e-3: + return f"{v * 1e6:.1f} µs" + if abs(v) < 1: + return f"{v * 1e3:.1f} ms" + if abs(v) < 60: + return f"{v:.3f} s" + m, s = divmod(v, 60) + return f"{int(m)}m {s:.1f}s" + + +def _human_count(v): + v = float(v) + if abs(v) < 1000: + return f"{int(v):,}" if v == int(v) else f"{v:.3f}" + for unit in ("K", "M", "B", "T"): + v /= 1000 + if abs(v) < 1000: + return f"{v:.2f}{unit}" + return f"{v:.2f}Q" + + +def _human_auto(v): + if isinstance(v, int) or (isinstance(v, float) and v.is_integer()): + return f"{int(v):,}" + return f"{v:,.3f}" + + +def _human(v, metric=None): + if v is None: + return "n/a" + kind = _METRIC_KIND.get(metric, "auto") + if kind == "bytes": return _human_bytes(v) + if kind == "seconds": return _human_seconds(v) + if kind == "count": return _human_count(v) + if kind == "int": return f"{int(v):,}" + return _human_auto(v) + + +# Metrics where a LARGER value is the improvement. Everything else is +# lower-is-better (times, RAM, cycles, fft-cost, sizes). +HIGHER_IS_BETTER = {"throughput"} + + +def _delta(main, pr): + if main is None or pr is None or main == 0: + return None + return (pr - main) / main * 100.0 + + +def _badness(dp, metric): + """Signed regression magnitude: positive ⇒ the PR got worse on `metric`. + For lower-is-better metrics that's a positive Δ%; for higher-is-better + (throughput) it's a negative Δ%.""" + if dp is None: + return None + return -dp if metric in HIGHER_IS_BETTER else dp + + +# Ratio direction words per metric kind (grew, shrank). Rates and times read +# as faster/slower; sizes as larger/smaller; counts (cycles, fft-cost, …) as +# more/fewer — "1.15× slower" is meaningless for a byte or count metric. +_RATIO_WORDS = { + "seconds": ("slower", "faster"), + "bytes": ("larger", "smaller"), + "count": ("more", "fewer"), + "int": ("more", "fewer"), +} + + +def _ratio(main, pr, metric): + """(factor, direction word) with `factor` always ≥ 1.0. Wording follows + the metric's kind and polarity: throughput (a rate) and the time metrics + read as faster/slower, sizes as larger/smaller, counts as more/fewer. + Returns None if either side is missing or non-positive.""" + if main is None or pr is None or main <= 0 or pr <= 0: + return None + grew = pr >= main + factor = pr / main if grew else main / pr + if metric in HIGHER_IS_BETTER: # rate: more per second = faster + return (factor, "faster" if grew else "slower") + kind = _METRIC_KIND.get(metric, "auto") + words = _RATIO_WORDS.get(kind, ("larger", "smaller")) + return (factor, words[0] if grew else words[1]) + + +def _load(path): + try: + with open(path) as f: + d = json.load(f) + return d if isinstance(d, dict) else {} + except (FileNotFoundError, json.JSONDecodeError): + return {} + + +# TODO: re-add the per-constant Aiur phase (sub-span) drill-down. run.sh's +# `merge_phases` still folds tracing-texray JSON-Lines into `phases: { span: +# seconds }` on each entry; the compare renderer previously emitted a +# collapsible `
` block per constant showing main-vs-PR per-span deltas +# so a regression could be traced to `aiur/execute`, `aiur/witness`, +# `stark/fri_open`, etc. Removed while the compare surface is being stabilised; +# reinstate once we've settled on the primary table's flag/threshold semantics. + + +def cmd_compare(a): + metrics = a.metric or BACKEND_TABLE.get(a.backend, {}).get("metrics", {}).get(a.mode) + if not metrics: + raise SystemExit("compare: pass --metric or a known --backend/--mode") + title = a.title + if title is None and a.backend: + src = a.main_source or "unknown" + cnt = f"{a.count} constants · " if a.count else "" + title = f"### `{a.backend}` · `{a.env}` · `{a.mode}` — {cnt}main from: {src}" + + def emit(text): + if a.out: + open(a.out, "w").write(text + "\n") + else: + print(text) + + main_d, pr_d = _load(a.main), _load(a.pr) + names = sorted(set(main_d) | set(pr_d)) + if not names: + emit((title or "") + "\n\n_No results were produced (every constant failed, " + "timed out, or was dropped). See the workflow logs._") + return + # One side empty while the other measured is almost always a broken side + # (e.g. the base-run fallback hit a CLI-incompatible base), not a real + # all-regressed/all-new comparison — say so instead of a silent n/a column. + side_note = "" + if not main_d: + side_note = ("\n\n_⚠️ main produced no results — the base-side run " + "failed entirely (often a CLI-incompatible base binary " + "when bencher had no data). Deltas unavailable; see the " + "workflow logs._") + elif not pr_d: + side_note = ("\n\n_⚠️ the PR side produced no results — every " + "constant failed or was dropped. See the workflow logs._") + + primary = metrics[0] + names.sort(key=lambda n: (0, -v) if (v := (_num(pr_d, n, primary) + if _num(pr_d, n, primary) is not None else _num(main_d, n, primary))) is not None + else (1, 0)) + + head = ["constant"] + for m in metrics: + head += [f"{m} (main)", f"{m} (PR)", "Δ%"] + rows = ["| " + " | ".join(head) + " |", "|" + "|".join(["---"] * len(head)) + "|"] + + def _oom(d, n): + return isinstance(d.get(n), dict) and d[n].get("oom") is True + + def _failed(d, n): + return isinstance(d.get(n), dict) and d[n].get("failed") is True + + regressed, improved = set(), set() + failures = [] # (name, side) — typecheck failures, surfaced loudly below + worst = None # (badness, dp, name, metric) + for n in names: + cells = [f"`{n}`"] + main_oom, pr_oom = _oom(main_d, n), _oom(pr_d, n) + main_failed, pr_failed = _failed(main_d, n), _failed(pr_d, n) + if main_failed: + failures.append((n, "main")) + if pr_failed: + failures.append((n, "PR")) + for m in metrics: + mv, pv = _num(main_d, n, m), _num(pr_d, n, m) + # An OOM entry may still carry real Phase-1 measurements (run.sh + # merges the sentinel into whatever was recorded before the kill); + # render those, and OOM only for the metrics the kill prevented. + # A typecheck FAILURE outranks everything — the constant is + # rejected, not benchmarked. Spell it out in the cell: a bare ❌ + # would read as any generic failure. + mv_h = ("❌ failed typecheck" if main_failed + else "OOM" if (main_oom and mv is None) else _human(mv, m)) + pv_h = ("❌ failed typecheck" if pr_failed + else "OOM" if (pr_oom and pv is None) else _human(pv, m)) + dp = _delta(mv, pv) + bad = _badness(dp, m) + cell = "n/a" if dp is None else f"{dp:+.1f}%" + if dp is not None: + # Ratio only when the change is big enough that "1.18× slower" + # carries new signal beyond the percentage — sub-5% deltas would + # just add `(1.03× slower)` noise to the cell. + r = _ratio(mv, pv, m) + if r is not None and r[0] >= 1.05: + cell += f" ({r[0]:.2f}× {r[1]})" + if bad > a.threshold: + cell += " ⚠️"; regressed.add(n) + elif bad < -a.threshold: + cell += " 🟢"; improved.add(n) + if worst is None or bad > worst[0]: + worst = (bad, dp, n, m) + cells += [mv_h, pv_h, cell] + rows.append("| " + " | ".join(cells) + " |") + + out = ([title, ""] if title else []) + rows + [""] + # Typecheck failures first and loud — a constant the kernel REJECTS is a + # correctness signal, not a benchmark blip. + for n, side in failures: + out.append(f"❌ **`{n}` FAILED TO TYPECHECK on the {side} side** — " + "the kernel rejected it; see the workflow logs.") + if failures: + out.append("") + s = (f"_{len(names)} constants · {len(regressed)} regressed · " + f"{len(improved)} improved (|Δ| > {a.threshold:g}% on any metric)._") + if worst and worst[0] is not None and worst[0] > a.threshold: + s += f" Worst: `{worst[2]}` `{worst[3]}` {worst[1]:+.1f}%." + out.append(s) + if side_note: + out.append(side_note.strip()) + # TODO: emit per-constant phase drill-down (see the TODO by _phase_details). + emit("\n".join(out)) + + +# ───────────────────────── comment ───────────────────────── +def cmd_comment(a): + commit = f"[`{a.head[:7]}`]({a.repo_url}/commit/{a.head})" + parts = [f"## `!benchmark` — main vs {commit}", "", a.summary, ""] + tables = sorted(glob.glob(os.path.join(a.tables, "table-*.md"))) + if tables: + for t in tables: + parts += [open(t).read().rstrip(), ""] + else: + parts += ["_No result tables were produced — see the workflow logs._", ""] + parts.append(f"[Workflow logs]({a.repo_url}/actions/runs/{a.run_id})") + open(a.out, "w").write("\n".join(parts) + "\n") + print(open(a.out).read()) + + +# ──────────────────────── bmf ───────────────────────── +def cmd_bmf(a): + """Neutral results JSON → Bencher Metric Format. + + One converter for every bench-main.yml upload site (previously four + hand-copied jq pipelines): flattens each entry's `phases` object into + `phase:` measures, strips the boolean `oom` sentinel (BMF values + must be numeric — one boolean would fail the whole `bencher run` upload; + the sentinel is for the PR comparison table only), and drops entries left + with no measures. + """ + with open(a.infile) as f: + neutral = json.load(f) + out = {} + for name, entry in (neutral or {}).items(): + if not isinstance(entry, dict): + continue + measures = {} + for k, v in entry.items(): + if k in ("oom", "failed"): + continue + # Nested objects are per-sub-measure breakdowns: `phases` (span → + # seconds) flattens to `phase:`; anything else (e.g. the + # zisk env row's `shard-cycles`) to `:`. Both stay + # un-thresholded on bencher (dynamic names). + if isinstance(v, dict): + prefix = "phase" if k == "phases" else k + for sub, sv in v.items(): + if isinstance(sv, (int, float)) and not isinstance(sv, bool): + measures[f"{prefix}:{sub}"] = {"value": sv} + elif isinstance(v, (int, float)) and not isinstance(v, bool): + measures[k] = {"value": v} + if measures: + out[name] = measures + with open(a.out, "w") as f: + json.dump(out, f, indent=1) + print(f"bmf: {len(out)} benchmark(s) → {a.out}") + + +# ─────────────────────── fetch-main ────────────────────── +def cmd_fetch_main(a): + """Pull the base SHA's neutral results JSON from bencher.dev. + + The testbed comes from BACKEND_TABLE — supported (backend, mode) pairs are + exactly the table's metrics keys. Exit codes are load-bearing for + bench-pr.yml: 3 = transient (bencher has no report at that hash yet, or + the API failed after retries) — the caller falls back to running main + locally; 2 = permanent config error ((backend, mode) not in BACKEND_TABLE, + i.e. table / bench-main.yml drift) — the caller fails the cell loudly + instead of paying the fallback forever. + + A PARTIAL miss (bencher answered, but some --names entries have no data — + e.g. constants the PR adds to Vectors.csv) still exits 0: main.json holds + what bencher had, and --missing-out lists the uncovered names so the + caller can measure just those against the base checkout and merge. + """ + entry = BACKEND_TABLE.get(a.backend) + testbed = entry["testbed"] if entry and a.mode in entry["metrics"] else None + if not testbed: + print(f"fetch-main: no main testbed for {a.backend}/{a.mode}") + raise SystemExit(2) + wanted = set(open(a.names).read().split()) if a.names else None + # ooc's headline row is keyed by the CamelCase env slug (not a Vectors.csv + # constant), so names.txt alone would filter it out — admit it explicitly. + if wanted is not None and a.env: + wanted.add(ENV_CC.get(a.env, a.env)) + # TODO: support any base/PR branch, not just `main`. Today bench-main.yml + # only runs on push to main and this query hardcodes `branch=main`, so a PR + # against a non-main base branch (e.g. a long-running feature branch) always + # falls through to the local base-run path. To generalise: (1) let + # bench-main.yml (or a sibling) upload reports for other tracked branches, + # (2) plumb `--branch` here from `github.base_ref` in bench-pr.yml, (3) fall + # back to `main` when the base branch has no bencher data. + # Bencher stores the git hash at `branch.head.version.hash`. + def _report_hash(r): + return (((r.get("branch") or {}).get("head") or {}).get("version") or {}).get("hash") + + def _get_json(url, attempts=3): + for i in range(attempts): + try: + with urllib.request.urlopen(url, timeout=15) as f: + return json.load(f) + except Exception as e: + if i == attempts - 1: + raise + print(f"fetch-main: attempt {i + 1} failed ({e}); retrying") + time.sleep(2 ** i) + + # Page newest-first until the base SHA's reports are found (a matrix env + # uploads one report each, all within one push's CI window, so once we've + # matched and a later page yields nothing new we're past it). A transient + # API error is retried before the expensive local-base fallback fires. + per_page = 255 + at_sha, page = [], 1 + while page <= 8: # 2040 newest reports — far beyond a realistic backlog + params = {"branch": "main", "testbed": testbed, + "per_page": per_page, "page": page} + url = ("https://api.bencher.dev/v0/projects/ix/reports?" + + urllib.parse.urlencode(params)) + try: + reports = _get_json(url) + except Exception as e: + print(f"fetch-main: bencher API error: {e}") + raise SystemExit(3) + matches = [r for r in reports if _report_hash(r) == a.sha] + if at_sha and not matches: + break # past the SHA's window + at_sha += matches + if len(reports) < per_page: + break # end of data + page += 1 + if not at_sha: + print(f"fetch-main: no reports for {a.backend}/{a.mode} @ {a.sha[:8]}") + raise SystemExit(3) + # Matrix envs upload separately to the same testbed at the same commit, + # each contributing its own benchmark subset — aggregate across reports. + # Filter/emit by `name` (Bencher's `slug` is a lower-kebab-cased derivation + # that would mangle Lean names like `Nat.add_comm` → `nat-add-comm`). + out = {} + for r in at_sha: + for iteration in r.get("results", []): + for bench in iteration: + name = bench["benchmark"]["name"] + if wanted is not None and name not in wanted: + continue + metrics = { + m["measure"]["name"]: m["metric"]["value"] + for m in bench.get("measures", []) + } + if metrics: + out[name] = metrics + if not out: + print(f"fetch-main: reports found but no matching benchmarks in --names") + raise SystemExit(3) + # Names the PR side selected (its Vectors.csv) that bencher has no data + # for at this SHA — typically constants the PR itself adds to the CSV. + # The caller runs the base checkout on JUST these and merges, so a new + # constant still gets a real main-vs-PR delta on its first !benchmark. + # Computed against names.txt verbatim (not the ENV_CC-augmented `wanted`): + # the env-keyed row is an admit-filter, not a per-constant expectation. + if a.missing_out: + name_set = set(open(a.names).read().split()) if a.names else set() + missing = sorted(name_set - set(out)) + with open(a.missing_out, "w") as f: + f.write("\n".join(missing) + ("\n" if missing else "")) + if missing: + print(f"fetch-main: {len(missing)} name(s) not on bencher @ " + f"{a.sha[:8]} (base run will measure): " + ", ".join(missing)) + with open(a.out, "w") as f: + json.dump(out, f) + print(f"fetch-main: {len(out)} constant(s) from bencher for {a.backend}/{a.mode}") + + +# ────────────────────────── cli ────────────────────────── +def main(): + ap = argparse.ArgumentParser(description=__doc__) + sub = ap.add_subparsers(dest="cmd", required=True) + + sub.add_parser("parse").set_defaults(fn=cmd_parse) + + m = sub.add_parser("manifest") + m.add_argument("--csv", required=True); m.add_argument("--env", required=True) + m.add_argument("--mode", required=True); m.add_argument("--tier", default="") + m.add_argument("--shard", default="0"); m.add_argument("--out", required=True) + m.add_argument("--backend", default="", + help="Backend for this cell (used to special-case `compile`, " + "which doesn't consume Vectors.csv).") + m.add_argument("--primary", action="store_true", + help="Restrict to the primary subset (the primary=1 column).") + m.add_argument("--heavy-out", dest="heavy_out", + help="Also write the selected heavy-tier names (one per " + "line) — the subset zisk runs closure-sharded.") + m.set_defaults(fn=cmd_manifest) + + b = sub.add_parser("bmf") + b.add_argument("--in", dest="infile", required=True, + help="Neutral results JSON (run.sh output).") + b.add_argument("--out", required=True, + help="Bencher Metric Format JSON for `bencher run`.") + b.set_defaults(fn=cmd_bmf) + + fm = sub.add_parser("fetch-main") + fm.add_argument("--sha", required=True) + fm.add_argument("--backend", required=True) + fm.add_argument("--mode", required=True) + fm.add_argument("--env", default="", + help="Cell env; admits the env-keyed row (ooc whole-env) " + "past the --names filter.") + fm.add_argument("--names", help="Only fetch benchmarks whose names appear in this file.") + fm.add_argument("--missing-out", dest="missing_out", + help="Write the --names entries bencher had no data for " + "(one per line; empty file when none) — the subset " + "the caller should measure against the base checkout.") + fm.add_argument("--out", required=True) + fm.set_defaults(fn=cmd_fetch_main) + + c = sub.add_parser("compare") + c.add_argument("--main", required=True); c.add_argument("--pr", required=True) + c.add_argument("--metric", action="append", default=[]) + c.add_argument("--threshold", type=float, default=3.0) + c.add_argument("--title"); c.add_argument("--backend"); c.add_argument("--env") + c.add_argument("--mode"); c.add_argument("--count"); c.add_argument("--main-source", default="") + c.add_argument("--out") + c.set_defaults(fn=cmd_compare) + + cm = sub.add_parser("comment") + cm.add_argument("--tables", required=True); cm.add_argument("--summary", default="") + cm.add_argument("--head", required=True); cm.add_argument("--repo-url", required=True) + cm.add_argument("--run-id", required=True); cm.add_argument("--out", required=True) + cm.set_defaults(fn=cmd_comment) + + a = ap.parse_args() + a.fn(a) + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh new file mode 100644 index 00000000..9acba71f --- /dev/null +++ b/.github/scripts/run.sh @@ -0,0 +1,439 @@ +#!/usr/bin/env bash +# Compile one library env to a `.ixe` from a checked-out repo (unless REUSE_IXE), +# then benchmark the given backend, emitting the neutral results JSON +# { "": { "": , ... }, ... } +# that bench.py compare / the bencher jobs consume. +# +# run.sh +# repo_dir : checked-out worktree (has .lake/build/bin/{ix,bench-typecheck}) +# env : initStd | lean | mathlib (any case; used verbatim for .ixe) +# backend : aiur | zisk | sp1 | ooc | compile | cutshards +# mode : execute | prove | compile (ignored by cutshards) +# +# `ix` / `bench-typecheck` come from (so base measures base's code, PR +# the PR's — the caller puts /.lake/build/bin on PATH). For the +# per-constant backends (aiur, zisk, sp1, ooc), each name is its own subprocess +# so a failure/timeout drops only that row. The `compile` backend is per-env +# (the env slug is the benchmark name) and measures the compile step directly. +# Only JSON is written to stdout — tool output and `::warning::`/`::notice::` +# go to logs / stderr so they never corrupt the merged JSON stream. +set -uo pipefail + +repo=${1:?repo_dir}; benv=${2:?env}; backend=${3:?backend}; mode=${4:?mode} +names=${5:?names}; out=${6:?out} +# Absolute repo path: the zkVM branch cd's into the host workspace, so the .ixe +# path passed to the host must not be relative to the original cwd. +repo=$(cd "$repo" && pwd) +: > "$out" +emit_empty() { [ -s "$out" ] || echo '{}' > "$out"; } + +# Fold a tool's per-phase span timings (tracing-texray JSONL, one +# `{"span":"…","seconds":N}` per closed span, possibly repeated per shard) into +# its per-constant results file under a `phases` object, summed by span name — +# the source bench.py renders as the comparative drill-down. No-op if the tool +# emitted no spans. +merge_phases() { # + local res="$1" spans="$2" ph + [ -s "$spans" ] || return 0 + ph=$(jq -s 'reduce .[] as $o ({}; .[$o.span] += $o.seconds)' "$spans" 2>/dev/null) + [ -n "$ph" ] && [ "$ph" != "{}" ] || return 0 + jq --argjson ph "$ph" 'map_values(. + {phases: $ph})' "$res" > "$res.p" \ + && mv "$res.p" "$res" || true +} + +# Background RAM watchdog. Every ~3 s, sum RSS across `root_pid` and every +# descendant (via `ps -eo pid,ppid,rss` + a small BFS); when the total exceeds +# `max_gb`, touch `marker` and SIGKILL the whole process GROUP (the root is +# started with `setsid`, so `kill -- -pid` reaches every descendant, not just +# depth-1 children). Callers detect the kill by testing `-f "$marker"` after +# wait. The 3 s cadence can lose a fast spike to the kernel OOM killer first — +# callers treat exit 137 without a marker as OOM too. +watch_ram_kill() { # + local root_pid=$1 max_gb=$2 marker=$3 + local max_kb=$((max_gb * 1024 * 1024)) total_kb + while kill -0 "$root_pid" 2>/dev/null; do + total_kb=$(ps -eo pid,ppid,rss --no-headers 2>/dev/null | awk -v root="$root_pid" ' + { rss[$1]=$3; parent[$1]=$2 } + END { + alive[root]=1; changed=1 + while (changed) { + changed=0 + for (p in parent) if (alive[parent[p]] && !alive[p]) { alive[p]=1; changed=1 } + } + s=0; for (p in alive) s += rss[p]+0 + print s + }') + if [ -n "$total_kb" ] && [ "$total_kb" -gt "$max_kb" ]; then + echo "::warning::RAM watchdog: killing pid=$root_pid tree-RSS=${total_kb}kB > ${max_kb}kB (~${max_gb} GB)" >&2 + : > "$marker" + kill -KILL -- "-$root_pid" 2>/dev/null || kill -KILL "$root_pid" 2>/dev/null || true + return + fi + sleep 3 + done +} + +# Merge the OOM sentinel into a constant's results file, PRESERVING any +# metrics measured before the kill (bench-typecheck persists Phase-1 +# fft-cost/execute-time before the prove starts). bench.py compare renders +# `OOM` only for the metrics that are absent. +mark_oom() { # + local res="$1" c="$2" + if [ -s "$res" ]; then + jq --arg n "$c" '.[$n] = ((.[$n] // {}) + {oom: true})' "$res" > "$res.o" \ + && mv "$res.o" "$res" \ + || jq -n --arg n "$c" '{($n): {oom: true}}' > "$res" + else + jq -n --arg n "$c" '{($n): {oom: true}}' > "$res" + fi +} + +# A prove can die of memory three ways: the watchdog's group-kill (marker +# file), the kernel OOM killer (SIGKILL → exit 137, no marker), or an +# ALLOCATION-FAILURE ABORT — one huge trace allocation fails while total RSS +# is still under the watchdog ceiling, and the Rust/Lean runtime aborts +# (SIGABRT → exit 134) with an allocator message in the log. All three are +# OOM for the benchmark table. +looks_like_oom() { # + local code="$1" marker="$2" log="$3" + [ -f "$marker" ] && return 0 + [ "$code" -eq 137 ] && return 0 + [ "$code" -eq 134 ] && grep -qiE \ + 'memory allocation of .* failed|std::bad_alloc|out of memory|(unable|failed) to allocate' \ + "$log" 2>/dev/null && return 0 + return 1 +} + +# `$benv` is used verbatim for the `.ixe` filename (bench-pr compiles `initStd.ixe`; +# the bencher jobs reuse the compile job's cached `InitStd.ixe`), and lowercased +# only to pick the Compile module. `$benv_cc` is the CamelCase form — the +# canonical BENCHMARK KEY for env-keyed rows (ooc whole-env, compile), so the +# PR side (`initStd`) and the bencher side (`InitStd`, from bench-main's +# matrix.bench) agree on one name. +case "$(printf '%s' "$benv" | tr '[:upper:]' '[:lower:]')" in + initstd) module=CompileInitStd; benv_cc=InitStd ;; + lean) module=CompileLean; benv_cc=Lean ;; + mathlib) module=CompileMathlib; benv_cc=Mathlib ;; + flt) module=CompileFLT; benv_cc=FLT ;; + *) echo "unknown env: $benv" >&2; exit 2 ;; +esac + +# Tool resolution: prefer the in-tree build (so base measures base's code, PR +# the PR's), fall back to PATH — CI restores cached binaries onto PATH instead +# of building in-tree. Resolved LAZILY at each use site: the zkVM branch needs +# neither `ix` nor `bench-typecheck` when REUSE_IXE short-circuits the compile +# (bench-main's zkvm-execute job restores only the `.ixe` cache, no binaries). +resolve_bin() { # → prints the path, or fails + local name="$1" in_tree="$repo/.lake/build/bin/$1" + if [ -x "$in_tree" ]; then printf '%s' "$in_tree" + else command -v "$name" || { echo "::error::$name not found (in-tree or PATH)" >&2; return 2; } + fi +} + +tmp=$(mktemp -d) +compile_log="$tmp/compile.log" + +# Closure-shard artifacts for the zisk heavy tier: `ix shard extract` cuts a +# standalone closure-only env (no recompile), `ix profile` → `ix shard` cut +# its manifest (the canonical partitioner: profiled heartbeats + min-cut, +# capped by predicted RAM). One dir per env; slugs must match the zkvm loop's +# result keys (same `tr` set). +shards_dir_for_env() { printf '%s' "$repo/zkshards-$benv"; } +cut_closure_shards() { # → 0 when /$slug.{ixe,ixes} are ready + local c="$1" slug="$2" dir ix_bin rc + dir=$(shards_dir_for_env) + [ -f "$dir/$slug.ixes" ] && [ -f "$dir/$slug.ixe" ] && return 0 + ix_bin=$(resolve_bin ix 2>/dev/null) || { + echo "::warning::no ix binary to cut closure shards for '$c'" >&2 + return 1 + } + mkdir -p "$dir" + echo "::group::ix shard extract + profile + shard: $c" + "$ix_bin" shard extract "$ixe" --consts "$c" --out "$dir/$slug.ixe" \ + && "$ix_bin" profile "$dir/$slug.ixe" --out "$dir/$slug.ixprof" \ + && "$ix_bin" shard "$dir/$slug.ixprof" \ + --max-ram "${SHARD_MAX_RAM_GB:-120}" --out "$dir/$slug.ixes" + rc=$? + echo "::endgroup::" + [ "$rc" -eq 0 ] || { + echo "::warning::extract/profile/shard failed for '$c'" >&2 + rm -f "$dir/$slug.ixes" + return 1 + } +} + +# `compile` backend needs a fresh compile to measure — never honor REUSE_IXE. +ixe="$repo/$benv.ixe" +if [ "${REUSE_IXE:-0}" = 1 ] && [ "$backend" != compile ] && [ -f "$ixe" ]; then + echo "reusing existing $ixe (REUSE_IXE)" >&2 +else + ix_bin=$(resolve_bin ix) || exit 2 + echo "::group::ix compile $module → $benv.ixe ($backend/$mode)" + "$ix_bin" compile "$repo/Benchmarks/Compile/$module.lean" \ + --out "$ixe" 2>&1 | tee "$compile_log" + echo "::endgroup::" +fi + +case "$backend" in + aiur) + # One bench-typecheck per constant (isolation + per-constant peak-rss). + # Execute mode → Phase 1 only (--execute-only). Prove mode → always attempt + # a full prove (no tier gate), bounded two ways: a RAM watchdog SIGKILLs + # the process group when tree-RSS nears the runner's ceiling (the constant + # then records the `oom: true` sentinel — merged into any Phase-1 metrics + # already measured — so bench.py compare renders `OOM` instead of dropping + # the row), and a wall-clock `timeout` bounds a runaway prove. `$out` is + # re-merged after every constant so a job-level kill mid-loop still leaves + # the completed rows on disk. + ceiling_gb=${AIUR_PROVE_MAX_RSS_GB:-120} + bt_bin=$(resolve_bin bench-typecheck) || exit 2 + rows="$tmp/rows"; mkdir -p "$rows" + while IFS= read -r c; do + [ -z "$c" ] && continue + slug=$(printf '%s' "$c" | tr '/ .:' '____') + res="$tmp/$slug.json"; spans="$res.spans"; oom="$tmp/$slug.oom" + rm -f "$oom" + # bench-typecheck self-reports peak-rss (texray tree sampler) in its --json; + # with --texray + --json it also writes per-phase aiur/*, stark/* timings + # to `.spans` for the drill-down. + if [ "$mode" = execute ]; then + timeout "${AIUR_EXECUTE_TIMEOUT:-25m}" \ + "$bt_bin" --ixe "$ixe" --consts "$c" --json "$res" --execute-only --texray \ + > "$tmp/$slug.log" 2>&1 \ + || { echo "::warning::aiur execute '$c' failed/timed out; dropping" >&2; continue; } + else + # setsid: bench-typecheck leads its own process group so the watchdog's + # group-kill reaches every descendant. + setsid timeout "${AIUR_PROVE_TIMEOUT:-50m}" \ + "$bt_bin" --ixe "$ixe" --consts "$c" --json "$res" --texray \ + > "$tmp/$slug.log" 2>&1 & + bt_pid=$! + watch_ram_kill "$bt_pid" "$ceiling_gb" "$oom" & + w_pid=$! + wait "$bt_pid" 2>/dev/null; bt_exit=$? + kill "$w_pid" 2>/dev/null || true + wait "$w_pid" 2>/dev/null || true + if looks_like_oom "$bt_exit" "$oom" "$tmp/$slug.log"; then + echo "::warning::aiur prove '$c' OOM (exit $bt_exit, marker=$([ -f "$oom" ] && echo watchdog || echo runtime), ceiling ${ceiling_gb} GB)" >&2 + mark_oom "$res" "$c" + elif [ "$bt_exit" -ne 0 ]; then + echo "::warning::aiur prove '$c' failed (exit $bt_exit); dropping" >&2 + { sed -n '1,5p' "$tmp/$slug.log"; echo " …"; tail -n 3 "$tmp/$slug.log"; } >&2 || true + continue + fi + fi + merge_phases "$res" "$spans" + if [ -s "$res" ]; then + cp "$res" "$rows/$slug.json" + jq -s 'reduce .[] as $o ({}; . + $o)' "$rows"/*.json > "$out" 2>/dev/null || true + fi + done < "$names" + emit_empty + ;; + + zisk|sp1) + # zkVM prove is not currently wired up (no GPU runner), so this branch runs + # execute only. The workflow filters `zisk|sp1 prove` at parse time. + if [ "$mode" != execute ]; then + echo "::error::$backend $mode: only execute mode is supported" >&2 + emit_empty; exit 2 + fi + host="${backend}-host"; work="$repo/$backend" + # Build the host once so per-constant timing excludes compilation. The host + # self-measures and writes its own neutral results JSON via `--json` + # (cycles/execute-time/throughput/peak-rss), so there is nothing to grep — + # `timeout` only bounds a runaway constant. + echo "::group::cargo build $host" + ( cd "$work" && cargo build --quiet --release --bin "$host" ) + echo "::endgroup::" + bin="$work/target/release/$host" + # ZisK's ASM microservices mmap the ROM with MAP_LOCKED, needing unlimited + # locked memory (the Zisk book's DefaultLimitMEMLOCK=infinity). The warp + # runner caps the memlock hard limit and can't be rebooted, so raise it + # in-session as root; the host children inherit it. Without this the ASM + # services die with `mmap(rom) errno=11`. SP1 needs no such raise. + [ "$backend" = zisk ] && sudo prlimit --pid $$ --memlock=unlimited:unlimited + # A group-killed Zisk run skips the host's Drop-time cleanup of its + # /dev/shm/ZISK_* segments and semaphores (multi-GB — the MT output segment + # alone starts at 6 GB), so the NEXT host launch fails creating its own + # segments (tmpfs / MAP_LOCKED exhaustion) before Zisk's startup stale-pid + # sweep can save it — one dropped constant per watchdog kill. Sweep the + # dead run's debris ourselves; nothing zisk-related is alive at call time. + zisk_shm_sweep() { + [ "$backend" = zisk ] || return 0 + pkill -KILL -f -- '--shm_prefix ZISK_' 2>/dev/null + rm -f /dev/shm/ZISK_* /dev/shm/sem.ZISK_* 2>/dev/null + return 0 + } + ceiling_gb=${ZKVM_EXECUTE_MAX_RSS_GB:-120} + rows="$tmp/rows"; mkdir -p "$rows" + # One watchdog-guarded guest run, keyed `$key` in the results. Full + # closures are RAM-unbounded (the ASM microservices mmap multi-GB ROMs on + # top of the guest trace), so the same watchdog as the aiur prove path + # guards the runner. `exec setsid`: the subshell (whose pid is $!) + # replaces itself with the session leader, so the watchdog's group-kill + # (`kill -- -$!`) reaches the host and every descendant — without it a + # plain subshell wrapper's pgid would be run.sh's own. The host writes + # $res only on a clean (zero-failure) run; `$out` is re-merged per run so + # a job-level kill keeps completed rows. + zkvm_run() { # + local run_timeout="$1" key="$2" run_ixe="$3"; shift 3 + local slug; slug=$(printf '%s' "$key" | tr '/ .:' '____') + local res="$tmp/$slug.json" log="$tmp/$slug.log" oom="$tmp/$slug.oom" + local spans="$res.spans" zk_pid w_pid zk_exit + rm -f "$oom" + ( cd "$work" && exec setsid timeout "$run_timeout" "$bin" --execute \ + --ixe "$run_ixe" --json "$res" --texray "$@" ) \ + > "$log" 2>&1 & + zk_pid=$! + watch_ram_kill "$zk_pid" "$ceiling_gb" "$oom" & + w_pid=$! + wait "$zk_pid" 2>/dev/null; zk_exit=$? + kill "$w_pid" 2>/dev/null || true + wait "$w_pid" 2>/dev/null || true + [ "$zk_exit" -ne 0 ] && zisk_shm_sweep + if looks_like_oom "$zk_exit" "$oom" "$log"; then + echo "::warning::$backend execute '$key' OOM (exit $zk_exit, marker=$([ -f "$oom" ] && echo watchdog || echo runtime), ceiling ${ceiling_gb} GB)" >&2 + mark_oom "$res" "$key" + elif [ "$zk_exit" -ne 0 ] && grep -q 'kernel typecheck produced' "$log" 2>/dev/null; then + # The kernel REJECTED the constant (the host fails fast and aborts + # any remaining shards). Record the `failed` sentinel — compare + # renders a ❌ row + loud note, and the workflow fails at the end. + echo "::error::$backend: '$key' FAILED TO TYPECHECK — kernel rejected it" >&2 + tail -n 3 "$log" >&2 || true + jq -n --arg n "$key" '{($n): {failed: true}}' > "$res" + elif [ "$zk_exit" -ne 0 ]; then + echo "::warning::$backend execute '$key' failed/timed out (exit $zk_exit); dropping" >&2 + # Head for early failures (name resolution), tail for late ones. + { sed -n '1,5p' "$log"; echo " …"; tail -n 3 "$log"; } >&2 || true + return 0 + fi + merge_phases "$res" "$spans" + if [ -s "$res" ]; then + cp "$res" "$rows/$slug.json" + jq -s 'reduce .[] as $o ({}; . + $o)' "$rows"/*.json > "$out" 2>/dev/null || true + fi + } + # Closure-sharded pipeline for the heavy tier (zisk only). A heavy + # constant's full closure blows the runner's RAM as a single leaf, so it + # runs as its shard-manifest partition instead: `ix shard extract` cuts a + # standalone closure-only env, `ix profile` → `ix shard` cut the manifest + # (the canonical partitioner: profiled heartbeats + min-cut, capped by + # predicted RAM), and one `--shard-plan` host run executes the shards + # sequentially, emitting the constant's row (totals + per-shard + # breakdown). bench-main pre-cuts the artifacts in the compile job and + # ships them via cache; the PR side cuts its own — a PR can change the + # cost profile, and profiling counts heartbeats (not wall time) so an + # unchanged tree re-partitions deterministically. If cutting isn't + # possible (no ix binary, or a failure), fall back to the single-leaf + # run — the watchdog then records the honest OOM row. + heavy_file="${ZISK_HEAVY_NAMES:-}" + is_heavy() { + [ -n "$heavy_file" ] && [ -f "$heavy_file" ] && grep -qxF "$1" "$heavy_file" + } + shards_dir=$(shards_dir_for_env) + # Full-closure check (no --skip-deps) so this is directly comparable to + # the ooc `ix check-rs --anon --consts` run — the delta then isolates the + # in-circuit-vs-out-of-circuit overhead rather than mixing in subject- + # only vs full-closure scope. + while IFS= read -r c; do + [ -z "$c" ] && continue + slug=$(printf '%s' "$c" | tr '/ .:' '____') + if [ "$backend" = zisk ] && is_heavy "$c" && cut_closure_shards "$c" "$slug"; then + zkvm_run "${ZKVM_EXECUTE_TIMEOUT:-25m}" "$c" "$shards_dir/$slug.ixe" \ + --shard-plan "$shards_dir/$slug.ixes" --json-name "$c" + else + zkvm_run "${ZKVM_EXECUTE_TIMEOUT:-25m}" "$c" "$ixe" --consts "$c" + fi + done < "$names" + emit_empty + ;; + + ooc) + # Out-of-circuit Rust kernel (far faster than proving). Two views, both keyed + # off the structured line + # `##check## ` + # (peak-rss from ix check's tracing-texray tree sampler): the whole env in + # parallel (`--anon`, keyed by env), and a per-primary check + # (`--anon --consts`, keyed by constant) that runs the constant's FULL + # dependency closure in anon mode — the same mode and scope as the zkVM + # execute above, so the delta isolates in-circuit vs out-of-circuit + # overhead rather than mixing in closure-size or metadata effects. + ix_bin=$(resolve_bin ix) || exit 2 + ooc_one() { #