From 2e6528beb77f2d64e85fdf9248ce5c640ee7d1ae Mon Sep 17 00:00:00 2001 From: samuelburnham <45365069+samuelburnham@users.noreply.github.com> Date: Tue, 30 Jun 2026 02:17:06 +0000 Subject: [PATCH 01/27] fix(zisk): drop --max-witness-stored flag, use Zisk's default cap Lowering the witness cap below Zisk's built-in default (10) was measured to have a negligible effect on host RAM and prove time for the kernel typecheck workload, so the CLI override (which defaulted to 5) is removed and the prover uses EmbeddedOpts::default(). Drop the now-stale flag mentions and per-RAM tuning guidance from the README, the cost-model doc, and the shard.rs model comments. --- README.md | 33 ++++++++------------------------- crates/kernel/src/shard.rs | 11 +++++------ docs/zisk-cycle-cost-model.md | 12 +++++------- zisk/host/src/main.rs | 21 +++------------------ 4 files changed, 21 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index d17190b0..892b7162 100644 --- a/README.md +++ b/README.md @@ -487,31 +487,14 @@ Non-Nix users: install Zisk manually per the *Sharding large environments* below): each shard ships only its own closure sub-env, so the pieces fit the cap even when the whole env does not. - **Host RAM cap (`--max-witness-stored`).** Distinct from the in-guest - heap cap above, the prover side (Zisk's `proofman`) holds in-flight - witness traces in host RAM during `CALCULATING_CONTRIBUTIONS`. Peak - host RAM per shard ≈ `fixed-overhead + N × avg-witness-size`, where - `N` is the `max_witness_stored` setting. With the Blake3f precompile the - Ix kernel typecheck workload measures roughly `40 GB + N × 16 GB` on - typical 200–300 kB anon-byte shards — e.g. `N = 10` peaks near 200 GB - (a `--shard-bytes 250000 --max-witness-stored 10` mergesort run completes - under a 200 GiB guard without tripping it). An earlier pre-Blake3f figure - of ~25 GB per witness is stale; the precompile shrank the witness. - - The `zisk-host` CLI defaults to `--max-witness-stored 5` (Zisk's - built-in default is 10, tuned for larger-memory boxes). Override per - machine: - - | Host RAM | `--max-witness-stored` | Notes | - | -------- | ---------------------- | ------------------------------------------------------ | - | ≤ 128 GB | `3` | Override down; consider smaller shards too | - | 256 GB | `5` (project default) | Comfortable margin on the typical setup | - | 512 GB | `10` (Zisk default) | Override up for maximum prover parallelism | - | ≥ 1 TB | `10` (Zisk default) | Override up; default is conservative for this workload | - - Lowering the cap roughly linearly bounds peak RAM but throttles - prover parallelism (~10–30 % slower in practice). Raise it if your - machine has more RAM headroom; lower it if you OOM during + **Host RAM during proving.** Distinct from the in-guest heap cap above, + the prover side (Zisk's `proofman`) holds in-flight witness traces in host + RAM during `CALCULATING_CONTRIBUTIONS`. The number of resident witnesses + (Zisk's `max_witness_stored`) is left at Zisk's built-in default of 10: + we measured that lowering it does not materially reduce peak host RAM or + prove time for the Ix kernel typecheck workload, so it is not exposed as a + knob. Peak host RAM per shard is instead governed by shard size — prove + smaller shards (`--shard-bytes`) if you OOM during `CALCULATING_CONTRIBUTIONS`. Not relevant for `--execute` or `--verify-constraints` modes. diff --git a/crates/kernel/src/shard.rs b/crates/kernel/src/shard.rs index 8e312f6c..e594ff09 100644 --- a/crates/kernel/src/shard.rs +++ b/crates/kernel/src/shard.rs @@ -1653,8 +1653,8 @@ pub fn block_step_cost(b: &BlockEntry) -> u64 { /// size shards straight from `MemTotal` without ever picking a budget. Inverts /// the measured single-leaf prover model on this setup /// (`peak_RAM_GiB ≈ 50 + 33 × steps_billions`, measured by a guarded 7-shard GPU -/// prove sweep over 0.27–3.79e9-step Init shards, R²=0.99, `--max-witness-stored -/// 5`) at [`RAM_USABLE_FRAC`] of RAM (reserving the rest for the OS, cross-shard +/// prove sweep over 0.27–3.79e9-step Init shards, R²=0.99) at +/// [`RAM_USABLE_FRAC`] of RAM (reserving the rest for the OS, cross-shard /// re-ingress, and run-to-run variance). Returns 0 when the box can't even hold the ~50 GiB /// prover base (nothing will prove). Approximate by design — pair with /// [`partition_for_cycle_cap`] to get N. The earlier `45 + 32` model was @@ -1662,8 +1662,7 @@ pub fn block_step_cost(b: &BlockEntry) -> u64 { /// target actually used ~225 GB). /// Measured prover-RAM model (the single source of truth, used by both /// [`cycle_cap_for_ram`] and [`ram_gib_for_steps`]): peak host RAM ≈ -/// `RAM_BASE_GIB + RAM_GIB_PER_BCYCLE × steps_billions` (at -/// `--max-witness-stored 5`). +/// `RAM_BASE_GIB + RAM_GIB_PER_BCYCLE × steps_billions`. pub const RAM_BASE_GIB: f64 = 50.0; pub const RAM_GIB_PER_BCYCLE: f64 = 33.0; /// Usable fraction of a host-RAM budget (headroom for OS + variance) — applied @@ -1685,8 +1684,8 @@ pub fn cycle_cap_for_ram(ram_gb: f64) -> u64 { } /// Measured single-GPU **leaf prove time**: `≈ PROVE_SETUP_SECS + -/// PROVE_SECS_PER_BCYCLE × steps_billions` per shard (RTX PRO 6000, -/// `--max-witness-stored 5`). Aggregation adds a smaller per-fold term this model +/// PROVE_SECS_PER_BCYCLE × steps_billions` per shard (RTX PRO 6000). +/// Aggregation adds a smaller per-fold term this model /// omits — minutes next to hours of leaf proving at large shard counts. pub const PROVE_SETUP_SECS: f64 = 54.0; pub const PROVE_SECS_PER_BCYCLE: f64 = 158.0; diff --git a/docs/zisk-cycle-cost-model.md b/docs/zisk-cycle-cost-model.md index 04e7a192..86e18822 100644 --- a/docs/zisk-cycle-cost-model.md +++ b/docs/zisk-cycle-cost-model.md @@ -109,13 +109,12 @@ flags as the Aiur `bench-typecheck`. ## Prover RAM and prove time -Measured proving 7 Init shards on GPU at `--max-witness-stored 5`, STEPS -0.27–3.79e9: +Measured proving 7 Init shards on GPU, STEPS 0.27–3.79e9: ``` peak host RAM (GiB) ≈ 50 + 33·STEPS_billions R² 0.99 GPU prove time (s) ≈ 54 + 158·STEPS_billions R² 0.98 (~6.3M steps/s) ``` -RAM scales with `--max-witness-stored N` (this is N=5). Inverting the RAM model +Inverting the RAM model at `RAM_USABLE_FRAC` = 0.85 gives a safe per-shard cap of **~3.6e9 steps** for a 200 GiB target (`shard.rs:cycle_cap_for_ram`). @@ -171,7 +170,7 @@ side. Applying the cost model to every Init constant's native profile (51,003 blocks / 51,678 constants) vs a single-leaf cap: **~99.98% of Init typechecks on Zisk.** -Exactly **12 constants** exceed a single 250 GiB leaf (`--max-witness-stored 5`) +Exactly **12 constants** exceed a single 250 GiB leaf — all single atomic constants (un-shardable), `hb`-dominated. Listed by name (all are `_private` proof terms; private prefix elided): @@ -190,7 +189,7 @@ Exactly **12 constants** exceed a single 250 GiB leaf (`--max-witness-stored 5`) | `Array.extract_append_extract._proof_1_1` | 13,226 | 4.7e9 | | `Char.ofOrdinal_ordinal` | 14,136 | 4.5e9 | -Each has a workaround — lower `--max-witness-stored`, a bigger box, `--skip-deps` +Each has a workaround — a bigger box, `--skip-deps` (subject-only), or upstream proof restructuring — so 12 is an upper bound on truly-stuck constants. At a 200 GiB cap the list grows to 16. Estimate uses the planner's `block_step_cost` model (`162,339·hb + 4,276·subst + 652·bytes`, the @@ -311,8 +310,7 @@ native profiling run and compile out on the zkvm target. 5–8%); the shard model is still n=12 and Init-derived, so the ~190M shard intercept is the term most likely to shift on Std/Mathlib — worth re-checking there. -- RAM/prove-time models are for one GPU at `--max-witness-stored 5`; both scale - with that setting. +- RAM/prove-time models are for one GPU. - Cycle counts are deterministic for a fixed guest ELF; the profiler counters compile out on the zkvm target, so the proven ELF is unaffected. - **Regimes have distinct coefficients and don't transfer.** diff --git a/zisk/host/src/main.rs b/zisk/host/src/main.rs index 1eb35094..6bc6b11f 100644 --- a/zisk/host/src/main.rs +++ b/zisk/host/src/main.rs @@ -208,18 +208,6 @@ struct Args { /// `bench-typecheck --skip-deps`. #[arg(long, requires = "constant")] skip_deps: bool, - - /// Cap on resident witness traces during the prove phase, bounding - /// peak host RAM per shard. Zisk's prover queues witnesses up to this - /// count before committing them; peak RAM ≈ N × avg-witness-size + - /// fixed overheads. Zisk's built-in default is 10 (tuned for - /// large-memory boxes); we default to 5 here as a safer fit for - /// ~256 GB machines. Override up to 10 on bigger boxes for maximum - /// parallelism, or down to 3 on smaller ones. See the Zisk section - /// of the top-level README for a per-RAM recommendation table. Has - /// no effect on `--execute` / `--verify-constraints` modes. - #[arg(long, default_value_t = 5)] - max_witness_stored: usize, } /// 112-byte public output of one shard-guest proof. @@ -764,7 +752,6 @@ fn check_input_coherence( fn build_client( gpu: bool, asm: bool, - max_witness_stored: Option, ) -> Result { // Executor choice. The default is the Assembly executor (`asm = true`, // i.e. no `--emulator`): it is markedly faster at trace generation and is @@ -785,10 +772,8 @@ fn build_client( // docs ("Reduce memory footprint during proving at the cost of // speed"). We have ~94 GB of free GPU memory, so the speed // trade-off is the wrong direction for this hardware. - let mut opts = EmbeddedOpts::default(); - if let Some(n) = max_witness_stored { - opts = opts.max_witness_stored(n); - } + // Zisk's default embedded opts (witness cap 10). + let opts = EmbeddedOpts::default(); let mut builder: EmbeddedClientBuilder = ProverClient::embedded().with_embedded_opts(opts); if asm { @@ -1700,7 +1685,7 @@ async fn main() -> Result<()> { let total_leaves: usize = plans.iter().map(|p| p.shards.len()).sum(); let client = - build_client(args.gpu, !args.emulator, Some(args.max_witness_stored))?; + build_client(args.gpu, !args.emulator)?; client.setup(&SHARD_PROGRAM).run()?.await?; // Skip agg-guest setup unless we'll produce more than one leaf proof. // The shard-plan path sets up the agg program itself, after its leaves. From 8c3909a830736fbb88a191c4dbfe18df3cacf42b Mon Sep 17 00:00:00 2001 From: samuelburnham <45365069+samuelburnham@users.noreply.github.com> Date: Tue, 30 Jun 2026 18:55:04 +0000 Subject: [PATCH 02/27] feat(ci): CSV-driven !benchmark + Zisk/SP1/native benchmarks Rebased onto main (post #411 + #459) and integrated with the renamed bench-main.yml and reworked bencher-track interface. - Benchmarks/Vectors.csv: single shared source of truth (71 library constants from Init/Std/Mathlib/Lean). Consumed by Aiur (bench-typecheck --manifest), the zkVM hosts, and shell. - bench-pr.yml: `!benchmark [aiur] [zisk] [sp1|all] [execute|prove]` over the curated set, posting a main-vs-PR table; main results cached by base SHA. Hardened: comment body from env (no injection), allowlisted env parsing. - .github/scripts/{bench.py,run.sh}: parse/manifest/compare/comment, and the compile-.ixe + backend driver (cycles/execute-time/throughput/peak-rss, plus shards/max-shard-cycles for sharded runs). - .github/actions/install-{sp1,zisk}: shared zkVM toolchain + deps install, used by bench-pr.yml, bench-main.yml, and riscv-bench.yml. - bench-main.yml: add zkvm-execute (Zisk/SP1 cycle counts + time/RAM) and native-check (native parallel `ix check --anon` throughput) jobs, using the new bencher-track workload/thresholds interface. - bench-typecheck: add --constant / --skip-deps (align with the zkVM hosts; --skip-deps replaces --subject-only) and --execute-only (fast execute path). --- .github/actions/install-sp1/action.yml | 31 +++ .github/actions/install-zisk/action.yml | 66 +++++ .github/scripts/bench.py | 270 +++++++++++++++++++++ .github/scripts/run.sh | 135 +++++++++++ .github/workflows/bench-main.yml | 183 +++++++++++++- .github/workflows/bench-pr.yml | 304 +++++++++++++++--------- .github/workflows/riscv-bench.yml | 65 +---- Benchmarks/Typecheck.lean | 57 +++-- Benchmarks/Vectors.csv | 93 ++++++++ 9 files changed, 1009 insertions(+), 195 deletions(-) create mode 100644 .github/actions/install-sp1/action.yml create mode 100644 .github/actions/install-zisk/action.yml create mode 100644 .github/scripts/bench.py create mode 100644 .github/scripts/run.sh create mode 100644 Benchmarks/Vectors.csv diff --git a/.github/actions/install-sp1/action.yml b/.github/actions/install-sp1/action.yml new file mode 100644 index 00000000..71aa2037 --- /dev/null +++ b/.github/actions/install-sp1/action.yml @@ -0,0 +1,31 @@ +name: Install SP1 +description: >- + Install the system build deps and the SP1 zkVM toolchain (sp1up) needed to + build and run the SP1 host. Assumes a Rust toolchain is already set up. + +runs: + using: composite + steps: + # The shared zkVM apt superset (the ZisK book's full Ubuntu list — the + # prebuilt cargo tooling and proofman's C++ link OpenMPI/OpenMP/GMP/ + # nlohmann-json/nasm/secp256k1/… — plus pkg-config + libssl-dev for SP1's + # host crates). The Nix shells provided this; a bare runner doesn't. + - name: Install system build deps + shell: bash + run: | + # Some warpbuild images ship an unreachable azure mirror that hangs + # `apt-get update`; drop it first (no-op elsewhere). + sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true + sudo apt-get update + sudo apt-get install -y \ + xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \ + nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \ + libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \ + openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \ + pkg-config libssl-dev time + - name: Install SP1 toolchain (sp1up, latest) + shell: bash + run: | + curl -L https://sp1up.succinct.xyz | bash + ~/.sp1/bin/sp1up + echo "$HOME/.sp1/bin" >> "$GITHUB_PATH" diff --git a/.github/actions/install-zisk/action.yml b/.github/actions/install-zisk/action.yml new file mode 100644 index 00000000..ff883957 --- /dev/null +++ b/.github/actions/install-zisk/action.yml @@ -0,0 +1,66 @@ +name: Install Zisk +description: >- + Install the system build deps and the ZisK zkVM toolchain (ziskup, CPU build, + no proving keys) needed to build and run the Zisk host. Assumes a Rust + toolchain is already set up. + +runs: + using: composite + steps: + # The shared zkVM apt superset — the ZisK book's full Ubuntu list (prebuilt + # cargo-zisk + proofman's C++ link OpenMPI/OpenMP/GMP/nlohmann-json/nasm/ + # secp256k1/…), kept identical to install-sp1 so a host that links both is + # covered. The Nix shells provided this; a bare runner doesn't. + - name: Install system build deps + shell: bash + run: | + # Some warpbuild images ship an unreachable azure mirror that hangs + # `apt-get update`; drop it first (no-op elsewhere). + sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true + sudo apt-get update + sudo apt-get install -y \ + xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \ + nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \ + libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \ + openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \ + pkg-config libssl-dev time + # `--version 0.18.0` pins the toolchain to match our deps. Our host links the + # argumentcomputer/zisk `blake3-precompile` fork, which is based on v0.18.0 + # (its cargo-zisk has `check-setup`, used below to regenerate the key's + # const-trees). Without the pin, ziskup installs `releases/latest`, which + # resolves to upstream `v1.0.0-alpha` — a different circuit whose cargo-zisk + # dropped the `check-setup` subcommand, breaking the key step. `--cpu` picks + # the CPU build (no GPU on the runner) and `--nokey` skips ziskup's key + # install — both avoid its interactive /dev/tty prompts. We keep `--nokey` + # because the upstream `zisk-setup` bucket only carries the upstream circuit's + # key; our fork has a different circuit (extra Blake3f AIR), so we restore the + # fork-matching key from our own S3 in the next step. `--prefix $HOME/.zisk` + # pins the install where cargo-zisk's ZiskPaths fallback looks (the runner + # sets XDG_CONFIG_HOME, which would otherwise relocate it). + - name: Install Zisk toolchain (ziskup, pinned v0.18.0) + shell: bash + run: | + curl -L https://raw.githubusercontent.com/0xPolygonHermez/zisk/main/ziskup/install.sh \ + | bash -s -- --cpu --nokey -y --version 0.18.0 --prefix "$HOME/.zisk" + echo "$HOME/.zisk/bin" >> "$GITHUB_PATH" + # Execute still needs a proving key present: zisk-host calls `client.setup()` + # (which the SDK runs before the execute branch), and that loads the circuit's + # const-tree files. We host the fork-matching key in a public S3 bucket + # WITHOUT the const-trees — exactly like Zisk's released + # `zisk-provingkey-*.tar.gz` on `storage.googleapis.com/zisk-setup` — and + # regenerate them here with `cargo-zisk check-setup -a`, which is how `ziskup` + # itself populates them. That keeps the artifact ~3 GB (gzip) instead of + # ~48 GB. The object name carries the fork rev so a circuit change can't + # silently reuse a stale key. Public bucket → plain curl, no AWS creds. + - name: Restore Zisk proving key (fork circuit) from S3 + shell: bash + run: | + mkdir -p "$HOME/.zisk" + curl -fSL --retry 3 \ + https://argument-zisk-setup.s3.amazonaws.com/zisk-provingkey-blake3-8f9e24d5-cpu.tar.gz \ + -o /tmp/zisk-provingkey.tar.gz + tar -C "$HOME/.zisk" -xzf /tmp/zisk-provingkey.tar.gz + rm -f /tmp/zisk-provingkey.tar.gz + # Regenerate the const-tree files omitted from the artifact (CPU build, so + # no --gpu). This is the "may take a while" step ziskup prints. + cargo-zisk check-setup --proving-key "$HOME/.zisk/provingKey" -a diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py new file mode 100644 index 00000000..c27582c9 --- /dev/null +++ b/.github/scripts/bench.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python3 +"""All data-wrangling for the `!benchmark` PR workflow, as subcommands: + + parse COMMENT_BODY env → matrix + config (writes $GITHUB_OUTPUT) + manifest Benchmarks/Vectors.csv → the constant names for one cell + compare main.json + pr.json → a Markdown main-vs-PR table + comment per-cell table files → the final PR comment body + +The neutral results JSON every backend normalises to (see run.sh) is +`{ "": { "": , ... }, ... }`. All metrics are +lower-is-better, so a positive Δ% is a regression. +""" +import argparse +import glob +import hashlib +import json +import os + + +# ───────────────────────── parse ───────────────────────── +BACKENDS = ("aiur", "zisk", "sp1") +MODES = ("execute", "prove") +ENVS = ("initStd", "lean", "mathlib") +CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_GPU"} +PASSTHROUGH_KEYS = {"RUST_LOG", "WITHOUT_VK_VERIFICATION", "RUSTFLAGS"} + + +def runner_for(backend, mode, gpu): + """(runs-on label, skip?) for a cell.""" + if backend == "aiur": + return "warp-ubuntu-latest-x64-32x", False + if mode == "execute": + return "warp-ubuntu-latest-x64-16x", False + if gpu: # zkVM prove needs a GPU + return "self-hosted-gpu", False + return "ubuntu-latest", True + + +def cmd_parse(_a): + body = os.environ.get("COMMENT_BODY", "") + lines = [ln.replace("\r", "") for ln in body.splitlines()] + cmd = next((ln for ln in lines if "!benchmark" in ln), "") + toks = cmd.split("!benchmark", 1)[1].split() if "!benchmark" in cmd else [] + + backends, mode = [], "execute" + for t in (t.lower() for t in toks): + if t == "all": + backends = list(BACKENDS) + elif t in BACKENDS and t not in backends: + backends.append(t) + elif t in MODES: + mode = t + if not backends: + backends = ["aiur"] + + cfg, passthrough = {}, [] + for ln in lines[(lines.index(cmd) + 1) if cmd in lines else 0:]: + s = ln.strip() + if not s or "=" not in s: + continue + key, val = (x.strip() for x in s.split("=", 1)) + if key in CONFIG_KEYS: + cfg[key] = val + elif key in PASSTHROUGH_KEYS: + passthrough.append(f"{key}={val}") + + envs = [e.strip() for e in cfg.get("BENCH_ENVS", "initStd").split(",") if e.strip()] + envs = [e for e in envs if e in ENVS] or ["initStd"] + tier = cfg.get("BENCH_TIER", "") + if tier not in ("cheap", "heavy", "all"): + tier = "" # empty ⇒ derived from mode at manifest time + shard = "1" if cfg.get("BENCH_SHARD") == "1" else "0" + gpu = cfg.get("BENCH_GPU") == "1" + + cells = [] + for b in backends: + for e in envs: + runner, skip = runner_for(b, mode, gpu) + cells.append({"backend": b, "env": e, "mode": mode, "runner": runner, + "skip": "true" if skip else "false", "label": f"{b}-{e}-{mode}"}) + + summary = (f"backends: `{' '.join(backends)}` · mode: `{mode}` · " + f"envs: `{','.join(envs)}` · tier: `{tier or 'auto'}` · " + f"shard: `{shard}` · gpu: `{int(gpu)}`") + if passthrough: + summary += " · env: `" + " ".join(passthrough) + "`" + + with open(os.environ.get("GITHUB_OUTPUT", "/dev/stdout"), "a") as f: + f.write(f"matrix={json.dumps(cells)}\n") + f.write(f"mode={mode}\ntier={tier}\nshard={shard}\n") + f.write(f"config-summary={summary}\n") + f.write("passthrough-env< a.threshold: + cell += " ⚠️"; n_reg += 1 + elif dp < -a.threshold: + cell += " 🟢"; n_imp += 1 + if worst is None or dp > worst[0]: + worst = (dp, n) + cells += [_human(mv), _human(pv), cell] + rows.append("| " + " | ".join(cells) + " |") + + out = ([title, ""] if title else []) + rows + [""] + s = (f"_{len(names)} constants · {n_reg} regressed · {n_imp} improved " + f"(|Δ| > {a.threshold:g}% on `{primary}`)._") + if worst and worst[0] is not None and worst[0] > a.threshold: + s += f" Worst: `{worst[1]}` {worst[0]:+.1f}%." + out.append(s) + emit("\n".join(out)) + + +# ───────────────────────── comment ───────────────────────── +def cmd_comment(a): + commit = f"[`{a.head[:7]}`]({a.repo_url}/commit/{a.head})" + parts = [f"## `!benchmark` — main vs {commit}", "", a.summary, ""] + tables = sorted(glob.glob(os.path.join(a.tables, "table-*.md"))) + if tables: + for t in tables: + parts += [open(t).read().rstrip(), ""] + else: + parts += ["_No result tables were produced — see the workflow logs._", ""] + parts.append(f"[Workflow logs]({a.repo_url}/actions/runs/{a.run_id})") + open(a.out, "w").write("\n".join(parts) + "\n") + print(open(a.out).read()) + + +# ────────────────────────── cli ────────────────────────── +def main(): + ap = argparse.ArgumentParser(description=__doc__) + sub = ap.add_subparsers(dest="cmd", required=True) + + sub.add_parser("parse").set_defaults(fn=cmd_parse) + + m = sub.add_parser("manifest") + m.add_argument("--csv", required=True); m.add_argument("--env", required=True) + m.add_argument("--mode", required=True); m.add_argument("--tier", default="") + m.add_argument("--shard", default="0"); m.add_argument("--out", required=True) + m.set_defaults(fn=cmd_manifest) + + c = sub.add_parser("compare") + c.add_argument("--main", required=True); c.add_argument("--pr", required=True) + c.add_argument("--metric", action="append", default=[]) + c.add_argument("--threshold", type=float, default=3.0) + c.add_argument("--title"); c.add_argument("--backend"); c.add_argument("--env") + c.add_argument("--mode"); c.add_argument("--count"); c.add_argument("--cache-hit", default="") + c.add_argument("--out") + c.set_defaults(fn=cmd_compare) + + cm = sub.add_parser("comment") + cm.add_argument("--tables", required=True); cm.add_argument("--summary", default="") + cm.add_argument("--head", required=True); cm.add_argument("--repo-url", required=True) + cm.add_argument("--run-id", required=True); cm.add_argument("--out", required=True) + cm.set_defaults(fn=cmd_comment) + + a = ap.parse_args() + a.fn(a) + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh new file mode 100644 index 00000000..045a1f4f --- /dev/null +++ b/.github/scripts/run.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash +# Compile one library env to a `.ixe` from a checked-out repo, then benchmark the +# given backend over a manifest, emitting the neutral results JSON +# { "": { "": , ... }, ... } +# that bench.py compare consumes. +# +# run.sh +# repo_dir : checked-out worktree (has .lake/build/bin/{ix,bench-typecheck}) +# env : initStd | lean | mathlib +# backend : aiur | zisk | sp1 +# mode : execute | prove +# +# `ix` / `bench-typecheck` are taken from (so base measures base's +# code, PR the PR's — the caller puts /.lake/build/bin on PATH). The +# zkVM hosts run from /. Per-constant subprocesses for the +# zkVM hosts so one OOM/timeout drops only that row. +set -uo pipefail + +repo=${1:?repo_dir}; benv=${2:?env}; backend=${3:?backend}; mode=${4:?mode} +names=${5:?names}; out=${6:?out} +# Absolute repo path: the zkVM branch cd's into the host workspace, so the .ixe +# path passed to the host must not be relative to the original cwd. +repo=$(cd "$repo" && pwd) +: > "$out" +emit_empty() { [ -s "$out" ] || echo '{}' > "$out"; } + +case "$benv" in + initStd) module=CompileInitStd ;; + lean) module=CompileLean ;; + mathlib) module=CompileMathlib ;; + *) echo "unknown env: $benv" >&2; exit 2 ;; +esac + +ixe="$repo/$benv.ixe" +if [ "${REUSE_IXE:-0}" = 1 ] && [ -f "$ixe" ]; then + echo "reusing existing $ixe (REUSE_IXE)" +else + echo "::group::ix compile $module → $benv.ixe ($backend/$mode)" + "$repo/.lake/build/bin/ix" compile "$repo/Benchmarks/Compile/$module.lean" --out "$ixe" + echo "::endgroup::" +fi + +case "$backend" in + aiur) + # bench-typecheck runs Phase 1 (execute) always; Phase 2 (prove) unless + # --execute-only. It writes the neutral JSON itself via --json. + args=(--ixe "$ixe" --manifest "$names" --json "$out") + if [ "$mode" = execute ]; then + bench-typecheck "${args[@]}" --execute-only || true + else + bench-typecheck "${args[@]}" --texray 2> tx.log || true + # Fold texray's proving RSS high-water mark into every entry (max over + # spans; $2+0 stops at the first non-digit) — same parse as aiur-bench.yml. + rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>m {m=$2+0} END {if (m>0) print m}' tx.log) + if [ -n "${rss:-}" ] && [ -s "$out" ]; then + jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' "$out" > "$out.t" \ + && mv "$out.t" "$out" || true + fi + fi + emit_empty + ;; + + zisk|sp1) + host="${backend}-host"; work="$repo/$backend" + # Build the host once so per-constant timing excludes compilation, and run the + # binary directly under `/usr/bin/time` — a `timeout`/`cargo run` wrapper would + # report the wrapper's RSS, not the host's. (No per-constant timeout in execute + # mode; the job-level timeout bounds a hang.) + echo "::group::cargo build $host" + ( cd "$work" && cargo build --quiet --release --bin "$host" ) + echo "::endgroup::" + bin="$work/target/release/$host" + # ZisK's ASM microservices mmap the ROM with MAP_LOCKED, needing unlimited + # locked memory (the Zisk book's DefaultLimitMEMLOCK=infinity). The warp + # runner caps the memlock hard limit and can't be rebooted, so raise it + # in-session as root; the host children inherit it. Without this the ASM + # services die with `mmap(rom) errno=11`. SP1 needs no such raise. + [ "$backend" = zisk ] && sudo prlimit --pid $$ --memlock=unlimited:unlimited + tmp=$(mktemp -d) + while IFS= read -r c; do + [ -z "$c" ] && continue + slug=$(printf '%s' "$c" | tr '/ .:' '____') + log="$tmp/$slug.out"; tmf="$tmp/$slug.time" + if [ "$mode" = execute ]; then + # `/usr/bin/time -f '%e %M'` → elapsed seconds + max RSS (kB). + ( cd "$work" && /usr/bin/time -f '%e %M' -o "$tmf" \ + "$bin" --execute --ixe "$ixe" --constant "$c" --skip-deps ) > "$log" 2>>"$log" \ + || { echo "::warning::$backend execute '$c' failed; dropping"; continue; } + fail=$(grep -oE 'failures[:=] ?[0-9]+' "$log" | head -1 | grep -oE '[0-9]+') + if [ "${fail:-1}" != 0 ]; then + echo "::warning::$backend '$c': nonzero/missing failures; dropping"; continue + fi + # Total cycles: sharded prints "total cycles: N", single prints "cycles: N". + cyc=$(grep -oE 'total cycles: [0-9]+' "$log" | tail -1 | grep -oE '[0-9]+') + [ -z "$cyc" ] && cyc=$(grep -oE 'cycles: [0-9]+' "$log" | tail -1 | grep -oE '[0-9]+') + [ -z "$cyc" ] && { echo "::warning::$backend '$c': no cycle count; dropping"; continue; } + secs=$(awk 'NR==1{print $1}' "$tmf"); rssk=$(awk 'NR==1{print $2}' "$tmf") + rss=$(( ${rssk:-0} * 1024 )) + tput=$(awk -v c="$cyc" -v s="${secs:-0}" 'BEGIN{ if (s>0) printf "%.0f", c/s; else print 0 }') + # Per-shard "cycles=" lines appear only for sharded runs. + mapfile -t sh < <(grep -oE 'cycles=[0-9]+' "$log" | grep -oE '[0-9]+') + base="cycles:\$cyc, \"execute-time\":\$secs, throughput:\$tput, \"peak-rss\":\$rss" + if [ "${#sh[@]}" -gt 0 ]; then + maxsh=$(printf '%s\n' "${sh[@]}" | sort -n | tail -1) + jq -n --arg n "$c" --argjson cyc "$cyc" --argjson secs "${secs:-0}" \ + --argjson tput "$tput" --argjson rss "$rss" \ + --argjson nsh "${#sh[@]}" --argjson maxsh "$maxsh" \ + "{(\$n): {$base, shards:\$nsh, \"max-shard-cycles\":\$maxsh}}" + else + jq -n --arg n "$c" --argjson cyc "$cyc" --argjson secs "${secs:-0}" \ + --argjson tput "$tput" --argjson rss "$rss" \ + "{(\$n): {$base}}" + fi + else + # prove (single-leaf, GPU runner only — the workflow gates this cell). + ( cd "$work" && timeout 60m cargo run --quiet --release --bin "$host" -- \ + --gpu --ixe "$ixe" --constant "$c" --skip-deps ) > "$log" 2>&1 \ + || { echo "::warning::$backend prove '$c' failed/timed out; dropping"; continue; } + secs=$(grep -oE 'prove [0-9.]+s' "$log" | head -1 | grep -oE '[0-9.]+') + steps=$(grep -oE '\([0-9]+ steps\)' "$log" | head -1 | grep -oE '[0-9]+') + fail=$(grep -oE 'failures=[0-9]+' "$log" | head -1 | grep -oE '[0-9]+') + if [ -n "${secs:-}" ] && [ "${fail:-1}" = 0 ]; then + jq -n --arg n "$c" --argjson t "$secs" --argjson s "${steps:-0}" \ + '{($n): {"prove-time": $t, steps: $s}}' + else + echo "::warning::$backend prove '$c': no clean prove line; dropping" + fi + fi + done < "$names" | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null + emit_empty + ;; + + *) echo "unknown backend: $backend" >&2; exit 2 ;; +esac +echo "rows in $out: $(jq 'length' "$out" 2>/dev/null || echo '?')" diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml index 957b51b3..734bf787 100644 --- a/.github/workflows/bench-main.yml +++ b/.github/workflows/bench-main.yml @@ -1,14 +1,19 @@ -name: Aiur benchmarks +name: Benchmarks -# One workflow, two benchmarks per library env, on every push to main: -# 1. compile job — `ix compile` the Lean env to a `.ixe` (compile-throughput -# metrics) and cache the `.ixe`. -# 2. prove job — restore that `.ixe` from the cache (no recompile) and -# STARK-check selected constants over it via bench-typecheck -# (Aiur execute + prove metrics). -# The prove job reuses the exact `.ixe` the compile job built, so the compiler -# runs once. Compile and prove report to separate bencher testbeds so each one's -# `--thresholds-reset` only touches its own measures. +# Benchmarks tracked on Bencher on every push to main, all reusing the one +# compiled `.ixe` so the compiler runs once: +# 1. compile — `ix compile` the Lean env to a `.ixe` (compile-throughput +# metrics) and cache the `.ixe`. +# 2. prove — restore that `.ixe` (no recompile) and STARK-check selected +# constants over it via bench-typecheck (Aiur execute + prove). +# 3. zkvm-execute — restore that `.ixe` and execute the same constants through +# the Zisk and SP1 zkVM hosts (deterministic cycle counts + +# time/throughput/RAM; proving needs a GPU, so execute-only). +# 4. native-check — restore that `.ixe` and run the native Rust kernel (the same +# kernel, out-of-circuit and parallel — far faster) over the +# whole env via `ix check --anon`, tracking throughput. +# Each job reports to its own bencher testbed/workload so a threshold reset only +# touches its own measures. on: push: @@ -272,3 +277,161 @@ jobs: --threshold-measure throughput --threshold-test percentage --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _ --threshold-lower-boundary 0.10 + + # Execute the same constants through the Zisk and SP1 zkVM hosts and track + # cycles / execute-time / throughput / peak-rss (and shards / max-shard-cycles + # for any sharded run). Lean-free: reuses the compile job's cached `.ixe` and + # only builds the Rust host. zkVM proving needs a GPU (absent here), so this is + # execute-only. Toolchain + deps come from the shared install-{zisk,sp1} actions. + zkvm-execute: + needs: compile + runs-on: warp-ubuntu-latest-x64-16x + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + backend: [zisk, sp1] + include: + - bench: InitStd + consts: Nat.add_comm Nat.sub_le_of_le_add String.append Array.append_assoc + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + fetch-tags: true # bencher-track reads the baseline-reset tag + - uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + cache-workspaces: ${{ matrix.backend }} + - name: Install Zisk + if: matrix.backend == 'zisk' + uses: ./.github/actions/install-zisk + - name: Install SP1 + if: matrix.backend == 'sp1' + uses: ./.github/actions/install-sp1 + # Pull the `.ixe` the compile job built — no recompile (REUSE_IXE). + - uses: actions/cache/restore@v5 + with: + path: ${{ matrix.bench }}.ixe + key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }} + fail-on-cache-miss: true + - name: Run ${{ matrix.backend }} execute benchmark + env: + REUSE_IXE: "1" + run: | + printf '%s\n' ${{ matrix.consts }} > names.txt + bash .github/scripts/run.sh . ${{ matrix.bench }} ${{ matrix.backend }} execute \ + names.txt neutral.json + # Wrap neutral { name: { metric: v } } → Bencher Metric Format. + jq 'map_values(map_values({value: .}))' neutral.json > bench.json + cat bench.json + # cycles / shards / max-shard-cycles are deterministic per guest ELF → + # pinned (0/0). execute-time / peak-rss / throughput are noisy wall-clock → + # percentage bounds (throughput's regression is a drop). + - uses: ./.github/actions/bencher-track + with: + testbed: ${{ matrix.backend }}-execute-x64-16x + workload: ${{ matrix.backend }} + file: bench.json + key: ${{ secrets.BENCHER_API_KEY }} + github-token: ${{ secrets.GITHUB_TOKEN }} + thresholds: | + --threshold-measure cycles --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0 + --threshold-lower-boundary 0 + --threshold-measure shards --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0 + --threshold-lower-boundary 0 + --threshold-measure max-shard-cycles --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0 + --threshold-lower-boundary 0 + --threshold-measure execute-time --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10 + --threshold-lower-boundary _ + --threshold-measure peak-rss --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10 + --threshold-lower-boundary _ + --threshold-measure throughput --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _ + --threshold-lower-boundary 0.10 + + # Native Rust kernel typecheck — the same kernel as the zkVM guest, but run + # out-of-circuit and in parallel (`--workers` defaults to the core count), so + # far faster than proving. Checks the whole env via `ix check --anon`, tracking + # throughput (constants/sec), wall time, and peak RAM. Reuses the compile job's + # cached `.ixe` and the staged `ix` binary — no recompile. + native-check: + needs: compile + runs-on: warp-ubuntu-latest-x64-32x + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + bench: [InitStd, Mathlib] + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + fetch-tags: true # bencher-track reads the baseline-reset tag + - uses: actions/cache/restore@v5 + with: + path: ~/.local/bin + key: aiur-bench-bins-${{ github.sha }} + - run: echo "$HOME/.local/bin" >> $GITHUB_PATH + # Provision the toolchain so `ix` finds libleanshared (no package build). + - uses: leanprover/lean-action@v1 + with: + auto-config: false + build: false + use-github-cache: false + - name: Install GNU time + run: | + sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true + sudo apt-get update && sudo apt-get install -y time + - uses: actions/cache/restore@v5 + with: + path: ${{ matrix.bench }}.ixe + key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }} + fail-on-cache-miss: true + # `ix check --anon` checks every kernel-checkable address across + # available_parallelism workers and prints a machine-readable + # `##check## ` line. ix check is a + # single multi-threaded process, so `/usr/bin/time -f '%M'` is the true peak RSS. + - name: Run native kernel check + run: | + /usr/bin/time -f '%e %M' -o time.txt \ + ix check ${{ matrix.bench }}.ixe --anon 2>&1 | tee out.txt + line=$(grep '^##check##' out.txt | tail -1) + elapsed_ms=$(echo "$line" | awk '{print $2}') + failures=$(echo "$line" | awk '{print $4}') + total=$(echo "$line" | awk '{print $5}') + [ "${failures:-1}" = 0 ] || { echo "kernel check reported ${failures:-?} failure(s)"; exit 1; } + rssk=$(awk 'NR==1{print $2}' time.txt); rss=$(( ${rssk:-0} * 1024 )) + check_s=$(awk -v e="$elapsed_ms" 'BEGIN{printf "%.3f", e/1000}') + tput=$(awk -v t="$total" -v e="$elapsed_ms" 'BEGIN{ if (e>0) printf "%.2f", t*1000/e; else print 0 }') + jq -n --arg n "${{ matrix.bench }}" --argjson c "$total" --argjson s "$check_s" \ + --argjson tp "$tput" --argjson rss "$rss" \ + '{($n): {constants:{value:$c}, "check-time":{value:$s}, throughput:{value:$tp}, "peak-rss":{value:$rss}}}' \ + > bench.json + cat bench.json + # constants is deterministic → pinned (0/0); check-time / throughput / + # peak-rss are noisy parallel wall-clock → percentage bounds. + - uses: ./.github/actions/bencher-track + with: + testbed: native-check-x64-32x + workload: native-check + file: bench.json + key: ${{ secrets.BENCHER_API_KEY }} + github-token: ${{ secrets.GITHUB_TOKEN }} + thresholds: | + --threshold-measure constants --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0 + --threshold-lower-boundary 0 + --threshold-measure check-time --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10 + --threshold-lower-boundary _ + --threshold-measure throughput --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _ + --threshold-lower-boundary 0.10 + --threshold-measure peak-rss --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10 + --threshold-lower-boundary _ diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index a2a1823c..c623d432 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -1,4 +1,17 @@ -# Creates a PR benchmark comment with a comparison to main +# `!benchmark` PR command: run the curated constant set (Benchmarks/Vectors.csv) +# through chosen prover backend(s) and post a main-vs-PR comparison table. +# +# !benchmark [aiur] [zisk] [sp1 | all] [execute|prove] +# BENCH_ENVS=initStd,mathlib # which compiled envs (default initStd) +# BENCH_TIER=cheap|heavy|all # override the mode default (execute=all, prove=cheap) +# BENCH_SHARD=1 # restrict to the multi-shard target constants +# BENCH_GPU=1 # allow zkVM prove on a self-hosted GPU runner +# RUST_LOG=info # passthrough env (allowlisted) +# +# Per-PR scope: Aiur runs execute (fast, --execute-only) or prove on CPU; the +# Zisk/SP1 zkVM hosts run execute (deterministic cycle counts). zkVM prove needs +# a GPU and is skipped with a note unless BENCH_GPU=1 selects a GPU runner. main's +# numbers are cached by base SHA so they are not recomputed on every comment. name: Benchmark pull requests on: @@ -11,145 +24,220 @@ permissions: pull-requests: read concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + group: ${{ github.workflow }}-${{ github.event.issue.number }} cancel-in-progress: true jobs: setup: - name: Comparative PR benchmark comment - if: - github.event.issue.pull_request - && github.event.issue.state == 'open' - && (contains(github.event.comment.body, '!benchmark')) - && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') + name: Parse !benchmark comment + if: >- + github.event.issue.pull_request && + github.event.issue.state == 'open' && + contains(github.event.comment.body, '!benchmark') && + (github.event.comment.author_association == 'MEMBER' || + github.event.comment.author_association == 'OWNER') runs-on: ubuntu-latest outputs: - benches: ${{ steps.bench-params.outputs.benches }} - env-vars: ${{ steps.bench-params.outputs.env-vars }} + matrix: ${{ steps.parse.outputs.matrix }} + mode: ${{ steps.parse.outputs.mode }} + tier: ${{ steps.parse.outputs.tier }} + shard: ${{ steps.parse.outputs.shard }} + passthrough-env: ${{ steps.parse.outputs.passthrough-env }} + config-summary: ${{ steps.parse.outputs.config-summary }} + base-sha: ${{ steps.comment-branch.outputs.base_sha }} + head-sha: ${{ steps.comment-branch.outputs.head_sha }} steps: - uses: actions/checkout@v6 - - name: Parse PR comment body - id: bench-params - run: | - # Parse `issue_comment` body - printf '${{ github.event.comment.body }}' > comment.txt - BENCH_COMMAND=$(head -n 1 comment.txt | tr -d '\r') - echo "$BENCH_COMMAND" - - BENCHES=$(echo $BENCH_COMMAND | awk -F'!benchmark ' '{ print $2 }') - # Set default benches to run if none specified - BENCHES=${BENCHES:-"bench-aiur"} - echo "BENCHES:" - echo "$BENCHES" - JSON=$(echo $BENCHES | jq -R -c 'split(" ")') - - echo "JSON:" - echo "$JSON" - - echo "benches=$JSON" | tee -a $GITHUB_OUTPUT - - # Can't persist env vars between jobs, so we pass them as an output and set them in the next job - echo "env-vars=$(tail -n +2 comment.txt | tr -d '\r' | tr '\n' ' ')" | tee -a $GITHUB_OUTPUT + # Resolve the PR's base/head SHAs from the comment's issue. + - uses: xt0rted/pull-request-comment-branch@v3 + id: comment-branch + # Parse the comment from an env var (never inline-interpolated) → matrix + + # config. The allowlist drops anything that isn't a known flag/env key. + - name: Parse comment + id: parse + env: + COMMENT_BODY: ${{ github.event.comment.body }} + run: python3 .github/scripts/bench.py parse benchmark: - needs: [ setup ] - runs-on: warp-ubuntu-latest-x64-16x + needs: setup + runs-on: ${{ matrix.cell.runner }} + timeout-minutes: 120 strategy: + fail-fast: false matrix: - # Runs a job for each benchmark specified in the `issue_comment` input - bench: ${{ fromJSON(needs.setup.outputs.benches) }} + cell: ${{ fromJSON(needs.setup.outputs.matrix) }} + env: + BACKEND: ${{ matrix.cell.backend }} + BENV: ${{ matrix.cell.env }} + MODE: ${{ matrix.cell.mode }} + LABEL: ${{ matrix.cell.label }} + BASE_SHA: ${{ needs.setup.outputs.base-sha }} + HEAD_SHA: ${{ needs.setup.outputs.head-sha }} + TIER: ${{ needs.setup.outputs.tier }} + SHARD: ${{ needs.setup.outputs.shard }} steps: - - name: Set env vars + # ---------- skipped cell (zkVM prove without a GPU runner) ---------- + - name: Skip note + if: matrix.cell.skip == 'true' run: | - # Overrides default env vars with those specified in the `issue_comment` input if identically named - for var in ${{ needs.setup.outputs.env-vars }} - do - echo "$var" | tee -a $GITHUB_ENV - done - - uses: actions/checkout@v6 - # Get base branch of the PR - - uses: xt0rted/pull-request-comment-branch@v3 - id: comment-branch - - name: Checkout base branch + mkdir -p out + { + echo "### \`$BACKEND\` · \`$BENV\` · \`$MODE\`" + echo + echo "_Skipped: zkVM proving needs a GPU runner. Re-run with \`BENCH_GPU=1\` on a GPU-enabled runner._" + } > "out/table-$LABEL.md" + + # ---------- real cell ---------- + # PR checked out at the workspace root so the local install actions and the + # helper scripts resolve; base (cache-miss only) goes under base/. + - name: Checkout PR + if: matrix.cell.skip != 'true' uses: actions/checkout@v6 with: - ref: ${{ steps.comment-branch.outputs.base_sha }} - path : ${{ github.workspace }}/base - - name: Run `lake build` on base branch + ref: ${{ env.HEAD_SHA }} + - name: Apply passthrough env + if: matrix.cell.skip != 'true' + run: | + while IFS= read -r line; do + [ -n "$line" ] && printf '%s\n' "$line" >> "$GITHUB_ENV" + done <<'PTENV' + ${{ needs.setup.outputs.passthrough-env }} + PTENV + # Select the constants for this cell → names.txt; emit count/vhash/tier + # (vhash is part of the main cache key, so editing Vectors.csv invalidates + # stale main results). + - name: Select constants from Benchmarks/Vectors.csv + if: matrix.cell.skip != 'true' + id: man + run: | + python3 .github/scripts/bench.py manifest \ + --csv Benchmarks/Vectors.csv --env "$BENV" --mode "$MODE" \ + --tier "$TIER" --shard "$SHARD" --out "$GITHUB_WORKSPACE/names.txt" \ + | tee -a "$GITHUB_OUTPUT" + + # Lean toolchain + build `ix` and `bench-typecheck` for the PR side (every + # backend needs `ix` to compile the env to a `.ixe`). Mathlib cache only + # pulled for the mathlib env. + - name: Build PR (ix, bench-typecheck) + if: matrix.cell.skip != 'true' uses: leanprover/lean-action@v1 with: - lake-package-directory: ${{ github.workspace }}/base - test: false - - name: Run bench on base branch - run: | - if $(lake run get-exe-targets | grep -q ${{ matrix.bench }}); then - lake exe ${{ matrix.bench }} - else - echo "No matching bench target found on base branch" - fi - working-directory: ${{ github.workspace }}/base - - name: Checkout PR branch + lake-package-directory: . + auto-config: false + build: true + build-args: "ix bench-typecheck" + use-github-cache: false + use-mathlib-cache: ${{ matrix.cell.env == 'mathlib' && 'true' || 'false' }} + # zkVM cells additionally need the Rust toolchain + the backend's toolchain + # and system deps (the shared composite install actions). + - name: Set up zkVM Rust toolchain + if: matrix.cell.skip != 'true' && matrix.cell.backend != 'aiur' + uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + cache-workspaces: ${{ matrix.cell.backend }} + - name: Install SP1 + if: matrix.cell.skip != 'true' && matrix.cell.backend == 'sp1' + uses: ./.github/actions/install-sp1 + - name: Install Zisk + if: matrix.cell.skip != 'true' && matrix.cell.backend == 'zisk' + uses: ./.github/actions/install-zisk + + # ---------- main side (cached by base SHA) ---------- + - name: Restore cached main results + if: matrix.cell.skip != 'true' + id: main-cache + uses: actions/cache/restore@v4 + with: + path: main.json + key: bench-${{ matrix.cell.label }}-${{ env.BASE_SHA }}-${{ steps.man.outputs.vhash }} + - name: Checkout base + if: matrix.cell.skip != 'true' && steps.main-cache.outputs.cache-hit != 'true' uses: actions/checkout@v6 with: - path: ${{ github.workspace }}/pr - ref: ${{ steps.comment-branch.outputs.head_sha }} - - name: Run `lake build` on PR branch + ref: ${{ env.BASE_SHA }} + path: base + - name: Build base (ix, bench-typecheck) + if: matrix.cell.skip != 'true' && steps.main-cache.outputs.cache-hit != 'true' uses: leanprover/lean-action@v1 with: - lake-package-directory: ${{ github.workspace }}/pr - test: false - - name: Copy base benchmarks into PR dir for comparison + lake-package-directory: base + auto-config: false + build: true + build-args: "ix bench-typecheck" + use-github-cache: false + use-mathlib-cache: ${{ matrix.cell.env == 'mathlib' && 'true' || 'false' }} + - name: Run backend on base → main.json + if: matrix.cell.skip != 'true' && steps.main-cache.outputs.cache-hit != 'true' + run: | + export PATH="$PWD/base/.lake/build/bin:$PATH" + bash .github/scripts/run.sh base "$BENV" "$BACKEND" "$MODE" \ + "$GITHUB_WORKSPACE/names.txt" "$GITHUB_WORKSPACE/main.json" + - name: Save cached main results + if: matrix.cell.skip != 'true' && steps.main-cache.outputs.cache-hit != 'true' + uses: actions/cache/save@v4 + with: + path: main.json + key: bench-${{ matrix.cell.label }}-${{ env.BASE_SHA }}-${{ steps.man.outputs.vhash }} + + # ---------- PR side ---------- + - name: Run backend on PR → pr.json + if: matrix.cell.skip != 'true' run: | - BENCH_DIR_PR=${{ github.workspace }}/pr/.lake/benches - BENCH_DIR_BASE=${{ github.workspace }}/base/.lake/benches - mkdir -p $BENCH_DIR_PR - [ -d "$BENCH_DIR_BASE" ] && cp -r $BENCH_DIR_BASE/. $BENCH_DIR_PR/ - ls $BENCH_DIR_PR - - name: Run bench on PR branch and generate comparison report + export PATH="$PWD/.lake/build/bin:$PATH" + bash .github/scripts/run.sh . "$BENV" "$BACKEND" "$MODE" \ + "$GITHUB_WORKSPACE/names.txt" "$GITHUB_WORKSPACE/pr.json" + + # ---------- compare ---------- + - name: Build comparison table + if: matrix.cell.skip != 'true' run: | - BENCH_REPORT=1 lake exe ${{ matrix.bench }} - working-directory: ${{ github.workspace }}/pr - - name: Get env for PR body + mkdir -p out + python3 .github/scripts/bench.py compare \ + --main main.json --pr pr.json --out "out/table-$LABEL.md" \ + --backend "$BACKEND" --env "$BENV" --mode "$MODE" \ + --count "${{ steps.man.outputs.count }}" \ + --cache-hit "${{ steps.main-cache.outputs.cache-hit }}" + cat "out/table-$LABEL.md" + + - name: Upload table if: always() + uses: actions/upload-artifact@v4 + with: + name: table-${{ env.LABEL }} + path: out/table-${{ env.LABEL }}.md + if-no-files-found: warn + + comment: + needs: [setup, benchmark] + if: always() && needs.setup.result == 'success' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Download tables + uses: actions/download-artifact@v4 + with: + path: tables + pattern: table-* + merge-multiple: true + - name: Build comment body + env: + SUMMARY: ${{ needs.setup.outputs.config-summary }} run: | - SHORT_SHA_PR=$(git rev-parse --short HEAD) - REPO_URL=${{ github.server_url }}/${{ github.repository }} - echo "COMMIT_LINK=[\`$SHORT_SHA_PR\`]($REPO_URL/commit/${{ steps.comment-branch.outputs.head_sha }})" | tee -a $GITHUB_ENV - echo "WORKFLOW_LINK=[Workflow logs]($REPO_URL/actions/runs/${{ github.run_id }})" | tee -a $GITHUB_ENV - working-directory: ${{ github.workspace }}/pr + python3 .github/scripts/bench.py comment \ + --tables tables --summary "$SUMMARY" \ + --head "${{ needs.setup.outputs.head-sha }}" \ + --repo-url "${{ github.server_url }}/${{ github.repository }}" \ + --run-id "${{ github.run_id }}" --out comment-body.md - name: Generate token to write PR comment - uses: actions/create-github-app-token@v3 - if: always() id: app-token + uses: actions/create-github-app-token@v3 with: app-id: ${{ secrets.TOKEN_APP_ID }} private-key: ${{ secrets.TOKEN_APP_PRIVATE_KEY }} - - name: Build benchmark comment body - if: success() - run: | - { - echo '## Benchmark for `${{ matrix.bench }}` at ${{ env.COMMIT_LINK }}'; - echo ""; - for file in .lake/benches/*/report.md; do - [ -f "$file" ] && cat "$file" && echo "" - done - echo "${{ env.WORKFLOW_LINK }}"; - } > ${{ github.workspace }}/comment-body.md - working-directory: ${{ github.workspace }}/pr - - name: Comment on successful run - if: success() + - name: Post / update comment uses: peter-evans/create-or-update-comment@v5 with: token: ${{ steps.app-token.outputs.token }} issue-number: ${{ github.event.issue.number }} - body-path: 'comment-body.md' - - name: Comment on failing run - if: failure() - uses: peter-evans/create-or-update-comment@v5 - with: - token: ${{ steps.app-token.outputs.token }} - issue-number: ${{ github.event.issue.number }} - body: | - ## Benchmark for `${{ matrix.bench }}` at ${{ env.COMMIT_LINK }} failed :x: - - [Workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) + body-path: comment-body.md diff --git a/.github/workflows/riscv-bench.yml b/.github/workflows/riscv-bench.yml index d9ec22b7..79ab4950 100644 --- a/.github/workflows/riscv-bench.yml +++ b/.github/workflows/riscv-bench.yml @@ -60,23 +60,10 @@ jobs: - uses: actions-rust-lang/setup-rust-toolchain@v1 with: cache-workspaces: sp1 - - name: Install system build deps - run: | - sudo apt-get update - sudo apt-get install -y \ - xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \ - nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \ - libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \ - openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \ - pkg-config libssl-dev + - uses: ./.github/actions/install-sp1 - uses: actions/download-artifact@v4 with: name: minimal-ixe - - name: Install SP1 toolchain (sp1up, latest) - run: | - curl -L https://sp1up.succinct.xyz | bash - ~/.sp1/bin/sp1up - echo "$HOME/.sp1/bin" >> "$GITHUB_PATH" # The precompile-aware SP1 runner-binary is auto-built from the fork git # dep by `sp1-core-executor-runner`'s build script — no manual override. - name: SP1 — execute minimal.ixe (assert failures == 0) @@ -94,58 +81,10 @@ jobs: - uses: actions-rust-lang/setup-rust-toolchain@v1 with: cache-workspaces: zisk - - name: Install system build deps - run: | - sudo apt-get update - sudo apt-get install -y \ - xz-utils jq curl build-essential qemu-system libomp-dev libgmp-dev \ - nlohmann-json3-dev protobuf-compiler uuid-dev libgrpc++-dev \ - libsecp256k1-dev libsodium-dev libpqxx-dev nasm libopenmpi-dev \ - openmpi-bin openmpi-common libclang-dev clang gcc-riscv64-unknown-elf \ - pkg-config libssl-dev + - uses: ./.github/actions/install-zisk - uses: actions/download-artifact@v4 with: name: minimal-ixe - - name: Install Zisk toolchain (ziskup, pinned v0.18.0) - # `--version 0.18.0` pins the toolchain to match our deps. Our host links - # the argumentcomputer/zisk `blake3-precompile` fork, which is based on - # v0.18.0 (its cargo-zisk has `check-setup`, used below to regenerate the - # key's const-trees). Without the pin, ziskup installs `releases/latest`, - # which resolves to upstream `v1.0.0-alpha` — a different circuit whose - # cargo-zisk dropped the `check-setup` subcommand, breaking the key step. - # `--cpu` picks the CPU build (no GPU on the runner) and `--nokey` skips - # ziskup's key install — both avoid its interactive /dev/tty prompts. We - # keep `--nokey` because the upstream `zisk-setup` bucket only carries the - # upstream circuit's key; our fork has a different circuit (extra Blake3f - # AIR), so we restore the fork-matching key from our own S3 in the next - # step. `--prefix $HOME/.zisk` pins the install where cargo-zisk's - # ZiskPaths fallback looks (the runner sets XDG_CONFIG_HOME, which would - # otherwise relocate it). - run: | - curl -L https://raw.githubusercontent.com/0xPolygonHermez/zisk/main/ziskup/install.sh \ - | bash -s -- --cpu --nokey -y --version 0.18.0 --prefix "$HOME/.zisk" - echo "$HOME/.zisk/bin" >> "$GITHUB_PATH" - # Execute still needs a proving key present: zisk-host calls - # `client.setup()` (which the SDK runs before the execute branch), and that - # loads the circuit's const-tree files. We host the fork-matching key in a - # public S3 bucket WITHOUT the const-trees — exactly like Zisk's released - # `zisk-provingkey-*.tar.gz` on `storage.googleapis.com/zisk-setup` — and - # regenerate them here with `cargo-zisk check-setup -a`, which is how - # `ziskup` itself populates them. That keeps the artifact ~3 GB (gzip) - # instead of ~48 GB. The object name carries the fork rev so a circuit - # change can't silently reuse a stale key. Public bucket → plain curl, no - # AWS creds. - - name: Restore Zisk proving key (fork circuit) from S3 - run: | - mkdir -p "$HOME/.zisk" - curl -fSL --retry 3 \ - https://argument-zisk-setup.s3.amazonaws.com/zisk-provingkey-blake3-8f9e24d5-cpu.tar.gz \ - -o /tmp/zisk-provingkey.tar.gz - tar -C "$HOME/.zisk" -xzf /tmp/zisk-provingkey.tar.gz - rm -f /tmp/zisk-provingkey.tar.gz - # Regenerate the const-tree files omitted from the artifact (CPU build, - # so no --gpu). This is the "may take a while" step ziskup prints. - cargo-zisk check-setup --proving-key "$HOME/.zisk/provingKey" -a - name: Zisk — execute minimal.ixe (assert failures == 0) run: | cd zisk diff --git a/Benchmarks/Typecheck.lean b/Benchmarks/Typecheck.lean index 047cbec7..1196c055 100644 --- a/Benchmarks/Typecheck.lean +++ b/Benchmarks/Typecheck.lean @@ -18,25 +18,34 @@ runtime. Useful standalone (per-constant timeline + RAM breakdown via tracing-texray) and as a machine source (neutral results JSON). ``` -lake exe bench-typecheck --ixe [names…] [flags] +lake exe bench-typecheck --ixe [--constant ] [names…] [flags] --ixe serialized `Ixon.Env`, e.g. from `ix compile Foo.lean` (writes `foo.ixe`). Required. - [names…] zero or more fully-qualified constant names to benchmark, + --constant constant to benchmark, by fully-qualified Lean name. The + canonical single-target flag, shared with the Zisk + `zisk-host --constant`. Unions with names / manifest. + [names…] zero or more additional constant names to benchmark, e.g. `Nat.add_comm String.append`. --manifest additionally read names from a file: one per line, blank lines and `#` comments ignored. Unions with [names…]. - Names (from either source) resolve against the env's named map via + Names (from any source) resolve against the env's named map via `String.toName` plus a `toString` fallback (mirrors `ix check --ixe`), so numeric / private components round-trip (`Foo.0.Bar`, `_private.M.0.foo`). - Pass at least one name or a `--manifest`. + Pass at least one name via --constant, positional args, or --manifest. + --skip-deps check only each target itself (verify_const, trusting its + deps) instead of its whole transitive closure (verify_claim, + the default). Same flag as `zisk-host --skip-deps`; reserved + for targets too expensive to full-closure-check. --json write per-constant results JSON to . Off by default: normal CLI usage prints only the human-readable summary. --texray force the tracing-texray timeline + RAM breakdown on. --no-texray force it off. Default: on iff `--json` was NOT given, so a plain local run gets the breakdown while a JSON run stays quiet. + --execute-only run only Phase 1 (constants / fft-cost / execute-time) and skip + proving — the fast `execute`-mode signal. ``` For each constant the harness STARK-checks `Ix.Claim.check addr none` (the full @@ -130,7 +139,13 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do let some ixeArg := p.flag? "ixe" | IO.eprintln "error: --ixe is required"; return 1 let ixePath := ixeArg.as! String - -- Names come from the variadic positional args and/or a `--manifest` file. + -- Names come from `--constant`, the variadic positional args, and/or a + -- `--manifest` file. `--constant` is the canonical single-target flag shared + -- with the Zisk `zisk-host --constant`; positional names and `--manifest` + -- remain for benchmarking many constants at once. + let constName : Array String := match p.flag? "constant" with + | some f => #[f.as! String] + | none => #[] let cliNames := p.variableArgsAs! String let fileNames ← match p.flag? "manifest" with | some f => pure (parseManifest (← IO.FS.readFile (f.as! String))) @@ -139,16 +154,19 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do let nameArgs := Id.run do let mut seen : Std.HashSet String := {} let mut acc : Array String := #[] - for n in cliNames ++ fileNames do + for n in constName ++ cliNames ++ fileNames do if !seen.contains n then seen := seen.insert n; acc := acc.push n return acc if nameArgs.isEmpty then - IO.eprintln "error: provide one or more constant names and/or --manifest " + IO.eprintln "error: provide a constant via --constant, positional name(s), and/or --manifest " return 1 let jsonOut : Option String := (p.flag? "json").map (·.as! String) - -- subject-only: check just the target (`verify_const`, trusting its deps) + -- skip-deps: check just the target (`verify_const`, trusting its deps) -- instead of re-checking the whole transitive closure (`verify_claim`). - let subjectOnly := p.hasFlag "subject-only" + let skipDeps := p.hasFlag "skip-deps" + -- Execute-only: run just Phase 1 (constants / fft-cost / execute-time) and + -- skip the Phase 2 prove loop. + let executeOnly := p.hasFlag "execute-only" -- Default: trace iff we're not in JSON/bencher mode. let useTexray := if p.hasFlag "texray" then true @@ -160,7 +178,7 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do | throw (IO.userError "Merging IxVM kernel failed") let .ok compiled := toplevel.compile | throw (IO.userError "Compilation of IxVM kernel failed") - let entrypoint := if subjectOnly then `verify_const else `verify_claim + let entrypoint := if skipDeps then `verify_const else `verify_claim let some funIdx := compiled.getFuncIdx entrypoint | throw (IO.userError s!"{entrypoint} entrypoint missing") let aiurSystem := Aiur.AiurSystem.build compiled.bytecode commitmentParameters @@ -190,7 +208,7 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do -- Phase 1: execute every constant (cheap, deterministic structural metrics). -- For full-closure check claims, use `checkAddrWithEnv` against the - -- shared `envHandle`. For `--subject-only` (`buildVerifyConst`), the + -- shared `envHandle`. For `--skip-deps` (`buildVerifyConst`), the -- witness is a small subject-only blob — keep Lean witness + -- `executeIxVM`. IO.println "── Phase 1: execute (witness generation) ──" @@ -198,7 +216,7 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do for (label, addr) in targets do try let (res, execSec) ← timed fun _ => - if subjectOnly then + if skipDeps then let witness := IxVM.ClaimHarness.buildVerifyConst ixonEnv addr compiled.bytecode.executeIxVM funIdx witness.input witness.inputIOBuffer else @@ -223,6 +241,15 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do IO.FS.writeFile path (Json.mkObj (results.map Result.toJsonEntry).toList).pretty | none => pure () + -- `--execute-only`: stop after Phase 1; the results JSON (if requested) is + -- already complete with the execute metrics. + if executeOnly then + writeJson (execed.map (·.1)) + match jsonOut with + | some path => IO.println s!"wrote {execed.size} execute-only benchmarks to {path}" + | none => IO.println s!"executed {execed.size} constants (--execute-only); pass --json to emit results" + return 0 + -- Phase 2: prove cheap→expensive. Refine each entry with its prove-time as it -- lands. Install texray first so the prove spans (timeline + RAM Δ/peak) render. if useTexray then TracingTexray.init {} @@ -234,7 +261,7 @@ def runTypecheckCmd (p : Cli.Parsed) : IO UInt32 := do let (r, addr) := ordered[i]! try let (proveRes, proveSec) ← timed fun _ => - if subjectOnly then + if skipDeps then let witness := IxVM.ClaimHarness.buildVerifyConst ixonEnv addr let (claim, proof, ioBuf) := aiurSystem.proveIxVM friParameters funIdx witness.input witness.inputIOBuffer @@ -270,9 +297,11 @@ def typecheckCmd : Cli.Cmd := `[Cli| FLAGS: "ixe" : String; "Path to a serialized `Ixon.Env` (e.g. produced by `ix compile`). Required." + "constant" : String; "Constant to benchmark, by fully-qualified Lean name. The canonical single-target flag (shared with `zisk-host --constant`). Unions with positional names / --manifest." "manifest" : String; "Additionally read constant names from a file (one per line; `#` comments and blank lines ignored). Unions with the positional names." "json" : String; "Write per-constant results JSON to this path. Off by default; normal CLI usage prints only the human-readable summary." - "subject-only"; "Check only each target itself (verify_const, trusting its deps) instead of re-checking its whole transitive closure (verify_claim)." + "skip-deps"; "Check only each target itself (verify_const, trusting its deps) instead of re-checking its whole transitive closure (verify_claim). Same flag as `zisk-host --skip-deps`." + "execute-only"; "Execute only (Phase 1: constants / fft-cost / execute-time) and skip proving. The fast per-PR `execute`-mode signal." texray; "Force the tracing-texray timeline + RAM breakdown on (per-prove spans on stderr)." "no-texray"; "Force the breakdown off. Default: on iff --json was not given." diff --git a/Benchmarks/Vectors.csv b/Benchmarks/Vectors.csv new file mode 100644 index 00000000..43adb520 --- /dev/null +++ b/Benchmarks/Vectors.csv @@ -0,0 +1,93 @@ +# Benchmark constant vectors -- single shared source of truth. +# Consumed identically by Aiur (bench-typecheck --manifest), the Zisk/SP1 hosts +# (--constant loop), and CI shell (awk). One row per curated constant. +# +# Provenance: measured across this box's benchmarking dirs, de-duplicated -- +# aiur_fft / env : ~/ix-aiur/Benchmarks/Statistics/data/aiur/cost.csv +# zisk_cycles : ~/ix-aiur/Benchmarks/Statistics/data/zisk/single_shard.csv +# 27 IxVM kernel-primitive constants from cost.csv are intentionally excluded +# (not Init/Std/Mathlib library constants). +# +# Columns: +# name fully-qualified Lean name (resolves via NameResolve.resolveIxeAddr). +# env compile target / .ixe it resolves in: initStd | lean | mathlib. +# tier cheap = prove-feasible per-PR; heavy = execute-only / sharded. +# Boundary: Aiur fft >= 1e9 => heavy. +# shard_target 1 = heavy constant designated as a multi-shard prove target. +# aiur_fft measured Aiur fft-cost (proving-cost proxy). Informational. +# zisk_cycles measured Zisk ziskemu step count; empty if unmeasured. Informational. +# +# CI filters on the env column (not line number), so these '#' lines and the +# header are skipped by: awk -F, '$1!~/^#/ && $1!="name" && $2==env ...' +name,env,tier,shard_target,aiur_fft,zisk_cycles +HEq,initStd,cheap,0,1716582, +Nat,initStd,cheap,0,1857523,975244 +Eq.rec,initStd,cheap,0,2575400,2348520 +HEq.rec,initStd,cheap,0,2692988,2727278 +Trans.mk,initStd,cheap,0,2911629,7229214 +Array.toList,initStd,cheap,0,3332563,2580844 +Acc.rec,initStd,cheap,0,3505064,5105888 +Std.Time.Month.Offset.ofNat,initStd,cheap,0,3607673,1493508 +Sum.elim,initStd,cheap,0,5589905,6618130 +Prod.map,initStd,cheap,0,6904183,8177456 +Option.bind,initStd,cheap,0,7329183,7440608 +Except.bind,initStd,cheap,0,7667869,9427477 +WellFounded.fix,initStd,cheap,0,10125144,13415585 +Nat.add,initStd,cheap,0,13343000,10606339 +List.foldr,initStd,cheap,0,18579757,16707100 +List.dropLast,initStd,cheap,0,19509718,17522863 +List.range,initStd,cheap,0,20251801,13666491 +List.zipWith,initStd,cheap,0,20439088,20229977 +List.filterMap,initStd,cheap,0,25335779,21435279 +List.foldlM,initStd,cheap,0,39202740, +Int.add,initStd,cheap,0,44714703,27635032 +BitVec.toFin,initStd,cheap,0,50437466,28681028 +Nat.add_comm,initStd,cheap,0,56084908,53239676 +UInt32.toNat,initStd,cheap,0,59331806,29980254 +USize.toNat,initStd,cheap,0,71607481,35811906 +Nat.decEq,initStd,cheap,0,71921625,57411966 +ByteSlice.ofByteArray,initStd,cheap,0,107574377,53555107 +Nat.decLe,initStd,cheap,0,209641496,143391161 +Nat.strongRecOn,initStd,cheap,0,273068854,190849758 +Int.emod,initStd,cheap,0,422940733,269380418 +Int.ediv,initStd,cheap,0,430476738,270987292 +Array.foldlM,initStd,cheap,0,434577494, +Array.foldl,initStd,cheap,0,449323126,278537034 +Array.filter,initStd,cheap,0,464818232,285847515 +Nat.sub_le_of_le_add,initStd,cheap,0,567575653,373184538 +BitVec.add,initStd,cheap,0,617113462,373772656 +Int.gcd,initStd,cheap,0,657502637,409112011 +Nat.toDigits,initStd,cheap,0,663606297,357145741 +Array.map,initStd,cheap,0,734574964,443199245 +Array.zipWith,initStd,cheap,0,736658636,445195121 +String.Internal.append,initStd,cheap,0,793580333, +Lean.Name.hash,initStd,cheap,0,861742653,447441591 +BitVec.umod,initStd,cheap,0,926177790,526467117 +Nat.repr,initStd,cheap,0,966452765,498729913 +Int.repr,initStd,cheap,0,993792541,504234535 +String.intercalate,initStd,heavy,0,1089240518,599428829 +_private.Init.Prelude.0.Lean.extractMainModule._unsafe_rec,initStd,heavy,0,1197925029, +Char.toLower,initStd,heavy,0,1198467414,665920824 +Nat.gcd_comm,initStd,heavy,0,1954958779,1144352360 +Int.emod_emod_of_dvd,initStd,heavy,0,3856852693,2201588182 +Array.append_assoc,initStd,heavy,0,3938574533,1570256148 +Vector.append,initStd,heavy,0,4023268168,1614275115 +Fin.foldl,initStd,heavy,0,10853255199,5110854190 +List.mergeSort,initStd,heavy,1,13825318985,6706906294 +Array.binSearch,initStd,heavy,1,14397133548,6785827470 +Array.qsort,initStd,heavy,0,15781689533,7199288749 +Array.qsortOrd,initStd,heavy,0,15841062472,7206704674 +String.split,initStd,heavy,0,19578088286,8657387499 +Std.Time.Week.Offset.ofMilliseconds,initStd,heavy,0,24577209792,6653972854 +Vector.extract_append,initStd,heavy,1,61830646478, +Lean.Expr.replace,lean,cheap,0,859625514, +List.Sorted,mathlib,cheap,0,9578666, +Nat.choose,mathlib,cheap,0,29018862, +Nat.factorial,mathlib,cheap,0,33562426, +Nat.fib,mathlib,cheap,0,34171209, +GCDMonoid.gcd,mathlib,heavy,0,1005736276, +Nat.Prime.two_le,mathlib,heavy,0,1504045298, +Finset.prod,mathlib,heavy,0,3045165822, +Finset.sum,mathlib,heavy,0,3045189408, +Polynomial.eval,mathlib,heavy,0,5342731754, +Multiset.sort,mathlib,heavy,1,18670960624, From 457ff2e6824de5610006a8694e4a91909f029073 Mon Sep 17 00:00:00 2001 From: samuelburnham <45365069+samuelburnham@users.noreply.github.com> Date: Tue, 30 Jun 2026 18:55:19 +0000 Subject: [PATCH 03/27] test(ci): trigger bench-pr.yml on pull_request (TEMPORARY) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit issue_comment workflows only run from the default branch, so the `!benchmark` path can't be exercised on a PR branch. Add a pull_request trigger (base/head from the PR payload; empty comment → parser defaults) to test pre-merge. Revert before merge — delete the `pull_request:` trigger; the dual base/head resolution and `|| pull_request.number` fallbacks are harmless to keep. --- .github/workflows/bench-pr.yml | 48 ++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index c623d432..f162e847 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -17,6 +17,12 @@ name: Benchmark pull requests on: issue_comment: types: [created] + # TEST: temporary trigger so this workflow can be exercised from a PR branch. + # issue_comment only fires once the workflow is on the default branch, so a + # `!benchmark` comment can't test it pre-merge. On pull_request there is no + # comment, so the run uses the parser defaults (aiur / initStd / execute). + # Remove this trigger before merge. + pull_request: permissions: contents: read @@ -24,18 +30,19 @@ permissions: pull-requests: read concurrency: - group: ${{ github.workflow }}-${{ github.event.issue.number }} + group: ${{ github.workflow }}-${{ github.event.issue.number || github.event.pull_request.number }} cancel-in-progress: true jobs: setup: name: Parse !benchmark comment if: >- - github.event.issue.pull_request && - github.event.issue.state == 'open' && - contains(github.event.comment.body, '!benchmark') && - (github.event.comment.author_association == 'MEMBER' || - github.event.comment.author_association == 'OWNER') + github.event_name == 'pull_request' || + (github.event.issue.pull_request && + github.event.issue.state == 'open' && + contains(github.event.comment.body, '!benchmark') && + (github.event.comment.author_association == 'MEMBER' || + github.event.comment.author_association == 'OWNER')) runs-on: ubuntu-latest outputs: matrix: ${{ steps.parse.outputs.matrix }} @@ -44,16 +51,29 @@ jobs: shard: ${{ steps.parse.outputs.shard }} passthrough-env: ${{ steps.parse.outputs.passthrough-env }} config-summary: ${{ steps.parse.outputs.config-summary }} - base-sha: ${{ steps.comment-branch.outputs.base_sha }} - head-sha: ${{ steps.comment-branch.outputs.head_sha }} + base-sha: ${{ steps.shas.outputs.base }} + head-sha: ${{ steps.shas.outputs.head }} steps: - uses: actions/checkout@v6 - # Resolve the PR's base/head SHAs from the comment's issue. - - uses: xt0rted/pull-request-comment-branch@v3 + # issue_comment exposes the PR's base/head only via this action; the + # pull_request event carries them on the payload directly. + - if: github.event_name == 'issue_comment' + uses: xt0rted/pull-request-comment-branch@v3 id: comment-branch - # Parse the comment from an env var (never inline-interpolated) → matrix + - # config. The allowlist drops anything that isn't a known flag/env key. - - name: Parse comment + - name: Resolve base/head SHAs + id: shas + run: | + if [ "${{ github.event_name }}" = pull_request ]; then + echo "base=${{ github.event.pull_request.base.sha }}" >> "$GITHUB_OUTPUT" + echo "head=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT" + else + echo "base=${{ steps.comment-branch.outputs.base_sha }}" >> "$GITHUB_OUTPUT" + echo "head=${{ steps.comment-branch.outputs.head_sha }}" >> "$GITHUB_OUTPUT" + fi + # Parse the !benchmark command from an env var (never inline-interpolated). + # The allowlist drops anything that isn't a known flag/env key; an empty + # body (pull_request) yields the parser defaults. + - name: Parse command id: parse env: COMMENT_BODY: ${{ github.event.comment.body }} @@ -239,5 +259,5 @@ jobs: uses: peter-evans/create-or-update-comment@v5 with: token: ${{ steps.app-token.outputs.token }} - issue-number: ${{ github.event.issue.number }} + issue-number: ${{ github.event.issue.number || github.event.pull_request.number }} body-path: comment-body.md From 2937a9d900abf8d86f1e45c90d7ec268cd167ee4 Mon Sep 17 00:00:00 2001 From: samuelburnham <45365069+samuelburnham@users.noreply.github.com> Date: Tue, 30 Jun 2026 19:52:12 +0000 Subject: [PATCH 04/27] feat(ci): default to ~11 primary constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The full curated set (~60 InitStd) is too slow to run on every !benchmark and on the bencher prove/zkVM jobs. Add a `primary` column to Vectors.csv marking 11 constants spanning shape (nat/list/array/int/string/vector/multiset, defs + proofs) and the cheap→heavy cost range (incl. 3 shard targets), and make it the default: - bench.py manifest --primary; parse honors BENCH_FULL (run the full set). - bench-pr.yml: !benchmark defaults to the primary subset; BENCH_FULL=1 runs the whole curated set. - bench-main.yml: prove + zkvm-execute derive constants from `manifest --primary` (replacing the hardcoded lists), so all backends bench the same set from the one source of truth. The tier filter keeps prove on the cheap primaries (heavy ones would OOM a single-shard prove); execute/native get the heavy ones for scale. --- .github/scripts/bench.py | 14 ++- .github/workflows/bench-main.yml | 24 +++-- .github/workflows/bench-pr.yml | 9 +- Benchmarks/Vectors.csv | 149 ++++++++++++++++--------------- 4 files changed, 110 insertions(+), 86 deletions(-) diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py index c27582c9..4622b9f4 100644 --- a/.github/scripts/bench.py +++ b/.github/scripts/bench.py @@ -21,7 +21,7 @@ BACKENDS = ("aiur", "zisk", "sp1") MODES = ("execute", "prove") ENVS = ("initStd", "lean", "mathlib") -CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_GPU"} +CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_GPU", "BENCH_FULL"} PASSTHROUGH_KEYS = {"RUST_LOG", "WITHOUT_VK_VERIFICATION", "RUSTFLAGS"} @@ -70,6 +70,7 @@ def cmd_parse(_a): if tier not in ("cheap", "heavy", "all"): tier = "" # empty ⇒ derived from mode at manifest time shard = "1" if cfg.get("BENCH_SHARD") == "1" else "0" + full = "1" if cfg.get("BENCH_FULL") == "1" else "0" # full set vs primary subset gpu = cfg.get("BENCH_GPU") == "1" cells = [] @@ -80,14 +81,14 @@ def cmd_parse(_a): "skip": "true" if skip else "false", "label": f"{b}-{e}-{mode}"}) summary = (f"backends: `{' '.join(backends)}` · mode: `{mode}` · " - f"envs: `{','.join(envs)}` · tier: `{tier or 'auto'}` · " - f"shard: `{shard}` · gpu: `{int(gpu)}`") + f"envs: `{','.join(envs)}` · set: `{'full' if full == '1' else 'primary'}` · " + f"tier: `{tier or 'auto'}` · shard: `{shard}` · gpu: `{int(gpu)}`") if passthrough: summary += " · env: `" + " ".join(passthrough) + "`" with open(os.environ.get("GITHUB_OUTPUT", "/dev/stdout"), "a") as f: f.write(f"matrix={json.dumps(cells)}\n") - f.write(f"mode={mode}\ntier={tier}\nshard={shard}\n") + f.write(f"mode={mode}\ntier={tier}\nshard={shard}\nfull={full}\n") f.write(f"config-summary={summary}\n") f.write("passthrough-env<= 5 else "0" if env != a.env: continue + if a.primary and rep != "1": + continue if tier in ("cheap", "heavy") and ctier != tier: continue if a.shard == "1" and shard != "1": @@ -245,6 +249,8 @@ def main(): m.add_argument("--csv", required=True); m.add_argument("--env", required=True) m.add_argument("--mode", required=True); m.add_argument("--tier", default="") m.add_argument("--shard", default="0"); m.add_argument("--out", required=True) + m.add_argument("--primary", action="store_true", + help="Restrict to the primary subset (the primary=1 column).") m.set_defaults(fn=cmd_manifest) c = sub.add_parser("compare") diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml index 734bf787..5ddda359 100644 --- a/.github/workflows/bench-main.yml +++ b/.github/workflows/bench-main.yml @@ -178,11 +178,7 @@ jobs: strategy: fail-fast: false matrix: - include: - - bench: InitStd - consts: Nat.add_comm Nat.sub_le_of_le_add String.append Array.append_assoc - - bench: Mathlib - consts: Nat.factorial Nat.Coprime Nat.Prime.two_le + bench: [InitStd, Mathlib] steps: - uses: actions/checkout@v6 with: @@ -230,7 +226,14 @@ jobs: "res-$c.json" > "res-$c.json.tmp" && mv "res-$c.json.tmp" "res-$c.json" || true fi } - for c in ${{ matrix.consts }}; do measure "$c"; done + # The primary subset's prove tier (cheap primaries) — heavy primaries are + # execute-only (a single-shard prove would OOM the runner). + benv="${{ matrix.bench }}"; benv="${benv,}" # InitStd→initStd, Mathlib→mathlib + python3 .github/scripts/bench.py manifest \ + --csv Benchmarks/Vectors.csv --env "$benv" --mode prove --primary \ + --out names.txt + echo "proving $(wc -l < names.txt) primary constants:"; cat names.txt + while IFS= read -r c; do [ -n "$c" ] && measure "$c"; done < names.txt # Merge the per-constant results; if none produced anything, emit `{}`. jq -s 'reduce .[] as $o ({}; . + $o)' res-*.json > results.json 2>/dev/null \ || echo '{}' > results.json @@ -293,7 +296,6 @@ jobs: backend: [zisk, sp1] include: - bench: InitStd - consts: Nat.add_comm Nat.sub_le_of_le_add String.append Array.append_assoc steps: - uses: actions/checkout@v6 with: @@ -318,7 +320,13 @@ jobs: env: REUSE_IXE: "1" run: | - printf '%s\n' ${{ matrix.consts }} > names.txt + # The primary subset in execute mode = all primaries for the env + # (cheap + heavy); execute handles the heavy ones, unlike prove. + benv="${{ matrix.bench }}"; benv="${benv,}" # InitStd→initStd + python3 .github/scripts/bench.py manifest \ + --csv Benchmarks/Vectors.csv --env "$benv" --mode execute --primary \ + --out names.txt + echo "executing $(wc -l < names.txt) primary constants:"; cat names.txt bash .github/scripts/run.sh . ${{ matrix.bench }} ${{ matrix.backend }} execute \ names.txt neutral.json # Wrap neutral { name: { metric: v } } → Bencher Metric Format. diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index f162e847..9f75a308 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -3,6 +3,7 @@ # # !benchmark [aiur] [zisk] [sp1 | all] [execute|prove] # BENCH_ENVS=initStd,mathlib # which compiled envs (default initStd) +# BENCH_FULL=1 # run the full curated set, not the ~11 primary # BENCH_TIER=cheap|heavy|all # override the mode default (execute=all, prove=cheap) # BENCH_SHARD=1 # restrict to the multi-shard target constants # BENCH_GPU=1 # allow zkVM prove on a self-hosted GPU runner @@ -49,6 +50,7 @@ jobs: mode: ${{ steps.parse.outputs.mode }} tier: ${{ steps.parse.outputs.tier }} shard: ${{ steps.parse.outputs.shard }} + full: ${{ steps.parse.outputs.full }} passthrough-env: ${{ steps.parse.outputs.passthrough-env }} config-summary: ${{ steps.parse.outputs.config-summary }} base-sha: ${{ steps.shas.outputs.base }} @@ -96,6 +98,7 @@ jobs: HEAD_SHA: ${{ needs.setup.outputs.head-sha }} TIER: ${{ needs.setup.outputs.tier }} SHARD: ${{ needs.setup.outputs.shard }} + FULL: ${{ needs.setup.outputs.full }} steps: # ---------- skipped cell (zkVM prove without a GPU runner) ---------- - name: Skip note @@ -126,14 +129,16 @@ jobs: PTENV # Select the constants for this cell → names.txt; emit count/vhash/tier # (vhash is part of the main cache key, so editing Vectors.csv invalidates - # stale main results). + # stale main results). Defaults to the primary subset; BENCH_FULL=1 + # (→ full=1) runs the whole curated set. - name: Select constants from Benchmarks/Vectors.csv if: matrix.cell.skip != 'true' id: man run: | + PRIMARY=--primary; [ "$FULL" = 1 ] && PRIMARY= python3 .github/scripts/bench.py manifest \ --csv Benchmarks/Vectors.csv --env "$BENV" --mode "$MODE" \ - --tier "$TIER" --shard "$SHARD" --out "$GITHUB_WORKSPACE/names.txt" \ + --tier "$TIER" --shard "$SHARD" $PRIMARY --out "$GITHUB_WORKSPACE/names.txt" \ | tee -a "$GITHUB_OUTPUT" # Lean toolchain + build `ix` and `bench-typecheck` for the PR side (every diff --git a/Benchmarks/Vectors.csv b/Benchmarks/Vectors.csv index 43adb520..65fe4dcf 100644 --- a/Benchmarks/Vectors.csv +++ b/Benchmarks/Vectors.csv @@ -14,80 +14,85 @@ # tier cheap = prove-feasible per-PR; heavy = execute-only / sharded. # Boundary: Aiur fft >= 1e9 => heavy. # shard_target 1 = heavy constant designated as a multi-shard prove target. +# primary 1 = part of the ~11-constant primary subset spanning +# shape + the cheap->heavy cost range. Default for the !benchmark +# PR comment and the bencher prove / zkVM jobs (full set via +# BENCH_FULL=1). Heavy reps run only in execute/native; prove +# mode keeps the cheap primaries (tier filter). # aiur_fft measured Aiur fft-cost (proving-cost proxy). Informational. # zisk_cycles measured Zisk ziskemu step count; empty if unmeasured. Informational. # # CI filters on the env column (not line number), so these '#' lines and the # header are skipped by: awk -F, '$1!~/^#/ && $1!="name" && $2==env ...' -name,env,tier,shard_target,aiur_fft,zisk_cycles -HEq,initStd,cheap,0,1716582, -Nat,initStd,cheap,0,1857523,975244 -Eq.rec,initStd,cheap,0,2575400,2348520 -HEq.rec,initStd,cheap,0,2692988,2727278 -Trans.mk,initStd,cheap,0,2911629,7229214 -Array.toList,initStd,cheap,0,3332563,2580844 -Acc.rec,initStd,cheap,0,3505064,5105888 -Std.Time.Month.Offset.ofNat,initStd,cheap,0,3607673,1493508 -Sum.elim,initStd,cheap,0,5589905,6618130 -Prod.map,initStd,cheap,0,6904183,8177456 -Option.bind,initStd,cheap,0,7329183,7440608 -Except.bind,initStd,cheap,0,7667869,9427477 -WellFounded.fix,initStd,cheap,0,10125144,13415585 -Nat.add,initStd,cheap,0,13343000,10606339 -List.foldr,initStd,cheap,0,18579757,16707100 -List.dropLast,initStd,cheap,0,19509718,17522863 -List.range,initStd,cheap,0,20251801,13666491 -List.zipWith,initStd,cheap,0,20439088,20229977 -List.filterMap,initStd,cheap,0,25335779,21435279 -List.foldlM,initStd,cheap,0,39202740, -Int.add,initStd,cheap,0,44714703,27635032 -BitVec.toFin,initStd,cheap,0,50437466,28681028 -Nat.add_comm,initStd,cheap,0,56084908,53239676 -UInt32.toNat,initStd,cheap,0,59331806,29980254 -USize.toNat,initStd,cheap,0,71607481,35811906 -Nat.decEq,initStd,cheap,0,71921625,57411966 -ByteSlice.ofByteArray,initStd,cheap,0,107574377,53555107 -Nat.decLe,initStd,cheap,0,209641496,143391161 -Nat.strongRecOn,initStd,cheap,0,273068854,190849758 -Int.emod,initStd,cheap,0,422940733,269380418 -Int.ediv,initStd,cheap,0,430476738,270987292 -Array.foldlM,initStd,cheap,0,434577494, -Array.foldl,initStd,cheap,0,449323126,278537034 -Array.filter,initStd,cheap,0,464818232,285847515 -Nat.sub_le_of_le_add,initStd,cheap,0,567575653,373184538 -BitVec.add,initStd,cheap,0,617113462,373772656 -Int.gcd,initStd,cheap,0,657502637,409112011 -Nat.toDigits,initStd,cheap,0,663606297,357145741 -Array.map,initStd,cheap,0,734574964,443199245 -Array.zipWith,initStd,cheap,0,736658636,445195121 -String.Internal.append,initStd,cheap,0,793580333, -Lean.Name.hash,initStd,cheap,0,861742653,447441591 -BitVec.umod,initStd,cheap,0,926177790,526467117 -Nat.repr,initStd,cheap,0,966452765,498729913 -Int.repr,initStd,cheap,0,993792541,504234535 -String.intercalate,initStd,heavy,0,1089240518,599428829 -_private.Init.Prelude.0.Lean.extractMainModule._unsafe_rec,initStd,heavy,0,1197925029, -Char.toLower,initStd,heavy,0,1198467414,665920824 -Nat.gcd_comm,initStd,heavy,0,1954958779,1144352360 -Int.emod_emod_of_dvd,initStd,heavy,0,3856852693,2201588182 -Array.append_assoc,initStd,heavy,0,3938574533,1570256148 -Vector.append,initStd,heavy,0,4023268168,1614275115 -Fin.foldl,initStd,heavy,0,10853255199,5110854190 -List.mergeSort,initStd,heavy,1,13825318985,6706906294 -Array.binSearch,initStd,heavy,1,14397133548,6785827470 -Array.qsort,initStd,heavy,0,15781689533,7199288749 -Array.qsortOrd,initStd,heavy,0,15841062472,7206704674 -String.split,initStd,heavy,0,19578088286,8657387499 -Std.Time.Week.Offset.ofMilliseconds,initStd,heavy,0,24577209792,6653972854 -Vector.extract_append,initStd,heavy,1,61830646478, -Lean.Expr.replace,lean,cheap,0,859625514, -List.Sorted,mathlib,cheap,0,9578666, -Nat.choose,mathlib,cheap,0,29018862, -Nat.factorial,mathlib,cheap,0,33562426, -Nat.fib,mathlib,cheap,0,34171209, -GCDMonoid.gcd,mathlib,heavy,0,1005736276, -Nat.Prime.two_le,mathlib,heavy,0,1504045298, -Finset.prod,mathlib,heavy,0,3045165822, -Finset.sum,mathlib,heavy,0,3045189408, -Polynomial.eval,mathlib,heavy,0,5342731754, -Multiset.sort,mathlib,heavy,1,18670960624, +name,env,tier,shard_target,primary,aiur_fft,zisk_cycles +HEq,initStd,cheap,0,0,1716582, +Nat,initStd,cheap,0,0,1857523,975244 +Eq.rec,initStd,cheap,0,0,2575400,2348520 +HEq.rec,initStd,cheap,0,0,2692988,2727278 +Trans.mk,initStd,cheap,0,0,2911629,7229214 +Array.toList,initStd,cheap,0,0,3332563,2580844 +Acc.rec,initStd,cheap,0,0,3505064,5105888 +Std.Time.Month.Offset.ofNat,initStd,cheap,0,0,3607673,1493508 +Sum.elim,initStd,cheap,0,0,5589905,6618130 +Prod.map,initStd,cheap,0,0,6904183,8177456 +Option.bind,initStd,cheap,0,0,7329183,7440608 +Except.bind,initStd,cheap,0,0,7667869,9427477 +WellFounded.fix,initStd,cheap,0,0,10125144,13415585 +Nat.add,initStd,cheap,0,0,13343000,10606339 +List.foldr,initStd,cheap,0,1,18579757,16707100 +List.dropLast,initStd,cheap,0,0,19509718,17522863 +List.range,initStd,cheap,0,0,20251801,13666491 +List.zipWith,initStd,cheap,0,0,20439088,20229977 +List.filterMap,initStd,cheap,0,0,25335779,21435279 +List.foldlM,initStd,cheap,0,0,39202740, +Int.add,initStd,cheap,0,0,44714703,27635032 +BitVec.toFin,initStd,cheap,0,0,50437466,28681028 +Nat.add_comm,initStd,cheap,0,1,56084908,53239676 +UInt32.toNat,initStd,cheap,0,0,59331806,29980254 +USize.toNat,initStd,cheap,0,0,71607481,35811906 +Nat.decEq,initStd,cheap,0,0,71921625,57411966 +ByteSlice.ofByteArray,initStd,cheap,0,0,107574377,53555107 +Nat.decLe,initStd,cheap,0,0,209641496,143391161 +Nat.strongRecOn,initStd,cheap,0,0,273068854,190849758 +Int.emod,initStd,cheap,0,0,422940733,269380418 +Int.ediv,initStd,cheap,0,0,430476738,270987292 +Array.foldlM,initStd,cheap,0,0,434577494, +Array.foldl,initStd,cheap,0,1,449323126,278537034 +Array.filter,initStd,cheap,0,0,464818232,285847515 +Nat.sub_le_of_le_add,initStd,cheap,0,0,567575653,373184538 +BitVec.add,initStd,cheap,0,0,617113462,373772656 +Int.gcd,initStd,cheap,0,1,657502637,409112011 +Nat.toDigits,initStd,cheap,0,0,663606297,357145741 +Array.map,initStd,cheap,0,0,734574964,443199245 +Array.zipWith,initStd,cheap,0,0,736658636,445195121 +String.Internal.append,initStd,cheap,0,1,793580333, +Lean.Name.hash,initStd,cheap,0,0,861742653,447441591 +BitVec.umod,initStd,cheap,0,0,926177790,526467117 +Nat.repr,initStd,cheap,0,0,966452765,498729913 +Int.repr,initStd,cheap,0,0,993792541,504234535 +String.intercalate,initStd,heavy,0,0,1089240518,599428829 +_private.Init.Prelude.0.Lean.extractMainModule._unsafe_rec,initStd,heavy,0,0,1197925029, +Char.toLower,initStd,heavy,0,0,1198467414,665920824 +Nat.gcd_comm,initStd,heavy,0,1,1954958779,1144352360 +Int.emod_emod_of_dvd,initStd,heavy,0,0,3856852693,2201588182 +Array.append_assoc,initStd,heavy,0,0,3938574533,1570256148 +Vector.append,initStd,heavy,0,0,4023268168,1614275115 +Fin.foldl,initStd,heavy,0,0,10853255199,5110854190 +List.mergeSort,initStd,heavy,1,1,13825318985,6706906294 +Array.binSearch,initStd,heavy,1,0,14397133548,6785827470 +Array.qsort,initStd,heavy,0,0,15781689533,7199288749 +Array.qsortOrd,initStd,heavy,0,0,15841062472,7206704674 +String.split,initStd,heavy,0,1,19578088286,8657387499 +Std.Time.Week.Offset.ofMilliseconds,initStd,heavy,0,0,24577209792,6653972854 +Vector.extract_append,initStd,heavy,1,1,61830646478, +Lean.Expr.replace,lean,cheap,0,0,859625514, +List.Sorted,mathlib,cheap,0,0,9578666, +Nat.choose,mathlib,cheap,0,0,29018862, +Nat.factorial,mathlib,cheap,0,1,33562426, +Nat.fib,mathlib,cheap,0,0,34171209, +GCDMonoid.gcd,mathlib,heavy,0,0,1005736276, +Nat.Prime.two_le,mathlib,heavy,0,0,1504045298, +Finset.prod,mathlib,heavy,0,0,3045165822, +Finset.sum,mathlib,heavy,0,0,3045189408, +Polynomial.eval,mathlib,heavy,0,0,5342731754, +Multiset.sort,mathlib,heavy,1,1,18670960624, From efd91aa1dbf50f39a48770ed92c48574a139b6b0 Mon Sep 17 00:00:00 2001 From: samuelburnham <45365069+samuelburnham@users.noreply.github.com> Date: Tue, 30 Jun 2026 20:53:00 +0000 Subject: [PATCH 05/27] feat(ci): native backend on !benchmark + bencher parity/cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bring both surfaces (!benchmark PR comment and bencher.dev) to parity across all four backends and remove the test scaffolding. - Native kernel on !benchmark: add a `native` backend (whole-env `ix check --anon`, the parallel out-of-circuit kernel) — bench.py backend/runner/metrics, a run.sh `native` branch, and a GNU-time install for native PR cells. `all` now fans out aiur/zisk/sp1/native. - run.sh: single-source the Aiur path through a per-constant bench-typecheck loop (per-constant peak-rss), add a per-constant `timeout` to the zkVM execute path (heavy primaries can't hang a job), and accept the env arg case-insensitively (bencher reuses the cached InitStd.ixe; bench-pr compiles initStd.ixe). - bench-main.yml: the prove and native-check jobs now drive run.sh too (dedup with the PR path); zkvm-execute gains the Mathlib env to match prove/native. - bench-thresholds-reset.yml: register the zisk / sp1 / native-check workloads. - bench-pr.yml: drop the temporary pull_request trigger (keep the harmless dual-SHA / number fallbacks). - docs/benchmarking.md: document the two surfaces, backends, Vectors.csv, the !benchmark grammar, and the bencher workloads / threshold resets. --- .github/scripts/bench.py | 7 +- .github/scripts/run.sh | 99 +++++++++++++------ .github/workflows/bench-main.yml | 68 ++++--------- .github/workflows/bench-pr.yml | 27 +++-- .../workflows/bencher-thresholds-reset.yml | 13 +-- docs/benchmarking.md | 70 +++++++++++++ 6 files changed, 183 insertions(+), 101 deletions(-) create mode 100644 docs/benchmarking.md diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py index 4622b9f4..018a1b89 100644 --- a/.github/scripts/bench.py +++ b/.github/scripts/bench.py @@ -18,7 +18,7 @@ # ───────────────────────── parse ───────────────────────── -BACKENDS = ("aiur", "zisk", "sp1") +BACKENDS = ("aiur", "zisk", "sp1", "native") MODES = ("execute", "prove") ENVS = ("initStd", "lean", "mathlib") CONFIG_KEYS = {"BENCH_ENVS", "BENCH_TIER", "BENCH_SHARD", "BENCH_GPU", "BENCH_FULL"} @@ -29,6 +29,8 @@ def runner_for(backend, mode, gpu): """(runs-on label, skip?) for a cell.""" if backend == "aiur": return "warp-ubuntu-latest-x64-32x", False + if backend == "native": # whole-env parallel check; no proving, never skips + return "warp-ubuntu-latest-x64-32x", False if mode == "execute": return "warp-ubuntu-latest-x64-16x", False if gpu: # zkVM prove needs a GPU @@ -132,6 +134,9 @@ def cmd_manifest(a): ("zisk", "execute"): ["cycles", "execute-time", "throughput", "peak-rss"], ("sp1", "execute"): ["cycles", "execute-time", "throughput", "peak-rss"], ("zisk", "prove"): ["prove-time", "steps"], ("sp1", "prove"): ["prove-time", "steps"], + # native is whole-env (one row per env); mode is ignored (it never proves). + ("native", "execute"): ["throughput", "check-time", "peak-rss"], + ("native", "prove"): ["throughput", "check-time", "peak-rss"], } diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh index 045a1f4f..165cbc69 100644 --- a/.github/scripts/run.sh +++ b/.github/scripts/run.sh @@ -1,19 +1,20 @@ #!/usr/bin/env bash -# Compile one library env to a `.ixe` from a checked-out repo, then benchmark the -# given backend over a manifest, emitting the neutral results JSON +# Compile one library env to a `.ixe` from a checked-out repo (unless REUSE_IXE), +# then benchmark the given backend, emitting the neutral results JSON # { "": { "": , ... }, ... } -# that bench.py compare consumes. +# that bench.py compare / the bencher jobs consume. # # run.sh # repo_dir : checked-out worktree (has .lake/build/bin/{ix,bench-typecheck}) # env : initStd | lean | mathlib -# backend : aiur | zisk | sp1 +# backend : aiur | zisk | sp1 | native # mode : execute | prove # -# `ix` / `bench-typecheck` are taken from (so base measures base's -# code, PR the PR's — the caller puts /.lake/build/bin on PATH). The -# zkVM hosts run from /. Per-constant subprocesses for the -# zkVM hosts so one OOM/timeout drops only that row. +# `ix` / `bench-typecheck` come from (so base measures base's code, PR +# the PR's — the caller puts /.lake/build/bin on PATH). aiur / zisk / +# sp1 run one subprocess per constant so a failure/timeout drops only that row; +# native is whole-env (`ix check --anon`, the parallel out-of-circuit kernel) and +# ignores , keyed by the env. set -uo pipefail repo=${1:?repo_dir}; benv=${2:?env}; backend=${3:?backend}; mode=${4:?mode} @@ -24,8 +25,11 @@ repo=$(cd "$repo" && pwd) : > "$out" emit_empty() { [ -s "$out" ] || echo '{}' > "$out"; } -case "$benv" in - initStd) module=CompileInitStd ;; +# `$benv` is used verbatim for the `.ixe` filename (bench-pr compiles `initStd.ixe`; +# the bencher jobs reuse the compile job's cached `InitStd.ixe`), and lowercased +# only to pick the Compile module. +case "$(printf '%s' "$benv" | tr '[:upper:]' '[:lower:]')" in + initstd) module=CompileInitStd ;; lean) module=CompileLean ;; mathlib) module=CompileMathlib ;; *) echo "unknown env: $benv" >&2; exit 2 ;; @@ -40,32 +44,42 @@ else echo "::endgroup::" fi +tmp=$(mktemp -d) + case "$backend" in aiur) - # bench-typecheck runs Phase 1 (execute) always; Phase 2 (prove) unless - # --execute-only. It writes the neutral JSON itself via --json. - args=(--ixe "$ixe" --manifest "$names" --json "$out") - if [ "$mode" = execute ]; then - bench-typecheck "${args[@]}" --execute-only || true - else - bench-typecheck "${args[@]}" --texray 2> tx.log || true - # Fold texray's proving RSS high-water mark into every entry (max over - # spans; $2+0 stops at the first non-digit) — same parse as aiur-bench.yml. - rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>m {m=$2+0} END {if (m>0) print m}' tx.log) - if [ -n "${rss:-}" ] && [ -s "$out" ]; then - jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' "$out" > "$out.t" \ - && mv "$out.t" "$out" || true + # One bench-typecheck subprocess per constant (isolation + per-constant + # peak-rss). Phase 1 (execute) always runs; Phase 2 (prove) unless + # --execute-only. bench-typecheck writes the neutral per-constant JSON. + while IFS= read -r c; do + [ -z "$c" ] && continue + slug=$(printf '%s' "$c" | tr '/ .:' '____') + res="$tmp/$slug.json" + if [ "$mode" = execute ]; then + bench-typecheck --ixe "$ixe" "$c" --json "$res" --execute-only \ + || { echo "::warning::aiur execute '$c' failed; dropping"; continue; } + else + bench-typecheck --ixe "$ixe" "$c" --json "$res" --texray 2> "$tmp/$slug.tx" \ + || { echo "::warning::aiur prove '$c' failed (OOM/timeout); dropping"; continue; } + # Fold texray's proving RSS high-water mark (max over spans; awk's $2+0 + # stops at the first non-digit) into this constant's entry. + rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>m {m=$2+0} END {if (m>0) print m}' "$tmp/$slug.tx") + if [ -n "${rss:-}" ] && [ -s "$res" ]; then + jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' "$res" > "$res.t" \ + && mv "$res.t" "$res" || true + fi fi - fi + [ -s "$res" ] && cat "$res" + done < "$names" | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null emit_empty ;; zisk|sp1) host="${backend}-host"; work="$repo/$backend" - # Build the host once so per-constant timing excludes compilation, and run the - # binary directly under `/usr/bin/time` — a `timeout`/`cargo run` wrapper would - # report the wrapper's RSS, not the host's. (No per-constant timeout in execute - # mode; the job-level timeout bounds a hang.) + # Build the host once so per-constant timing excludes compilation. Order is + # `timeout … /usr/bin/time … host`: timeout bounds a runaway constant while + # /usr/bin/time still measures the host directly (its child), so RSS/elapsed + # are the host's, not a wrapper's. echo "::group::cargo build $host" ( cd "$work" && cargo build --quiet --release --bin "$host" ) echo "::endgroup::" @@ -76,16 +90,14 @@ case "$backend" in # in-session as root; the host children inherit it. Without this the ASM # services die with `mmap(rom) errno=11`. SP1 needs no such raise. [ "$backend" = zisk ] && sudo prlimit --pid $$ --memlock=unlimited:unlimited - tmp=$(mktemp -d) while IFS= read -r c; do [ -z "$c" ] && continue slug=$(printf '%s' "$c" | tr '/ .:' '____') log="$tmp/$slug.out"; tmf="$tmp/$slug.time" if [ "$mode" = execute ]; then - # `/usr/bin/time -f '%e %M'` → elapsed seconds + max RSS (kB). - ( cd "$work" && /usr/bin/time -f '%e %M' -o "$tmf" \ + ( cd "$work" && timeout 25m /usr/bin/time -f '%e %M' -o "$tmf" \ "$bin" --execute --ixe "$ixe" --constant "$c" --skip-deps ) > "$log" 2>>"$log" \ - || { echo "::warning::$backend execute '$c' failed; dropping"; continue; } + || { echo "::warning::$backend execute '$c' failed/timed out; dropping"; continue; } fail=$(grep -oE 'failures[:=] ?[0-9]+' "$log" | head -1 | grep -oE '[0-9]+') if [ "${fail:-1}" != 0 ]; then echo "::warning::$backend '$c': nonzero/missing failures; dropping"; continue @@ -130,6 +142,29 @@ case "$backend" in emit_empty ;; + native) + # Native out-of-circuit Rust kernel, whole env across available_parallelism + # workers — far faster than proving; ignores . ix check is a + # single multi-threaded process so /usr/bin/time -f '%M' is the true peak RSS. + # `##check## ` is the machine line. + log="$tmp/native.out"; tmf="$tmp/native.time" + /usr/bin/time -f '%e %M' -o "$tmf" ix check "$ixe" --anon > "$log" 2>>"$log" \ + || { echo "::warning::native check failed"; emit_empty; exit 0; } + line=$(grep '^##check##' "$log" | tail -1) + elapsed_ms=$(echo "$line" | awk '{print $2}') + failures=$(echo "$line" | awk '{print $4}'); total=$(echo "$line" | awk '{print $5}') + if [ -z "${total:-}" ] || [ "${failures:-1}" != 0 ]; then + echo "::warning::native check: nonzero/missing failures or no ##check## line"; emit_empty; exit 0 + fi + rssk=$(awk 'NR==1{print $2}' "$tmf"); rss=$(( ${rssk:-0} * 1024 )) + check_s=$(awk -v e="$elapsed_ms" 'BEGIN{printf "%.3f", e/1000}') + tput=$(awk -v t="$total" -v e="$elapsed_ms" 'BEGIN{ if (e>0) printf "%.2f", t*1000/e; else print 0 }') + jq -n --arg n "$benv" --argjson c "$total" --argjson s "$check_s" \ + --argjson tp "$tput" --argjson rss "$rss" \ + '{($n): {constants:$c, "check-time":$s, throughput:$tp, "peak-rss":$rss}}' > "$out" + emit_empty + ;; + *) echo "unknown backend: $backend" >&2; exit 2 ;; esac echo "rows in $out: $(jq 'length' "$out" 2>/dev/null || echo '?')" diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml index 5ddda359..ff6c8c4e 100644 --- a/.github/workflows/bench-main.yml +++ b/.github/workflows/bench-main.yml @@ -214,37 +214,20 @@ jobs: # fold the max over spans in as `peak-rss` (bytes), the proving RSS # high-water mark. - name: Run Aiur typecheck benchmark + env: + REUSE_IXE: "1" run: | - measure() { - local c="$1" rss - timeout 20m bench-typecheck --ixe ${{ matrix.bench }}.ixe "$c" \ - --json "res-$c.json" --texray 2>"tx-$c.log" \ - || echo "warning: $c failed (OOM/timeout); dropping it from this report" - rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>max {max=$2+0} END {if (max>0) print max}' "tx-$c.log") - if [ -f "res-$c.json" ] && [ -n "$rss" ] && [ "$rss" -gt 0 ]; then - jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' \ - "res-$c.json" > "res-$c.json.tmp" && mv "res-$c.json.tmp" "res-$c.json" || true - fi - } # The primary subset's prove tier (cheap primaries) — heavy primaries are - # execute-only (a single-shard prove would OOM the runner). + # execute-only (a single-shard prove would OOM the runner). run.sh drives + # bench-typecheck per constant (per-constant peak-rss), same as the + # !benchmark PR path. benv="${{ matrix.bench }}"; benv="${benv,}" # InitStd→initStd, Mathlib→mathlib python3 .github/scripts/bench.py manifest \ - --csv Benchmarks/Vectors.csv --env "$benv" --mode prove --primary \ - --out names.txt + --csv Benchmarks/Vectors.csv --env "$benv" --mode prove --primary --out names.txt echo "proving $(wc -l < names.txt) primary constants:"; cat names.txt - while IFS= read -r c; do [ -n "$c" ] && measure "$c"; done < names.txt - # Merge the per-constant results; if none produced anything, emit `{}`. - jq -s 'reduce .[] as $o ({}; . + $o)' res-*.json > results.json 2>/dev/null \ - || echo '{}' > results.json - [ -s results.json ] || echo '{}' > results.json - # Wrap each metric value as { "value": v } for Bencher Metric Format. - # bench-typecheck already emits slug keys (constants, fft-cost, - # execute-time, prove-time, throughput = constants/prove-time); peak-rss - # is injected above. - jq ' - map_values(to_entries | map({(.key): {value: .value}}) | add) - ' results.json > aiur.json + bash .github/scripts/run.sh . "${{ matrix.bench }}" aiur prove names.txt neutral.json + # Wrap neutral { name: { metric: v } } → Bencher Metric Format. + jq 'map_values(map_values({value: .}))' neutral.json > aiur.json cat aiur.json # Upload Aiur metrics. Every measure shares the per-workload baseline # window (data points since the aiur reset tag). constants is deterministic @@ -294,13 +277,12 @@ jobs: fail-fast: false matrix: backend: [zisk, sp1] - include: - - bench: InitStd + bench: [InitStd, Mathlib] steps: - uses: actions/checkout@v6 with: fetch-depth: 0 - fetch-tags: true # bencher-track reads the baseline-reset tag + fetch-tags: true # bencher-track reads the bencher-thresholds-reset tag - uses: actions-rust-lang/setup-rust-toolchain@v1 with: cache-workspaces: ${{ matrix.backend }} @@ -379,7 +361,7 @@ jobs: - uses: actions/checkout@v6 with: fetch-depth: 0 - fetch-tags: true # bencher-track reads the baseline-reset tag + fetch-tags: true # bencher-track reads the bencher-thresholds-reset tag - uses: actions/cache/restore@v5 with: path: ~/.local/bin @@ -400,26 +382,16 @@ jobs: path: ${{ matrix.bench }}.ixe key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }} fail-on-cache-miss: true - # `ix check --anon` checks every kernel-checkable address across - # available_parallelism workers and prints a machine-readable - # `##check## ` line. ix check is a - # single multi-threaded process, so `/usr/bin/time -f '%M'` is the true peak RSS. + # run.sh native runs `ix check --anon` (whole env, parallel) and emits the + # neutral { : { constants, check-time, throughput, peak-rss } } — same + # path as the !benchmark native backend. - name: Run native kernel check + env: + REUSE_IXE: "1" run: | - /usr/bin/time -f '%e %M' -o time.txt \ - ix check ${{ matrix.bench }}.ixe --anon 2>&1 | tee out.txt - line=$(grep '^##check##' out.txt | tail -1) - elapsed_ms=$(echo "$line" | awk '{print $2}') - failures=$(echo "$line" | awk '{print $4}') - total=$(echo "$line" | awk '{print $5}') - [ "${failures:-1}" = 0 ] || { echo "kernel check reported ${failures:-?} failure(s)"; exit 1; } - rssk=$(awk 'NR==1{print $2}' time.txt); rss=$(( ${rssk:-0} * 1024 )) - check_s=$(awk -v e="$elapsed_ms" 'BEGIN{printf "%.3f", e/1000}') - tput=$(awk -v t="$total" -v e="$elapsed_ms" 'BEGIN{ if (e>0) printf "%.2f", t*1000/e; else print 0 }') - jq -n --arg n "${{ matrix.bench }}" --argjson c "$total" --argjson s "$check_s" \ - --argjson tp "$tput" --argjson rss "$rss" \ - '{($n): {constants:{value:$c}, "check-time":{value:$s}, throughput:{value:$tp}, "peak-rss":{value:$rss}}}' \ - > bench.json + bash .github/scripts/run.sh . "${{ matrix.bench }}" native execute \ + /dev/null neutral.json + jq 'map_values(map_values({value: .}))' neutral.json > bench.json cat bench.json # constants is deterministic → pinned (0/0); check-time / throughput / # peak-rss are noisy parallel wall-clock → percentage bounds. diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index 9f75a308..682e063f 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -1,7 +1,7 @@ # `!benchmark` PR command: run the curated constant set (Benchmarks/Vectors.csv) # through chosen prover backend(s) and post a main-vs-PR comparison table. # -# !benchmark [aiur] [zisk] [sp1 | all] [execute|prove] +# !benchmark [aiur] [zisk] [sp1] [native | all] [execute|prove] # BENCH_ENVS=initStd,mathlib # which compiled envs (default initStd) # BENCH_FULL=1 # run the full curated set, not the ~11 primary # BENCH_TIER=cheap|heavy|all # override the mode default (execute=all, prove=cheap) @@ -18,12 +18,6 @@ name: Benchmark pull requests on: issue_comment: types: [created] - # TEST: temporary trigger so this workflow can be exercised from a PR branch. - # issue_comment only fires once the workflow is on the default branch, so a - # `!benchmark` comment can't test it pre-merge. On pull_request there is no - # comment, so the run uses the parser defaults (aiur / initStd / execute). - # Remove this trigger before merge. - pull_request: permissions: contents: read @@ -38,12 +32,11 @@ jobs: setup: name: Parse !benchmark comment if: >- - github.event_name == 'pull_request' || - (github.event.issue.pull_request && - github.event.issue.state == 'open' && - contains(github.event.comment.body, '!benchmark') && - (github.event.comment.author_association == 'MEMBER' || - github.event.comment.author_association == 'OWNER')) + github.event.issue.pull_request && + github.event.issue.state == 'open' && + contains(github.event.comment.body, '!benchmark') && + (github.event.comment.author_association == 'MEMBER' || + github.event.comment.author_association == 'OWNER') runs-on: ubuntu-latest outputs: matrix: ${{ steps.parse.outputs.matrix }} @@ -157,7 +150,7 @@ jobs: # zkVM cells additionally need the Rust toolchain + the backend's toolchain # and system deps (the shared composite install actions). - name: Set up zkVM Rust toolchain - if: matrix.cell.skip != 'true' && matrix.cell.backend != 'aiur' + if: matrix.cell.skip != 'true' && (matrix.cell.backend == 'zisk' || matrix.cell.backend == 'sp1') uses: actions-rust-lang/setup-rust-toolchain@v1 with: cache-workspaces: ${{ matrix.cell.backend }} @@ -167,6 +160,12 @@ jobs: - name: Install Zisk if: matrix.cell.skip != 'true' && matrix.cell.backend == 'zisk' uses: ./.github/actions/install-zisk + # native = whole-env `ix check --anon`; needs only GNU time (ix is built above). + - name: Install GNU time + if: matrix.cell.skip != 'true' && matrix.cell.backend == 'native' + run: | + sudo sed -i '/azure\.archive\.ubuntu\.com/d' /etc/apt/apt-mirrors.txt 2>/dev/null || true + sudo apt-get update && sudo apt-get install -y time # ---------- main side (cached by base SHA) ---------- - name: Restore cached main results diff --git a/.github/workflows/bencher-thresholds-reset.yml b/.github/workflows/bencher-thresholds-reset.yml index 37a0213c..d3c3eef5 100644 --- a/.github/workflows/bencher-thresholds-reset.yml +++ b/.github/workflows/bencher-thresholds-reset.yml @@ -17,8 +17,9 @@ name: Bencher thresholds reset # `bencher-thresholds-reset:` label on the PR, whatever added it — so a # Triage+ collaborator can queue a reset by applying the label directly, and # cancel by removing it before merge. Naming convention: one label per token, -# `bencher-thresholds-reset:` where is `ix-compile`, `aiur`, or -# `all` (the merge step expands an `all` label into every workload). Labeling +# `bencher-thresholds-reset:` where is `ix-compile`, `aiur`, +# `zisk`, `sp1`, `native-check`, or `all` (the merge step expands an `all` +# label into every workload). Labeling # requires Triage+, so PR authors from forks cannot self-queue a reset. The # label shares the command/workflow name; the git tag it moves is the same # stem with a dash: `bencher-thresholds-reset-`. @@ -36,7 +37,7 @@ on: description: Workload baseline to reset required: true type: choice - options: [ix-compile, aiur, all] + options: [ix-compile, aiur, zisk, sp1, native-check, all] sha: description: "Commit to anchor to (default: HEAD)" required: false @@ -65,7 +66,7 @@ jobs: MERGE_SHA: ${{ github.event.pull_request.merge_commit_sha }} steps: - run: | - valid="ix-compile aiur" + valid="ix-compile aiur zisk sp1 native-check" if [ "$EVENT" = workflow_dispatch ]; then # Reset the chosen workload(s) at the given commit; no PR scan. sha="${INPUT_SHA:-$HEAD_SHA}" @@ -119,7 +120,7 @@ jobs: - run: | # Accepted command tokens — applied verbatim as labels (incl. `all`, # which the merge job expands into every workload). - accepted="ix-compile aiur all" + accepted="ix-compile aiur zisk sp1 native-check all" # Parse the workload token(s) after the command, lowercased. workloads=$(printf '%s' "$BODY" \ | grep -oiE '!bencher-thresholds-reset[[:space:]]+[a-z0-9 -]+' \ @@ -142,5 +143,5 @@ jobs: --body "♻️ Baseline reset queued for:$ok — will anchor to the merge commit when this PR merges." else gh pr comment "$PR" --repo "$REPO" \ - --body "⚠️ Reset command matched no known workload (expected \`ix-compile\`, \`aiur\`, or \`all\`). Nothing will reset on merge." + --body "⚠️ Reset command matched no known workload (expected \`ix-compile\`, \`aiur\`, \`zisk\`, \`sp1\`, \`native-check\`, or \`all\`). Nothing will reset on merge." fi diff --git a/docs/benchmarking.md b/docs/benchmarking.md new file mode 100644 index 00000000..8677e4a8 --- /dev/null +++ b/docs/benchmarking.md @@ -0,0 +1,70 @@ +# Benchmarking + +Ix is benchmarked on two surfaces, both driven by one curated constant set and +the same backend drivers: + +- **`!benchmark` PR comment** (`.github/workflows/bench-pr.yml`) — on demand, + posts a **main-vs-PR** comparison table on the pull request. +- **Bencher.dev** (`.github/workflows/bench-main.yml`) — on every push to `main`, + tracks each measure over time at (project `ix`). + +## Backends + +| backend | what it measures | metrics | +|---|---|---| +| `aiur` | IxVM kernel typecheck in the Aiur STARK prover (out-of-circuit execute + in-circuit prove) | `fft-cost`, `execute-time`, `prove-time`, `peak-rss`, `constants`, `throughput` | +| `zisk` / `sp1` | the same kernel in the Zisk / SP1 zkVM hosts, **execute** only (proving needs a GPU) | `cycles`, `execute-time`, `throughput`, `peak-rss` (+ `shards`, `max-shard-cycles` for sharded runs) | +| `native` | the same kernel run **out-of-circuit and in parallel** (`ix check --anon`, whole env) — far faster | `throughput` (constants/s), `check-time`, `peak-rss`, `constants` | + +All four are driven by `.github/scripts/run.sh` (compile the env `.ixe`, run the +backend, emit a neutral `{ "": { "": n } }` JSON). The PR workflow +compares two such JSONs; the bencher workflow wraps one in Bencher Metric Format. + +## Constant set — `Benchmarks/Vectors.csv` + +One CSV is the single source of truth: `name,env,tier,shard_target,primary,aiur_fft,zisk_cycles`. + +- `env` — compile target the constant resolves in (`initStd` / `lean` / `mathlib`). +- `tier` — `cheap` (prove-feasible on a CI runner) or `heavy` (execute-only; a + single-shard prove would OOM). +- `primary` — the **~11-constant default subset**, spanning shape and the + cheap→heavy cost range (3 are shard targets). Everything defaults to this; + the full ~60-constant set is opt-in. + +`bench.py manifest` selects names by env + mode (`prove`→cheap, `execute`→all) + +`--primary`. `bench.py compare` renders the PR table. + +## `!benchmark` grammar + +Maintainer comment on a PR: + +``` +!benchmark [aiur] [zisk] [sp1] [native | all] [execute|prove] +BENCH_ENVS=initStd,mathlib # which compiled envs (default initStd) +BENCH_FULL=1 # run the full curated set, not the ~11 primary +BENCH_TIER=cheap|heavy|all # override the mode default (execute=all, prove=cheap) +BENCH_SHARD=1 # restrict to the multi-shard target constants +BENCH_GPU=1 # allow zkVM prove on a self-hosted GPU runner +RUST_LOG=info # passthrough env (allowlisted) +``` + +Defaults: `aiur`, `execute`, `initStd`, primary subset. Backends fan out as a +matrix; `main` results are cached by base SHA. zkVM `prove` is skipped with a +note unless a GPU runner is selected. + +## Bencher jobs (`bench-main.yml`) + +`build → compile → { prove, zkvm-execute, native-check }`, each reporting to its +own testbed + **workload** (`aiur`, `zisk`, `sp1`, `native-check`, `ix-compile`). +Deterministic measures (cycles, fft-cost, constants, …) are pinned exactly; +noisy wall-clock measures (time, RAM, throughput) ride percentage bounds, both +windowed to the per-workload `bencher-thresholds-reset-` tag. + +To re-baseline a workload after an intended step change, comment +`!bencher-thresholds-reset ` on the merging PR, or run the +`bencher-thresholds-reset` workflow (`.github/workflows/bencher-thresholds-reset.yml`). + +## Not yet covered + +- **zkVM proving** (Zisk/SP1 `prove`) needs a self-hosted GPU runner; on CPU + runners it is execute-only. From 0dbb0afbdf042cfeacbbd7153a727687cfe931ed Mon Sep 17 00:00:00 2001 From: samuelburnham <45365069+samuelburnham@users.noreply.github.com> Date: Tue, 30 Jun 2026 21:42:45 +0000 Subject: [PATCH 06/27] feat(ci): Aiur proves all primaries that fit 128GB; native checks primaries too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Aiur prove now covers the whole primary subset: bench.py exempts --primary from the prove cheap-tier cap, and run.sh proves each constant whose Aiur fft-cost fits the prover RAM ceiling (AIUR_PROVE_MAX_FFT, ~128 GB at 2.34 GB per billion fft) and execute-only's the rest — so heavy primaries (only Vector.extract_append at ~145 GB) still report execute metrics instead of being dropped. BENCH_FULL prove stays capped at the cheap tier so it doesn't balloon. - native now reports two views per env: the whole env (`ix check --anon`, keyed by env) and a per-primary subject check (`ix check --consts`, keyed by constant) — apples-to-apples with the zkVM --skip-deps execute. Wired into both the native-check bencher job and the !benchmark native backend. - run.sh: fix a stream-corruption bug — tool stdout and ::warning::/::notice:: now go to logs/stderr so only JSON reaches the per-constant `jq -s` merge. --- .github/scripts/bench.py | 5 +- .github/scripts/run.sh | 105 +++++++++++++++++++------------ .github/workflows/bench-main.yml | 15 +++-- docs/benchmarking.md | 9 ++- 4 files changed, 86 insertions(+), 48 deletions(-) diff --git a/.github/scripts/bench.py b/.github/scripts/bench.py index 018a1b89..746b9ac7 100644 --- a/.github/scripts/bench.py +++ b/.github/scripts/bench.py @@ -100,7 +100,10 @@ def cmd_parse(_a): # ──────────────────────── manifest ──────────────────────── def cmd_manifest(a): - tier = a.tier or ("cheap" if a.mode == "prove" else "all") + # prove defaults to the cheap tier to keep the full set bounded; the curated + # primary subset is exempt — run.sh proves each primary that fits the Aiur RAM + # ceiling and execute-only's the rest, so all primaries are selected here. + tier = a.tier or ("cheap" if (a.mode == "prove" and not a.primary) else "all") names = [] with open(a.csv) as f: for line in f: diff --git a/.github/scripts/run.sh b/.github/scripts/run.sh index 165cbc69..2f31199d 100644 --- a/.github/scripts/run.sh +++ b/.github/scripts/run.sh @@ -6,15 +6,15 @@ # # run.sh # repo_dir : checked-out worktree (has .lake/build/bin/{ix,bench-typecheck}) -# env : initStd | lean | mathlib +# env : initStd | lean | mathlib (any case; used verbatim for .ixe) # backend : aiur | zisk | sp1 | native # mode : execute | prove # # `ix` / `bench-typecheck` come from (so base measures base's code, PR -# the PR's — the caller puts /.lake/build/bin on PATH). aiur / zisk / -# sp1 run one subprocess per constant so a failure/timeout drops only that row; -# native is whole-env (`ix check --anon`, the parallel out-of-circuit kernel) and -# ignores , keyed by the env. +# the PR's — the caller puts /.lake/build/bin on PATH). Each constant +# is its own subprocess (a failure/timeout drops only that row). Only JSON is +# written to stdout — tool output and `::warning::`/`::notice::` go to logs / +# stderr so they never corrupt the merged JSON stream. set -uo pipefail repo=${1:?repo_dir}; benv=${2:?env}; backend=${3:?backend}; mode=${4:?mode} @@ -37,7 +37,7 @@ esac ixe="$repo/$benv.ixe" if [ "${REUSE_IXE:-0}" = 1 ] && [ -f "$ixe" ]; then - echo "reusing existing $ixe (REUSE_IXE)" + echo "reusing existing $ixe (REUSE_IXE)" >&2 else echo "::group::ix compile $module → $benv.ixe ($backend/$mode)" "$repo/.lake/build/bin/ix" compile "$repo/Benchmarks/Compile/$module.lean" --out "$ixe" @@ -48,19 +48,30 @@ tmp=$(mktemp -d) case "$backend" in aiur) - # One bench-typecheck subprocess per constant (isolation + per-constant - # peak-rss). Phase 1 (execute) always runs; Phase 2 (prove) unless - # --execute-only. bench-typecheck writes the neutral per-constant JSON. + # One bench-typecheck per constant (isolation + per-constant peak-rss). + # Execute mode → Phase 1 only (--execute-only). Prove mode → prove each + # constant whose Aiur fft-cost fits the prover RAM ceiling (~128 GB at + # 2.34 GB per billion fft), else fall back to execute-only so a too-large + # single-shard prove never OOM-kills the job. + csv="$repo/Benchmarks/Vectors.csv" + ceil=${AIUR_PROVE_MAX_FFT:-50000000000} while IFS= read -r c; do [ -z "$c" ] && continue slug=$(printf '%s' "$c" | tr '/ .:' '____') res="$tmp/$slug.json" - if [ "$mode" = execute ]; then - bench-typecheck --ixe "$ixe" "$c" --json "$res" --execute-only \ - || { echo "::warning::aiur execute '$c' failed; dropping"; continue; } - else - bench-typecheck --ixe "$ixe" "$c" --json "$res" --texray 2> "$tmp/$slug.tx" \ - || { echo "::warning::aiur prove '$c' failed (OOM/timeout); dropping"; continue; } + do_prove=0 + if [ "$mode" = prove ]; then + fft=$(awk -F, -v n="$c" '$1==n {print $6}' "$csv" 2>/dev/null) + if [ -n "${fft:-}" ] && [ "$fft" -le "$ceil" ]; then + do_prove=1 + else + echo "::notice::aiur: '$c' fft=${fft:-?} exceeds $ceil (~128 GB); execute-only" >&2 + fi + fi + if [ "$do_prove" = 1 ]; then + bench-typecheck --ixe "$ixe" "$c" --json "$res" --texray \ + > "$tmp/$slug.log" 2> "$tmp/$slug.tx" \ + || { echo "::warning::aiur prove '$c' failed (OOM/timeout); dropping" >&2; continue; } # Fold texray's proving RSS high-water mark (max over spans; awk's $2+0 # stops at the first non-digit) into this constant's entry. rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>m {m=$2+0} END {if (m>0) print m}' "$tmp/$slug.tx") @@ -68,6 +79,9 @@ case "$backend" in jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' "$res" > "$res.t" \ && mv "$res.t" "$res" || true fi + else + bench-typecheck --ixe "$ixe" "$c" --json "$res" --execute-only > "$tmp/$slug.log" 2>&1 \ + || { echo "::warning::aiur execute '$c' failed; dropping" >&2; continue; } fi [ -s "$res" ] && cat "$res" done < "$names" | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null @@ -97,15 +111,15 @@ case "$backend" in if [ "$mode" = execute ]; then ( cd "$work" && timeout 25m /usr/bin/time -f '%e %M' -o "$tmf" \ "$bin" --execute --ixe "$ixe" --constant "$c" --skip-deps ) > "$log" 2>>"$log" \ - || { echo "::warning::$backend execute '$c' failed/timed out; dropping"; continue; } + || { echo "::warning::$backend execute '$c' failed/timed out; dropping" >&2; continue; } fail=$(grep -oE 'failures[:=] ?[0-9]+' "$log" | head -1 | grep -oE '[0-9]+') if [ "${fail:-1}" != 0 ]; then - echo "::warning::$backend '$c': nonzero/missing failures; dropping"; continue + echo "::warning::$backend '$c': nonzero/missing failures; dropping" >&2; continue fi # Total cycles: sharded prints "total cycles: N", single prints "cycles: N". cyc=$(grep -oE 'total cycles: [0-9]+' "$log" | tail -1 | grep -oE '[0-9]+') [ -z "$cyc" ] && cyc=$(grep -oE 'cycles: [0-9]+' "$log" | tail -1 | grep -oE '[0-9]+') - [ -z "$cyc" ] && { echo "::warning::$backend '$c': no cycle count; dropping"; continue; } + [ -z "$cyc" ] && { echo "::warning::$backend '$c': no cycle count; dropping" >&2; continue; } secs=$(awk 'NR==1{print $1}' "$tmf"); rssk=$(awk 'NR==1{print $2}' "$tmf") rss=$(( ${rssk:-0} * 1024 )) tput=$(awk -v c="$cyc" -v s="${secs:-0}" 'BEGIN{ if (s>0) printf "%.0f", c/s; else print 0 }') @@ -127,7 +141,7 @@ case "$backend" in # prove (single-leaf, GPU runner only — the workflow gates this cell). ( cd "$work" && timeout 60m cargo run --quiet --release --bin "$host" -- \ --gpu --ixe "$ixe" --constant "$c" --skip-deps ) > "$log" 2>&1 \ - || { echo "::warning::$backend prove '$c' failed/timed out; dropping"; continue; } + || { echo "::warning::$backend prove '$c' failed/timed out; dropping" >&2; continue; } secs=$(grep -oE 'prove [0-9.]+s' "$log" | head -1 | grep -oE '[0-9.]+') steps=$(grep -oE '\([0-9]+ steps\)' "$log" | head -1 | grep -oE '[0-9]+') fail=$(grep -oE 'failures=[0-9]+' "$log" | head -1 | grep -oE '[0-9]+') @@ -135,7 +149,7 @@ case "$backend" in jq -n --arg n "$c" --argjson t "$secs" --argjson s "${steps:-0}" \ '{($n): {"prove-time": $t, steps: $s}}' else - echo "::warning::$backend prove '$c': no clean prove line; dropping" + echo "::warning::$backend prove '$c': no clean prove line; dropping" >&2 fi fi done < "$names" | jq -s 'reduce .[] as $o ({}; . + $o)' > "$out" 2>/dev/null @@ -143,28 +157,39 @@ case "$backend" in ;; native) - # Native out-of-circuit Rust kernel, whole env across available_parallelism - # workers — far faster than proving; ignores . ix check is a - # single multi-threaded process so /usr/bin/time -f '%M' is the true peak RSS. - # `##check## ` is the machine line. - log="$tmp/native.out"; tmf="$tmp/native.time" - /usr/bin/time -f '%e %M' -o "$tmf" ix check "$ixe" --anon > "$log" 2>>"$log" \ - || { echo "::warning::native check failed"; emit_empty; exit 0; } - line=$(grep '^##check##' "$log" | tail -1) - elapsed_ms=$(echo "$line" | awk '{print $2}') - failures=$(echo "$line" | awk '{print $4}'); total=$(echo "$line" | awk '{print $5}') - if [ -z "${total:-}" ] || [ "${failures:-1}" != 0 ]; then - echo "::warning::native check: nonzero/missing failures or no ##check## line"; emit_empty; exit 0 - fi - rssk=$(awk 'NR==1{print $2}' "$tmf"); rss=$(( ${rssk:-0} * 1024 )) - check_s=$(awk -v e="$elapsed_ms" 'BEGIN{printf "%.3f", e/1000}') - tput=$(awk -v t="$total" -v e="$elapsed_ms" 'BEGIN{ if (e>0) printf "%.2f", t*1000/e; else print 0 }') - jq -n --arg n "$benv" --argjson c "$total" --argjson s "$check_s" \ - --argjson tp "$tput" --argjson rss "$rss" \ - '{($n): {constants:$c, "check-time":$s, throughput:$tp, "peak-rss":$rss}}' > "$out" + # Native out-of-circuit Rust kernel (far faster than proving). Two views, + # both via the `##check## ` line + # (ix check is a single multi-threaded process so /usr/bin/time -f '%M' is the + # true peak RSS): the whole env in parallel (`--anon`, keyed by env), and a + # per-primary subject check (`--consts`, keyed by constant) for an + # apples-to-apples baseline next to the zkVM `--skip-deps` execute. + native_one() { #