diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 4ac8bff3a..2a2c421b4 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -2,7 +2,17 @@ ## Changes from 4.6.0 to 4.6.1 -XXX version-specific blurb XXX +### DSL → JavaScript backend for WebAssembly (`jit_backend="js"`) + +- Under WebAssembly/Pyodide, `@blosc2.dsl_kernel` kernels can now be transpiled + to JavaScript and run via the browser's JIT. It is the **default** there for + transpilable floating-point kernels (silently falling back to miniexpr for + anything it can't handle), and beats the WASM TinyCC JIT on compute-heavy + kernels (e.g. ~2.8x on a Newton-fractal kernel). Request it explicitly with + `compute(jit_backend="js")`; outside WebAssembly that raises. +- Supports index/shape symbols (`_i0`/`_n0`/`_ndim`/`_flat_idx`) and integer inputs + with a floating-point output. Integer/complex *output*, reductions, and + unsupported constructs stay on miniexpr. Native builds are unaffected. ## Changes from 4.5.1 to 4.6.0 diff --git a/bench/js-transpiler/README.md b/bench/js-transpiler/README.md new file mode 100644 index 000000000..e41de2674 --- /dev/null +++ b/bench/js-transpiler/README.md @@ -0,0 +1,113 @@ +# DSL → JavaScript transpiler benches + +Benches/demos for `blosc2.dsl_js`, which transpiles a `@blosc2.dsl_kernel` to JavaScript so +kernels run at V8-optimized native speed in the browser/Pyodide (the `jit_backend="js"` +path). Design and findings: [`plans/dsl-js.md`](../../plans/dsl-js.md). + +Both run a Newton-fractal kernel (high arithmetic intensity + per-pixel early exit) and +compare backends. Run everything from the **repo root**. + +## Headless (Node + Pyodide) — `dsl-js-node.mjs` + +Integration test **and** perf bench, no browser. Installs the blosc2 wasm wheel from PyPI, +overlays this working tree's pure-Python (`src/blosc2/dsl_js.py` + `lazyexpr.py`) on top of +it before importing blosc2 — so the wired `jit_backend="js"` path is exercised **without +rebuilding a wheel**. 
Asserts `js` matches miniexpr-JIT (`tcc`) on every kernel (max abs diff ≤ 1e-5), +then benches each kernel on fixed inputs. + +```sh +npm i # pulls pyodide@314 (see package.json) +node bench/js-transpiler/dsl-js-node.mjs # correctness + kernel sweep, 12 reps +node bench/js-transpiler/dsl-js-node.mjs 24 # N reps +``` + +Needs network on first run (PyPI wheel via micropip). Exits non-zero on a correctness +mismatch *or* a broken default fallback, so it works as a smoke test. It benches a spread of kernel +shapes so the js-vs-tcc ratio can be read against the kernel, not generalized from one. The +`default` column (no `jit`/`jit_backend` set) shows the prefer-js-with-fallback default, and +it also checks that the default agrees with miniexpr on four path-coverage kernels: int-output and unsupported-call kernels fall back cleanly to miniexpr, while int-input and index-symbol kernels run through JS. +Representative (Apple M2, 4.6.0): + +``` +default fallback (no jit_backend): int=ok index-symbol=ok -> falls back cleanly + kernel default js tcc nojit js/tcc + newton 12.0 11.4 23.9 104.5 2.10x + poly 3.0 2.9 2.7 3.2 0.91x + trans 4.3 4.3 4.4 5.7 1.01x + deep 116.0 116.8 143.4 103.3 1.23x + deepar 22.3 22.0 50.4 52.7 2.29x +``` + +Columns: `default` = prefer-js-with-fallback, `js` = forced `jit_backend="js"`, `tcc` = +miniexpr JIT (`jit_backend="tcc"`), `nojit` = miniexpr interpreter (`jit=False`). `default ≈ +js` here (all float + transpilable) → prefer-js engaged. Note `jit=True` *also* prefers js +(it's a JIT); to force miniexpr use `jit_backend="tcc"`/`"cc"`, and `jit=False` selects the +interpreter — that's what the `tcc`/`nojit` columns pin. + +**The takeaway: there is no single "js is N× the JIT" number — it depends on what the kernel +is bottlenecked on.** + +- **Arithmetic / control-flow bound** (newton, deepar) → V8's optimizing JIT beats blosc2's + miniexpr WASM codegen by **~2×**. This is the sweet spot. +- **Transcendental bound** (trans, deep) → **~1×**: time is spent in `sin`/`exp`/`log` (libm), + which costs about the same whoever runs the loop — `nojit` even edges `js` on `deep`. 
+- **Light / trivial** (poly) → **<1×**: the kernel does almost no compute, so the blosc2 + pipeline + per-call JS marshaling dominate, and `js` can be *slightly slower* than the JIT. + +So the honest generalization is qualitative: transpiling to JS wins (~2×, single-threaded) +for **compute-bound float kernels dominated by arithmetic and control flow**, and is roughly +a wash for transcendental-bound or trivial kernels. + +> The overlay pins `blosc2==4.6.0` to keep the compiled `blosc2_ext` ABI in step with the +> pure-Python we drop on top. Once these changes ship in a Pyodide-installable wheel, the +> overlay can go away. If the overlay import ever breaks on version skew, overlay all of +> `src/blosc2/*.py` (or bump the pin). + +## Browser — `newton-dsl-js.html` + +Visual proof in a real browser: transpiles a real `@blosc2.dsl_kernel` under Pyodide, checks +the emitted JS against a numpy reference on the **same** inputs, and times it against a +hand-written JS kernel over the 24-frame sweep (ratio should sit near 1.00 — the transpiler +reaches hand-written-JS speed), then renders the fractal. + +```sh +python3 -m http.server # from the repo root +# open http://localhost:8000/bench/js-transpiler/newton-dsl-js.html and click Run +``` + +Serve from the repo root (not `file://`): the page fetches `/src/blosc2/dsl_js.py` (the +local transpiler, newer than the PyPI wheel) at a server-root-absolute path. + +## Multithreading ceiling — `worker-pool-bench.mjs` + +Throwaway exploration of *how fast the transpiled kernel could go* with real JS +multithreading: pure Node (`worker_threads` + `SharedArrayBuffer`), **no Pyodide, no +blosc2**. Same Newton kernel, partitioned across a persistent worker pool with an Atomics +barrier; reports speedup vs single-thread for 1/2/4/N workers. 
+ +```sh +node bench/js-transpiler/worker-pool-bench.mjs +``` + +Findings (Apple M2, 4 performance + 4 efficiency cores; laptop numbers vary ±10–15% with +thermal/P-vs-E scheduling, so treat these as representative, not exact): + +| workers | ms/frame | speedup | +|---|---|---| +| single-thread | ~11.3 | 1.0× | +| ×2 | ~5.9 | ~1.9× (~94% eff) | +| ×4 | ~3.1 | ~3.5× (~88% eff) | +| ×8 | ~2.3 | ~4.8× (~60% eff — E-cores) | + +- The worker mechanism is ~free (×1 ≈ 1.0×); scaling is near-linear up to the performance + core count. The ×8 drop-off is the M2's efficiency cores, not overhead. +- **Load balancing is essential.** Contiguous row-bands regress badly (×4 fell to 1.48×) + because the per-pixel early-`break` makes some bands all-max-iter and others trivial. + **Striped** rows (worker `i` → rows `i, i+nw, …`) fix it — that's what the bench uses. + +Why this stays a *headroom* result, not a shipped feature: it measures **pure compute**. +The real `jit_backend="js"` path also pays ~8 ms/frame of blosc2 decompress/compress that +does not parallelize this way, plus Pyodide orchestration is single-threaded — so realistic +end-to-end gain is a fraction of 5×. And a browser integration needs pure-JS workers (not the +Pyodide bridge), a SharedArrayBuffer, COOP/COEP cross-origin isolation, and a path to get +decompressed chunks into shared memory. See "Deferred" in [`plans/dsl-js.md`](../../plans/dsl-js.md). diff --git a/bench/js-transpiler/dsl-js-node.mjs b/bench/js-transpiler/dsl-js-node.mjs new file mode 100644 index 000000000..67bb7384e --- /dev/null +++ b/bench/js-transpiler/dsl-js-node.mjs @@ -0,0 +1,271 @@ +// Headless integration test + perf bench for the DSL->JS backend (jit_backend="js"), +// using Pyodide-in-Node. Installs the blosc2 wasm wheel from PyPI, then OVERLAYS this +// working tree's pure-Python (src/blosc2/dsl_js.py + lazyexpr.py) on top of it before +// importing blosc2 -- so the wired path runs without waiting for a new wheel. 
+// +// Benches a spread of kernel shapes to show how the js-vs-JIT ratio depends on the kernel: +// branchy + early-exit (newton), branch-free light (poly), transcendental-heavy (trans), +// deep no-exit loop (deep). Reports js / jit / no-jit per kernel. +// +// npm i # pulls pyodide@314 (see package.json) +// node bench/js-transpiler/dsl-js-node.mjs # correctness + bench, 12 reps +// node bench/js-transpiler/dsl-js-node.mjs 24 # N reps +import { loadPyodide } from "pyodide"; +import { readFileSync } from "node:fs"; +import { fileURLToPath } from "node:url"; + +// Resolve paths from this file, so the harness runs from any CWD (repo root is ../../). +const ROOT = fileURLToPath(new URL("../../", import.meta.url)); +const NFRAMES = Number(process.argv[2]) || 12; + +// Kernels + bench live in a real module file: @blosc2.dsl_kernel runs inspect.getsource(), +// which needs the function to be file-backed (not exec'd from a string). +const PYSRC = String.raw` +import json, time +import numpy as np +import blosc2 + +WIDTH, HEIGHT, MAXITER = 320, 213, 48 +SPANX = 3.4 +ASPECT = HEIGHT / WIDTH +DTYPE = np.float64 + +# --- kernels spanning the cost/control-flow spectrum ------------------------------------- +@blosc2.dsl_kernel # branchy, deep, per-pixel early exit +def newton_dsl(a, b, max_iter, relax): + za = a + zb = b + mif = float(max_iter) + it = mif + for k in range(max_iter): + a2 = za * za + b2 = zb * zb + fr = za * a2 - 3.0 * za * b2 - 1.0 + fi = 3.0 * a2 * zb - zb * b2 + dr = 3.0 * (a2 - b2) + di = 6.0 * za * zb + den = dr * dr + di * di + 0.000000000001 + qr = relax * (fr * dr + fi * di) / den + qi = relax * (fi * dr - fr * di) / den + za = za - qr + zb = zb - qi + if qr * qr + qi * qi < 0.000001: + it = float(k) + break + d0 = (za - 1.0) * (za - 1.0) + zb * zb + d1 = (za + 0.5) * (za + 0.5) + (zb - 0.8660254) * (zb - 0.8660254) + d2 = (za + 0.5) * (za + 0.5) + (zb + 0.8660254) * (zb + 0.8660254) + root = 0.0 + md = d0 + if d1 < md: + md = d1 + root = 1.0 + if d2 < 
md: + root = 2.0 + return root + 0.9 * (it / mif) + +@blosc2.dsl_kernel # light, branch-free, vectorizable arithmetic +def poly_dsl(a, b): + a2 = a * a + b2 = b * b + return a2 * a - 3.0 * a * b2 + 2.0 * b2 * b - a + 0.5 * b + +@blosc2.dsl_kernel # transcendental-heavy (exercises each engine's libm). miniexpr wants +def trans_dsl(a, b): # bare sin/cos/... (not np.sin); the transpiler maps both to Math.* + msq = a * a + b * b + sc = sin(a * 3.0) * cos(b * 2.0) + ex = exp(msq * -0.5) + return sc + ex + sqrt(msq + 1.0) + log(msq + 2.0) + +@blosc2.dsl_kernel # deep fixed loop, transcendental-bound (libm sin every iter) +def deep_dsl(a, b): + acc = a + for k in range(64): + acc = acc * 0.99 + sin(acc + b) + return acc + +@blosc2.dsl_kernel # deep fixed loop, pure arithmetic (no libm, no branches); contractive +def deepar_dsl(a, b): + acc = a * 0.1 + t = b * 0.1 + for k in range(64): + t = t * 0.5 - acc * 0.25 + 0.1 + acc = acc * 0.5 + t * 0.25 + 0.1 + return acc + t + +@blosc2.dsl_kernel # P1: index/shape symbols -> per-element global coords (radial gradient) +def idxgrad_dsl(a): + dx = float(_i0) - _n0 * 0.5 # noqa: F821 + dy = float(_i1) - _n1 * 0.5 # noqa: F821 + return a + sqrt(dx * dx + dy * dy) # noqa: F821 + +@blosc2.dsl_kernel # P2: integer inputs, float output (the bridge float64-converts the operands) +def intmix_dsl(a, b): + return sqrt(a * a + b * b) * 0.25 + (a - b) # noqa: F821 + +# Path-coverage kernels (used only by path_check, not the float sweep): +@blosc2.dsl_kernel # int *output* -> default must fall back to miniexpr (float64 bridge unsafe) +def int_dsl(a, b): + return a * 2 + b * 3 + +@blosc2.dsl_kernel # int *inputs*, float output -> JS is safe (bridge float64-converts operands) +def intin_dsl(a, b): + return (a + b) * 0.5 + +@blosc2.dsl_kernel # index/shape symbols -> JS reconstructs global coords per block +def idx_dsl(a): + return a + float(_i0) # noqa: F821 + +@blosc2.dsl_kernel # expm1() is valid DSL/miniexpr but outside the JS Math.* set -> 
falls back +def unsup_dsl(a, b): + return expm1(a + b) * 0.5 # noqa: F821 + +_x = np.linspace(-SPANX / 2, SPANX / 2, WIDTH, dtype=DTYPE) +_y = np.linspace(-SPANX * ASPECT / 2, SPANX * ASPECT / 2, HEIGHT, dtype=DTYPE) +A_NP, B_NP = np.meshgrid(_x, _y) +_chunks = (min(100, HEIGHT), min(150, WIDTH)) +_blocks = (max(1, _chunks[0] // 4), max(1, _chunks[1] // 3)) +_cp = blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=1) +A_B2 = blosc2.asarray(A_NP, chunks=_chunks, blocks=_blocks, cparams=_cp) +B_B2 = blosc2.asarray(B_NP, chunks=_chunks, blocks=_blocks, cparams=_cp) +# Small-magnitude integer operands for the P2 (int in / float out) kernels. +AI_NP = (A_NP * 10).astype(np.int64) +BI_NP = (B_NP * 10).astype(np.int64) +AI_B2 = blosc2.asarray(AI_NP, chunks=_chunks, blocks=_blocks, cparams=_cp) +BI_B2 = blosc2.asarray(BI_NP, chunks=_chunks, blocks=_blocks, cparams=_cp) + +# (name, kernel, operand tuple). Fixed inputs -> each rep does identical work. +KERNELS = [ + ("newton", newton_dsl, (A_B2, B_B2, MAXITER, 1.37)), + ("poly", poly_dsl, (A_B2, B_B2)), + ("trans", trans_dsl, (A_B2, B_B2)), + ("deep", deep_dsl, (A_B2, B_B2)), + ("deepar", deepar_dsl, (A_B2, B_B2)), + ("idxgrad", idxgrad_dsl, (A_B2,)), # P1: index/shape symbols (float out) + ("intmix", intmix_dsl, (AI_B2, BI_B2)), # P2: integer inputs, float out +] + +def run(func, ops, backend, dtype=DTYPE): + kw = {"dtype": dtype, "cparams": _cp} + if backend == "js": + kw["jit_backend"] = "js" + elif backend == "tcc": + kw["jit"] = True + kw["jit_backend"] = "tcc" # miniexpr JIT, TinyCC backend (explicit) + elif backend == "nojit": + kw["jit"] = False + # "default": pass nothing -> under WASM this prefers js, falling back to miniexpr. 
+ return blosc2.lazyudf(func, ops, **kw)[:] + +def bench(func, ops, backend, reps): + run(func, ops, backend) # warm + best = float("inf") + for _ in range(3): + t = time.perf_counter() + for _ in range(reps): + run(func, ops, backend) + best = min(best, (time.perf_counter() - t) * 1000 / reps) + return best + +def debug_bridge(): + import blosc2.dsl_js as dj + bridge = dj.js_kernel(newton_dsl) + out = np.empty((HEIGHT, WIDTH), dtype=DTYPE) + inp = (A_NP, B_NP, MAXITER, 1.37) + t = time.perf_counter(); bridge(inp, out, 0); first = (time.perf_counter() - t) * 1000 + best = float("inf") + for _ in range(8): + t = time.perf_counter(); bridge(inp, out, 0); best = min(best, (time.perf_counter() - t) * 1000) + return {"first_ms": first, "warm_ms": best} + +def path_check(): + # The default backend must agree with miniexpr (tcc) on every kernel, whether it runs + # the kernel through JS (index symbols, int inputs+float out) or transparently falls back + # to miniexpr where JS can't go (int output, unsupported constructs) -- with no error. 
+ int_def = run(int_dsl, (AI_B2, BI_B2), "default", dtype=np.int64) # -> falls back to miniexpr + int_tcc = run(int_dsl, (AI_B2, BI_B2), "tcc", dtype=np.int64) + intin_def = run(intin_dsl, (AI_B2, BI_B2), "default") # -> JS (int in, float out) + intin_tcc = run(intin_dsl, (AI_B2, BI_B2), "tcc") + idx_def = run(idx_dsl, (A_B2,), "default") # -> JS (index symbols) + idx_tcc = run(idx_dsl, (A_B2,), "tcc") + unsup_def = run(unsup_dsl, (A_B2, B_B2), "default") # -> falls back to miniexpr + unsup_tcc = run(unsup_dsl, (A_B2, B_B2), "tcc") + return { + "int_ok": bool(np.array_equal(int_def, int_tcc)), + "intin_ok": bool(np.allclose(intin_def, intin_tcc)), + "idx_ok": bool(np.allclose(idx_def, idx_tcc)), + "unsup_ok": bool(np.allclose(unsup_def, unsup_tcc)), + } + +def kernel_names(): + return json.dumps([name for name, _f, _o in KERNELS]) + +def bench_kernel(i, reps): + # One kernel at a time so the driver can print each row as soon as it is computed. + import math + name, func, ops = KERNELS[i] + rj = run(func, ops, "js") + rtcc = run(func, ops, "tcc") + diff = float(np.max(np.abs(rj - rtcc))) + diff = diff if math.isfinite(diff) else 1e30 # keep JSON valid; flags as mismatch + ms = {b: bench(func, ops, b, reps) for b in ("default", "js", "tcc", "nojit")} + return json.dumps({"name": name, "ms": ms, "diff": diff}) + +def summary(): + return json.dumps({"bridge": debug_bridge(), "paths": path_check()}) +`; + +const py = await loadPyodide(); +await py.loadPackage("micropip"); +// Pin to the release this tree is based on, to keep the C-extension ABI in step with the +// pure-Python we overlay. The compiled blosc2_ext comes from the wheel; only .py is ours. +await py.runPythonAsync(`import micropip; await micropip.install("blosc2==4.6.0")`); + +// find_spec does NOT import blosc2 -- so we can patch files before first import. 
+const pkgdir = await py.runPythonAsync( + `import importlib.util, os; os.path.dirname(importlib.util.find_spec("blosc2").origin)`, +); +for (const f of ["dsl_js.py", "lazyexpr.py"]) { + py.FS.writeFile(`${pkgdir}/${f}`, readFileSync(`${ROOT}src/blosc2/${f}`)); +} +await py.runPythonAsync(` +import sys, blosc2 +assert hasattr(sys.modules["blosc2.lazyexpr"], "_as_js_udf"), "overlay did not take" +`); +console.log("blosc2", await py.runPythonAsync("blosc2.__version__"), + "| Pyodide", py.version, "| reps", NFRAMES); + +py.FS.writeFile("/kernel_bench.py", new TextEncoder().encode(PYSRC)); +await py.runPythonAsync(` +import sys +if "/" not in sys.path: sys.path.insert(0, "/") +import kernel_bench +`); + +const fmt = (x, w) => String(x).padStart(w); +const names = JSON.parse(await py.runPythonAsync("kernel_bench.kernel_names()")); + +// Stream the table: print each row as soon as its kernel finishes benchmarking. +console.log("\nper-kernel bench (ms/frame, lower is better; 'default' = prefer-js w/ fallback,"); +console.log("'tcc' = miniexpr JIT, 'nojit' = miniexpr interpreter):"); +const cols = ["default", "js", "tcc", "nojit", "js/tcc"]; +console.log(" " + "kernel".padEnd(8) + cols.map((c) => fmt(c, 8)).join("")); +const bad = []; +for (let i = 0; i < names.length; i++) { + const k = JSON.parse(await py.runPythonAsync(`kernel_bench.bench_kernel(${i}, ${NFRAMES})`)); + const { default: def, js, tcc, nojit } = k.ms; + const cells = [def, js, tcc, nojit].map((v) => v.toFixed(1)).concat((tcc / js).toFixed(2) + "x"); + console.log(" " + k.name.padEnd(8) + cells.map((c) => fmt(c, 8)).join("")); + if (k.diff > 1e-5) bad.push(k.name); +} + +const s = JSON.parse(await py.runPythonAsync("kernel_bench.summary()")); +console.log(`\ncorrectness (js vs tcc maxdiff): ${bad.length ? 
"MISMATCH " + bad : "OK"}`); +const fb = s.paths; +const fbOk = fb.int_ok && fb.intin_ok && fb.idx_ok && fb.unsup_ok; +console.log(`default backend (no jit_backend) vs miniexpr: ` + + `int-out=${fb.int_ok ? "ok" : "FAIL"} int-in=${fb.intin_ok ? "ok" : "FAIL"} ` + + `index-symbol=${fb.idx_ok ? "ok" : "FAIL"} unsupported=${fb.unsup_ok ? "ok" : "FAIL"}` + + ` -> ${fbOk ? "all paths agree" : "BROKEN"}`); +console.log(`\nnewton bridge probe (no blosc2 machinery): first=${s.bridge.first_ms.toFixed(1)} ms warm=${s.bridge.warm_ms.toFixed(1)} ms`); +if (bad.length || !fbOk) process.exit(1); diff --git a/bench/js-transpiler/newton-dsl-js.html b/bench/js-transpiler/newton-dsl-js.html new file mode 100644 index 000000000..574390240 --- /dev/null +++ b/bench/js-transpiler/newton-dsl-js.html @@ -0,0 +1,234 @@ + + +blosc2 DSL → JavaScript transpiler (Pyodide) + + + +

blosc2 DSL → JavaScript transpiler

+

Takes a real @blosc2.dsl_kernel, transpiles it to JS with +dsl_js.build_js_module(), and runs the emitted JS in the browser. Correctness: +checked against a numpy reference on the same input arrays. Speed: a fair, warmed, +best-of-N comparison against a hand-written JS kernel over the full 24-frame relax +sweep (a total, to clear the browser timer-resolution floor) — the ratio should sit near 1.00, +i.e. the transpiler reaches hand-written-JS speed. See plans/dsl-js.md.

+ + loading Pyodide… +

+
+
+
diff --git a/bench/js-transpiler/worker-pool-bench.mjs b/bench/js-transpiler/worker-pool-bench.mjs
new file mode 100644
index 000000000..a9e928d7b
--- /dev/null
+++ b/bench/js-transpiler/worker-pool-bench.mjs
@@ -0,0 +1,126 @@
+// Throwaway: how fast can the transpiled Newton kernel go with real JS multithreading?
+// Pure Node (worker_threads + SharedArrayBuffer), no Pyodide, no blosc2 -- isolates the
+// compute ceiling and per-dispatch overhead a Web Worker pool would hit in a browser.
+// The kernel is the same scalar loop dsl_js emits; hand-written here to avoid Pyodide.
+//
+//   node bench/js-transpiler/worker-pool-bench.mjs
+import { Worker, isMainThread, workerData } from "node:worker_threads";
+import os from "node:os";
+import { performance } from "node:perf_hooks";
+
+const WIDTH = 320, HEIGHT = 213, MAXITER = 48, NFRAMES = 24, SPANX = 3.4;
+const ASPECT = HEIGHT / WIDTH;
+const N = WIDTH * HEIGHT;
+
+// Striped rows (rowStart, rowStep): worker i does rows i, i+nw, ... so the per-pixel
+// early-exit work spreads evenly across workers instead of clumping in contiguous bands.
+function newtonBand(A, B, OUT, rowStart, rowStep, H, W, maxIter, relax) {
+  for (let row = rowStart; row < H; row += rowStep) {
+    for (let col = 0; col < W; col++) {
+      const i = row * W + col; // flat row-major index of this pixel
+      let za = A[i], zb = B[i], it = maxIter; // z = za + i*zb; `it` stays maxIter if the loop never converges
+      for (let k = 0; k < maxIter; k++) {
+        const a2 = za * za, b2 = zb * zb;
+        const fr = za * a2 - 3 * za * b2 - 1, fi = 3 * a2 * zb - zb * b2; // f(z) = z^3 - 1 (real, imag)
+        const dr = 3 * (a2 - b2), di = 6 * za * zb, den = dr * dr + di * di + 1e-12; // f'(z) = 3z^2; den = |f'|^2 + 1e-12 so it is never zero
+        const qr = relax * (fr * dr + fi * di) / den, qi = relax * (fi * dr - fr * di) / den; // q = relax * f / f'
+        za -= qr; zb -= qi; // relaxed Newton step: z -= q
+        if (qr * qr + qi * qi < 1e-6) { it = k; break; } // stop once |q|^2 < 1e-6, recording the iteration
+      }
+      const d0 = (za - 1) * (za - 1) + zb * zb; // squared distance to 1
+      const d1 = (za + 0.5) * (za + 0.5) + (zb - 0.8660254) * (zb - 0.8660254); // ... to -0.5 + 0.8660254i
+      const d2 = (za + 0.5) * (za + 0.5) + (zb + 0.8660254) * (zb + 0.8660254); // ... to -0.5 - 0.8660254i
+      let root = 0, md = d0;
+      if (d1 < md) { md = d1; root = 1; }
+      if (d2 < md) { root = 2; } // last candidate, so md need not be updated
+      OUT[i] = root + 0.9 * (it / maxIter); // nearest-root index plus 0.9 * (iterations used / maxIter)
+    }
+  }
+}
+
+// ctrl: Int32[ gen, done ].  params: Float64[ relax, maxIter ].
+if (!isMainThread) {
+  const { ctrlSab, paramsSab, aSab, bSab, outSab, rowStart, rowStep, W } = workerData;
+  const ctrl = new Int32Array(ctrlSab), params = new Float64Array(paramsSab);
+  const A = new Float64Array(aSab), B = new Float64Array(bSab), OUT = new Float64Array(outSab);
+  let gen = 0; // last generation value this worker has observed
+  for (;;) {
+    Atomics.wait(ctrl, 0, gen);          // block until main bumps the generation
+    gen = Atomics.load(ctrl, 0);
+    if (gen < 0) break;                  // shutdown
+    newtonBand(A, B, OUT, rowStart, rowStep, HEIGHT, W, params[1] | 0, params[0]); // `| 0` converts maxIter to an int32
+    Atomics.add(ctrl, 1, 1);             // signal this band done
+    Atomics.notify(ctrl, 1); // wake main, which waits on the done counter
+  }
+} else {
+  main(); // main thread: run the benchmark driver
+}
+
+function buildGrid() { // allocate the shared input/output buffers and fill the coordinate grids
+  const aSab = new SharedArrayBuffer(N * 8), bSab = new SharedArrayBuffer(N * 8), // 8 bytes per float64 element
+        outSab = new SharedArrayBuffer(N * 8);
+  const A = new Float64Array(aSab), B = new Float64Array(bSab);
+  const x0 = -SPANX / 2, dx = SPANX / (WIDTH - 1); // x runs from -SPANX/2 to +SPANX/2 over the columns
+  const y0 = -SPANX * ASPECT / 2, dy = SPANX * ASPECT / (HEIGHT - 1); // y span is scaled by the aspect ratio
+  for (let r = 0; r < HEIGHT; r++)
+    for (let c = 0; c < WIDTH; c++) { A[r * WIDTH + c] = x0 + dx * c; B[r * WIDTH + c] = y0 + dy * r; } // A holds x, B holds y, row-major
+  return { aSab, bSab, outSab }; // outSab is returned unfilled; the kernels write into it
+}
+
+function timeBest(fn, runs) { // returns the fastest of `runs` timed calls to fn, in performance.now() units (ms)
+  for (let w = 0; w < 2; w++) fn();      // warm V8 / workers
+  let best = Infinity;
+  for (let r = 0; r < runs; r++) { const t = performance.now(); fn(); best = Math.min(best, performance.now() - t); } // keep the minimum elapsed time
+  return best;
+}
+
+async function benchPool(nw, sabs, relaxes) { // time the full relax sweep on a pool of nw workers; resolves to the best sweep time
+  const ctrlSab = new SharedArrayBuffer(8), paramsSab = new SharedArrayBuffer(16); // 2 x int32 control words, 2 x float64 params
+  const ctrl = new Int32Array(ctrlSab), params = new Float64Array(paramsSab);
+  params[1] = MAXITER; // maxIter is constant for the whole run; only relax changes per frame
+  const workers = [];
+  for (let i = 0; i < nw; i++) {
+    workers.push(new Worker(new URL(import.meta.url), { // each worker re-runs this same file
+      workerData: { ...sabs, ctrlSab, paramsSab, rowStart: i, rowStep: nw, W: WIDTH }, // worker i gets rows i, i+nw, ...
+    }));
+  }
+  const frame = (relax) => {
+    Atomics.store(ctrl, 1, 0); // reset the done counter
+    params[0] = relax; // written before the generation bump below
+    Atomics.add(ctrl, 0, 1); // bump the generation the workers wait on
+    Atomics.notify(ctrl, 0, nw); // wake up to nw workers blocked on ctrl[0]
+    let d;                               // barrier: wait until all bands reported done
+    while ((d = Atomics.load(ctrl, 1)) < nw) Atomics.wait(ctrl, 1, d);
+  };
+  const sweep = () => { for (const rx of relaxes) frame(rx); }; // one sweep = one frame per relax value
+  const best = timeBest(sweep, 5);
+  Atomics.store(ctrl, 0, -1); Atomics.notify(ctrl, 0, nw);   // shutdown
+  await Promise.all(workers.map((w) => w.terminate())); // make sure every worker has exited before returning
+  return best;
+}
+
+async function main() { // benchmark driver: single-thread baseline, then one pool run per worker count
+  const cores = os.cpus().length;
+  const sabs = buildGrid();
+  const OUT = new Float64Array(sabs.outSab);
+  const relaxes = Array.from({ length: NFRAMES }, (_, i) => 1.0 + (1.85 - 1.0) * i / (NFRAMES - 1)); // NFRAMES values evenly spaced from 1.0 to 1.85
+
+  // Single-thread baseline on the main thread (no worker overhead at all).
+  const A = new Float64Array(sabs.aSab), B = new Float64Array(sabs.bSab);
+  const tSingle = timeBest(() => { for (const rx of relaxes) newtonBand(A, B, OUT, 0, 1, HEIGHT, WIDTH, MAXITER, rx); }, 5);
+  const ref = Float64Array.from(OUT);    // last frame (relax=1.85), for correctness check
+
+  console.log(`Newton ${WIDTH}x${HEIGHT}, max_iter=${MAXITER}, ${NFRAMES}-frame sweep | cores=${cores}`);
+  const per = (ms) => `${ms.toFixed(0)} ms total (${(ms / NFRAMES).toFixed(2)} ms/frame)`; // formats a sweep time as total and per-frame
+  console.log(`\nsingle-thread (main): ${per(tSingle)}`);
+
+  const counts = [...new Set([1, 2, 4, cores])].filter((n) => n >= 1 && n <= cores * 2).sort((a, b) => a - b); // deduplicated, capped at 2x cores, ascending
+  for (const nw of counts) {
+    const t = await benchPool(nw, sabs, relaxes);
+    let maxdiff = 0;
+    for (let i = 0; i < N; i++) maxdiff = Math.max(maxdiff, Math.abs(OUT[i] - ref[i])); // pool's final frame vs the single-thread reference
+    const sp = tSingle / t; // speedup relative to the single-thread baseline
+    console.log(`pool x${String(nw).padStart(2)} : ${per(t)}  | speedup ${sp.toFixed(2)}x` +
+                `  eff ${(100 * sp / nw).toFixed(0)}%  | maxdiff ${maxdiff.toExponential(1)}`);
+  }
+}
diff --git a/doc/getting_started/dsl_syntax.md b/doc/getting_started/dsl_syntax.md
index 2942e94b4..5aa9a2782 100644
--- a/doc/getting_started/dsl_syntax.md
+++ b/doc/getting_started/dsl_syntax.md
@@ -228,6 +228,27 @@ Runtime error examples:
 - Missing return on executed control path
 - While-loop iteration cap exceeded
 
+## Execution backends
+
+A DSL kernel is compiled and run by one of two backends, selected per evaluation
+via the `jit` / `jit_backend` arguments to `compute()` / `__getitem__`:
+
+- **miniexpr** (default on native builds): a runtime JIT (TinyCC, `jit_backend="tcc"`)
+  with an interpreter fallback (`jit=False`). Supports the full DSL described here,
+  including integer/complex dtypes and reductions.
+- **JavaScript** (`jit_backend="js"`): transpiles the kernel to JavaScript and runs it
+  through the browser's JIT. **WebAssembly/Pyodide only** — requesting it elsewhere raises.
+  Under WebAssembly it is also the *default* for eligible kernels (opt out with `jit=False`,
+  `strict_miniexpr=True`, or an explicit miniexpr backend such as `jit_backend="tcc"`), and
+  silently falls back to miniexpr for anything it cannot handle.
+
+The JavaScript backend computes in float64 and covers floating-point element-wise kernels:
+arithmetic, comparisons, `where`, `if`/`elif`/`else`, `for ... in range(...)`/`while`
+loops, the index/shape symbols (`_i0`/`_n0`/`_ndim`/`_flat_idx`), and the standard math
+functions. It also accepts **integer inputs** when the output dtype is floating. It does
+**not** support integer/complex *output*, reductions, or constructs outside the transpiled
+subset; those stay on miniexpr (or, with an explicit `jit_backend="js"`, raise).
+
 ## Python syntax that is out of DSL scope
 
 These Python features are not part of this DSL:
diff --git a/plans/dsl-js-coverage.md b/plans/dsl-js-coverage.md
new file mode 100644
index 000000000..32c6b7b1f
--- /dev/null
+++ b/plans/dsl-js-coverage.md
@@ -0,0 +1,145 @@
+# DSL → JS transpiler: coverage gaps & future work
+
+Status of the `blosc2.dsl_js` transpiler (the `jit_backend="js"` path) versus the
+miniexpr + WASM-JIT backend. Everything listed below as *unsupported* currently rides on
+**miniexpr + jit-wasm** instead of the JS bridge.
+
+## Implemented
+
+- **P1 — Index / shape symbols** (`_i0`/`_n0`/`_ndim`/`_flat_idx`, ...). The transpiler emits them
+  as trailing kernel params and the runtime driver reconstructs per-block global coordinates
+  from `(off, gshape, cshape)`; see `_module_with_index` in `src/blosc2/dsl_js.py`. The
+  whole-array shape is threaded `chunked_eval → _maybe_js_backend → _as_js_udf → js_kernel`.
+  Requires ≥1 array operand (zero-input DSL kernels stay on miniexpr) and a known output
+  shape; without a shape such kernels fall back. Covered by `tests/ndarray/test_dsl_js.py`
+  (`test_index_*`) and `tests/ndarray/test_wasm_dsl_jit.py::test_wasm_dsl_index_symbols_via_js`.
+- **P2 (input side) — Integer inputs with a floating output.** The JS bridge already
+  float64-converts every operand, which is exactly miniexpr's promotion of integer inputs for
+  a float result (so values above 2**53 lose precision identically). `_js_dtypes_ok`
+  (`src/blosc2/lazyexpr.py`) now admits integer inputs when the output dtype is floating.
+  Integer/complex *output* still goes to miniexpr — see the remaining P2 work below.
+
+## Performance characteristics (and where the residual cost is)
+
+Measured with `bench/js-transpiler/dsl-js-node.mjs` (Pyodide, ms/frame). JS beats miniexpr's
+TinyCC JIT (`tcc`) on **compute-heavy** kernels and lands at parity / slightly behind on
+**compute-light, vectorizable** ones (`js/tcc` = tcc time ÷ js time, so >1 means JS is faster):
+
+```
+kernel    js/tcc
+newton     2.80x   (heavy: loop + complex arithmetic)
+deepar     2.78x
+idxgrad    2.00x   (P1 index symbols)
+deep       1.30x
+trans      0.99x
+intmix     0.87x   (P2 int inputs; light, vectorizable)
+poly       0.86x   (light, vectorizable)
+```
+
+Two cost components matter, and only the second remains:
+
+- **Per-evaluation transpile + `js.eval` (amortized away).** Each `lazyudf` evaluation used to
+  re-parse the kernel AST and re-`eval` the JS module, while miniexpr caches its compiled
+  program by source. Now memoized: `_TRANSPILE_CACHE` (by kernel source) and `_RUN_CACHE` (the
+  V8-compiled `__run`, by module string) in `src/blosc2/dsl_js.py`. This lifted every ratio
+  (e.g. newton 2.20→2.80x, poly 0.77→0.86x) and is a real win for repeated / animation-loop use.
+
+- **Per-block marshaling (the residual).** The bridge copies each block across the Python↔JS
+  boundary: in via `ascontiguousarray(float64) → tobytes → Float64Array`, out via
+  `to_bytes → np.frombuffer`. miniexpr's prefilter computes **in place** with zero copies. For
+  light kernels (~2 ms compute) these two copies are a meaningful fraction with no compute to
+  hide them behind, so JS sits at parity or just behind `tcc` there. For compute-bound kernels
+  (the reason the JS backend exists) it is negligible.
+
+**Future lever — zero-copy block I/O.** Replace the `tobytes`/`frombuffer` copies with a
+`HEAPF64` view onto WASM linear memory so operands/output alias the block buffers (the
+"ponytail" note in `js_kernel`). This would mostly close the gap on marshaling-bound (light)
+kernels but needs care around WASM-heap lifetime/alignment, and does nothing for compute-bound
+kernels — so build it only if a real marshaling-bound workload appears.
+
+## How routing works today
+
+Under WebAssembly with `jit_backend` unset (and `jit != False`, no `strict_miniexpr`),
+blosc2 *prefers* JS for float DSL kernels and **silently falls back to miniexpr+jit-wasm**
+for anything it can't transpile — see `_maybe_js_backend` (`src/blosc2/lazyexpr.py:1475`):
+
+```python
+try:
+    bridge = _as_js_udf(expression)  # transpiles; raises on any unsupported construct
+except Exception:
+    return expression, jit, jit_backend  # fall back to miniexpr, no regression
+```
+
+With an **explicit** `jit_backend="js"`, the same gaps instead **raise** rather than fall
+back (the user asked for JS specifically, so we don't second-guess them). This includes a
+non-floating *output* dtype: `_maybe_js_backend` raises a clear `ValueError` up front rather
+than letting the float64 bridge silently compute integer/complex output (see below).
+
+The JS backend today covers *element-wise scalar kernels with a float64/float32 output* using arithmetic,
+`where`, comparisons, `if/elif/else`, `range`/`while` loops, whitelisted math functions, index/shape symbols (P1), and integer inputs (P2, input side).
+
+## Remaining P2 — Integer *output*
+
+`_js_dtypes_ok` still sends any non-floating *output* dtype to miniexpr, because the JS
+bridge computes in **float64** and can't reproduce integer semantics for the result:
+
+- **Integer division / modulo / truncation**: `//`, `%`, `int(...)` must match C/miniexpr
+  integer rules, not float `Math.floor`/`pymod`.
+- **Overflow / wraparound**: miniexpr wraps at the integer width; float64 doesn't.
+- **int64 range**: float64 can't represent int64 above 2**53 exactly.
+
+Options, in rough order of effort:
+- **int32 and smaller output**: representable exactly in float64; could be allowed for kernels
+  that provably stay within ±2^53 with integer-valued ops and an explicit safe-range /
+  no-overflow contract. Still needs integer-correct `//`/`%`/`int()` codegen.
+- **int64 output**: requires BigInt or a typed-array split-word scheme — significantly more
+  work and likely slower; probably not worth it until a real workload needs it.
+
+## Other unsupported constructs (lower priority)
+
+All of these raise `_DSLToJSError` in the transpiler → fall back (or raise under explicit
+`jit_backend="js"`).
+
+**Reductions** — any `reduce_args` (`sum`, `prod`, …) → miniexpr. Explicit
+`jit_backend="js"` raises `'jit_backend="js" does not support reductions'`. A JS reduction
+path would need a fundamentally different driver (accumulate, not map).
+
+**Statements** — only `Assign, AugAssign, Return, Expr, If, For(range), While, Break,
+Continue` are emitted (`_stmt`, `src/blosc2/dsl_js.py:151`). Not supported:
+- Tuple / multiple / subscript assignment targets — only a single `Name` target is handled
+  (`node.targets[0].id`). `a, b = ...`, `a = b = ...`, `arr[i] = ...` all fail.
+- `with`, nested `def`, `try`, etc.
+
+**Expressions** — only `Name, Constant, UnaryOp, BinOp, BoolOp, Compare, Call` (`_expr`).
+Not supported:
+- Python ternary `a if cond else b` (`ast.IfExp`) — must be written as `where(cond, a, b)`.
+- Chained comparisons `a < b < c` — only `ops[0]`/`comparators[0]` are read.
+- Subscript / indexing, attribute access (except `np.`/`numpy.`/`math.` call targets),
+  tuples, lists, dicts, comprehensions, slices.
+
+**Calls** — only `where`, `int`, `float`, `bool`, and the `_MATH` whitelist (`sin, cos, exp,
+log, sqrt, pow, floor, abs, min/max, …`, see `src/blosc2/dsl_js.py:27`). Any other call name,
+or a call through a non-`np`/`numpy`/`math` target → fall back.
+
+**For-loops** — only `for v in range(...)`. Iterating over arrays/other iterables is
+unsupported.
+
+## Environment gate (by design)
+
+Browser/Pyodide only. `_as_js_udf` raises `RuntimeError` off-WASM (`js_kernel` imports
+Pyodide's `js` at run time). On native/CI, DSL kernels always go to miniexpr+jit.
+
+## Known semantic ceilings (supported, but lossy)
+
+These transpile but with caveats worth tracking, since miniexpr may differ:
+- 64-bit integer bitwise ops degrade to int32 (JS number semantics).
+- `%` uses a Python-sign helper (`pymod`); large-magnitude float edge cases may differ.
+- `range()` with a non-literal step assumes a positive step (loop-direction guess).
+- float64/float32 are the target; exotic dtypes untested.
+
+## See also
+
+- `plans/dsl-js.md` — original design, perf numbers, and the "Deferred" / "Known ceilings"
+  notes this document expands on.
+- `src/blosc2/dsl_js.py` — the transpiler.
+- `src/blosc2/lazyexpr.py` — `_maybe_js_backend`, `_js_dtypes_ok`, `_as_js_udf` (routing).
diff --git a/plans/dsl-js.md b/plans/dsl-js.md
new file mode 100644
index 000000000..996e44594
--- /dev/null
+++ b/plans/dsl-js.md
@@ -0,0 +1,183 @@
+# Plan: Transpile blosc2 DSL kernels to JavaScript (browser/Pyodide accel)
+
+## Context
+
+In `newton-js-vs-numpy-vs-nojit-vs-jit.html`, the same Newton-fractal kernel runs four
+ways in the browser. Measured: JS 272 ms, numpy 1302 ms, blosc2 no-JIT 3023 ms, blosc2
+JIT 887 ms. Hand-written JavaScript is **~3.3× faster than the blosc2 WASM JIT** and ~11×
+faster than the no-JIT interpreter, because V8 JIT-compiles a fused scalar loop to
+optimized native code while blosc2's WASM JIT (tcc/miniexpr) does not.
+
+The same blosc2 DSL kernels (`@blosc2.dsl_kernel`) are written in a strict, bounded subset
+of Python that is already parsed via the stdlib `ast` module. So we can **transpile a DSL
+kernel to JavaScript** and run that JS in the browser, capturing the V8 speed win — without
+the user rewriting the kernel. This is browser/Pyodide-only by nature.
+
+### Why this shape (decisions settled)
+
+- **Single-threaded, per-block, via the existing `lazyudf` callable seam.** `lazyudf(func,
+  inputs)` already accepts a plain Python callable `func(inputs_tuple, output, offset)` and
+  drives it per-block through `chunked_eval`/`slices_eval`. Plugging a JS bridge in there
+  needs **zero changes to compiled code** (no `.pyx` edits, no rebuild) and handles
+  multi-input kernels (Newton takes `a`, `b`) correctly.
+- **Not a postfilter.** A postfilter is single-input / same-itemsize / 1:1 — wrong shape for
+  an N-input compute kernel. (Postfilter is the hook only for a future *transparent fused
+  read*; different feature.)
+- **No Web Worker pool / SharedArrayBuffer in the MVP.** That's a parallel-runtime project
+  (tiling driver, COOP/COEP headers, Atomics join) mostly *outside* blosc2. Single-threaded
+  JS already beats the JIT 3.3×; parallelism is deferred until measured need. Generalizing
+  the per-block bridge to **per-chunk** is the natural next rung.
+- **Shipped as `src/blosc2/dsl_js.py`**, a new `jit_backend="js"` alongside no-JIT and
+  miniexpr-JIT. Wiring is one swap in `chunked_eval`; no compiled-code changes. Started as a
+  repo-root prototype, graduated once benches confirmed the ~2× win.
+- **Default under WebAssembly is prefer-js-with-fallback.** Unless `jit=False`, a float,
+  transpilable, non-reduction DSL kernel auto-routes to js; anything js can't do
+  (integer/complex dtypes, reductions, unsupported DSL constructs) *silently falls back to
+  miniexpr*, so there is no regression. Because js is itself a JIT (the JS engine compiles
+  it), `jit=True` prefers it too — only `jit=False` (interpreter) or an explicit
+  `jit_backend` opts out; force miniexpr with `jit_backend="tcc"`/`"cc"` (see
+  `_maybe_js_backend`, `_js_dtypes_ok`). Off-WASM, `jit_backend="js"` raises; the default is
+  unchanged (miniexpr).
+
+## Feasibility summary
+
+- **Grammar is bounded and known.** `DSLValidator` in `src/blosc2/dsl_kernel.py:265-492`
+  enumerates exactly the supported nodes: assign/augassign, if/elif/else, `for ... in
+  range()`, while, break, continue, return; binops `+ - * / // % ** & | ^ << >>`,
+  single comparisons, bool ops, unary `+ - not`, calls to `range/where/int/float/bool` and
+  `np.* / numpy.* / math.*`, name/constant. Ternary, chained compares, tuple-unpack, input
+  reassignment are rejected. A transpiler maps this set ~1:1 to JS.
+- **Kernel source is available:** `DSLKernel.dsl_source` (`src/blosc2/dsl_kernel.py:495+`),
+  dedented and ready to `ast.parse`.
+- **Scalar semantics:** the kernel reads like per-element scalar code (per-pixel `for`/`break`),
+  exactly like the hand-written `newtonJS` in the demo. So transliterate the DSL function to a
+  JS function with the **same signature**, then drive it element-by-element over each block.
+
+## Architecture
+
+```
+lazyudf(js_kernel(newton_dsl), (A_B2, B_B2, MAXITER, relax))[:]
+   -> chunked_eval / slices_eval  (existing, unchanged)
+       -> per block: bridge(inputs_tuple, output, offset)
+            marshal block numpy arrays -> JS typed arrays
+            run transpiled JS element loop
+            copy result back into `output`
+```
+
+`js_kernel(dsl_kernel)` returns the plain Python callable lazyudf expects. The transpiler runs
+in Python (pure stdlib `ast`), so it works inside Pyodide too.
+
+## Component 1 — transpiler (`dsl_to_js`)
+
+Walk the Python `ast` of `kernel.dsl_source` and emit a JS function with the same signature
+and body. Node mapping (mirror `DSLValidator`'s allowed set so we stay in lockstep):
+
+- **Assign**: first time a local name is seen → `let x = expr;`, later → `x = expr;` (seed the
+  "declared" set with the parameter names).
+- **AugAssign**: `+= -= *= /=` direct; expand `**= //= %=` to the binop form below.
+- **BinOp**: `+ - * /` direct; `**` → `Math.pow(a,b)`; `//` → `Math.floor(a/b)`;
+  `%` → `pymod(a,b)` helper `(((a%b)+b)%b)` (Python sign convention); `& | ^ << >>` → JS
+  bitwise (int32 coercion — fine for boolean masks, **ceiling:** real 64-bit int bitwise not
+  supported).
+- **BoolOp** `and`/`or` → `&&`/`||`. **UnaryOp** `+ - not` → `+ - !`.
+- **Compare** (single): `== != < <= > >=` → `=== !== < <= > >=`.
+- **Call**: `where(c,a,b)` → `(c ? a : b)`; `int(x)` → `Math.trunc(x)`; `float(x)` → `(x)`;
+  `bool(x)` → `((x)!=0)`; `np.*/numpy.*/math.*` → name table to `Math.*`
+  (`sin cos tan sqrt exp log abs floor ceil pow atan2 ...`); unknown name → raise.
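+- **For** `for k in range(a[,b[,c]])` → `for (let k = START; k < STOP; k += STEP) { ... }`
+  (the test becomes `k > STOP` when the step is a negative literal).
+- **While / If / Break / Continue / Return** → the same-named JS statements
+  (`elif` → `else if`).
+
+## Component 2 — bridge (`js_kernel`)
+
+1. Transpile the kernel with `dsl_to_js`.
+2. Wrap the transpiled function in a module with a per-element driver:
+   ```js
+   const __k = function (<params>) { <body>; };
+   function __run(arrays, scalars, out, n) {
+     for (let i = 0; i < n; i++) out[i] = __k(/* per param: arrays[k][i] or scalars[k] */);
+   }
+   ```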
+3. In Pyodide, materialize it once: `import js; run = js.eval("(...)")` → JS function proxy.
+4. Return a callable `bridge(inputs_tuple, output, offset)` that, per block:
+   - splits inputs into array operands (→ `arr.to_js()` typed arrays) and scalars,
+   - calls `run(arrays, scalars, out_js, n)`,
+   - copies `out_js` back into `output`.
+
+`# ponytail: per-block to_js() copy; swap to a zero-copy HEAPF64 view onto WASM linear memory
+only if marshaling shows up as the bottleneck.`
+
+Outside Pyodide (no `js`), `js_kernel` still exposes `.js_source` for inspection/testing and
+raises if you try to *run* it.
+
+## Files (as shipped)
+
+- **`src/blosc2/dsl_js.py`** — `dsl_to_js()`, `build_js_module()`, `js_kernel()` bridge.
+- **`src/blosc2/lazyexpr.py`** — `_as_js_udf()` + the `jit_backend="js"` swap in `chunked_eval`.
+- **`tests/ndarray/test_dsl_js.py`** — transpiler + node-backed numeric-equivalence tests.
+- **`bench/js-transpiler/dsl-js-node.mjs`** — headless Pyodide-in-Node integration test + bench.
+- **`bench/js-transpiler/newton-dsl-js.html`** — browser demo (transpiled vs hand-written JS).
+- **`bench/js-transpiler/README.md`** — how to run both.
+
+## Verification
+
+1. **Transpiler tests** — `pytest tests/ndarray/test_dsl_js.py`: structure + index-symbol
+   rejection, and (when `node` is on PATH) run the emitted JS over a grid and assert it matches
+   the Python kernel to ~1e-9.
+2. **Headless wired path** — `node bench/js-transpiler/dsl-js-node.mjs`: overlays the local
+   pure-Python onto the PyPI wheel and drives the real `lazyudf(jit_backend="js")` path;
+   asserts `js`/JIT both match numpy exactly, then benches. Exits non-zero on mismatch.
+3. **Browser** — serve repo root, open `bench/js-transpiler/newton-dsl-js.html`, click Run.
+
+## Bench findings (verified)
+
+Measured on Apple M-series, blosc2 4.6.0 under Pyodide 314, Newton 320×213 / max_iter=48,
+24-frame `relax` sweep (`dsl-js-node.mjs`):
+
+| backend | ms/frame | vs `js` |
+|---|---|---|
+| `jit_backend="js"` | **~16** | — |
+| miniexpr JIT | ~31 | js **~2× faster** |
+| no-JIT interpreter | ~130 | js ~8× faster |
+
+Correctness exact: `js` and JIT both `maxdiff=0.00` vs numpy.
+
+**The ~2× is kernel-dependent, not a flat rule** (kernel sweep in `dsl-js-node.mjs`, see
+`bench/js-transpiler/README.md`). The js-vs-tcc (miniexpr JIT) win tracks what the kernel is bottlenecked on:
+**arithmetic / control-flow** bound → ~2× (newton 2.15×, a deep pure-arithmetic loop 2.23×);
+**transcendental** bound (`sin`/`exp`/`log`) → ~1× (libm cost is engine-independent; `nojit`
+can even edge `js`); **light / trivial** → <1× (blosc2 pipeline + marshaling dominate, `js`
+slightly loses). So JS helps for compute-bound float kernels heavy on arithmetic and branches.
+
+**The PyProxy gotcha (the one real bug the headless harness caught).** The bridge must pass
+the per-call operands to the JS driver as real **JS `Array`s**, not Python lists. A Python
+list arrives in JS as a `PyProxy`, so every `ops[k][i]` in the hot inner loop crosses the
+Python↔JS boundary — still correct, but ~**10× slower** (140 vs 8 ms/frame for the direct
+bridge call). The browser demo never hit this because it built its arrays in JS. Fix:
+`Array.new()` + `.push(...)` in `js_kernel`'s bridge. With that, per-chunk marshaling is cheap
+(multi-chunk ≈ single-chunk), so the **per-chunk driver below is *not* needed** for speed.
+
+## Deferred (explicitly not built now)
+
+- **Whole-array / fewer-crossing driver** — per-chunk is already cheap after the PyProxy fix,
+  so this is only worth it if a future kernel turns out to be marshaling-bound; the transpiler
+  would stay unchanged.
+- **Web Worker pool + SharedArrayBuffer** — real multithreading; needs COOP/COEP and a tiling
+  driver. Build only if single-thread proves too slow for a real workload.
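+- **Index/shape symbols** (`_i0`/`_n0`/`_flat_idx`) in the transpiler — deferred in the
+  original plan, since implemented (see `_module_with_index` in `src/blosc2/dsl_js.py`).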
+- **Postfilter-based transparent fused read** — different feature (single-input), single-threaded.
+
+## Known ceilings / limitations
+
+- Browser/Pyodide-only.
+- 64-bit integer bitwise ops degrade to int32 (JS).
+- `%` follows Python sign convention via helper; large-magnitude float edge cases may differ.
+- `range()` assumes positive step unless the step is a literal.
+- float64/float32 numeric kernels are the target; exotic dtypes untested.
diff --git a/src/blosc2/dsl_js.py b/src/blosc2/dsl_js.py
new file mode 100644
index 000000000..a5179ab29
--- /dev/null
+++ b/src/blosc2/dsl_js.py
@@ -0,0 +1,498 @@
+"""Transpile a blosc2 DSL kernel to JavaScript, and run it from a lazyudf callable.
+
+Browser/Pyodide-only payoff: V8 JIT-compiles the emitted scalar loop to optimized native
+code, which in the Newton-fractal demo beats blosc2's WASM JIT ~3.3x and the no-JIT
+interpreter ~11x. See plans/dsl-js.md.
+
+Public API:
+    dsl_to_js(kernel)         -> (js_source, param_names)  # pure stdlib, runs anywhere
+    build_js_module(k, ndim)  -> js_source string          # ndim needed for index symbols
+    js_kernel(kernel, shape)  -> callable for lazyudf(...)  # needs Pyodide `js` to *run*
+
+`kernel` may be a blosc2 DSLKernel (has .dsl_source), a plain function, or a source string.
+
+Kernels may use index/shape symbols (`_i0`/`_i1`/.., `_n0`/.., `_flat_idx`); they become
+trailing kernel parameters and the runtime driver reconstructs each element's global
+coordinate per block. That requires the output rank, so `build_js_module`/`js_kernel` need
+`ndim`/`shape`; without them, index-symbol kernels raise (and the caller falls back).
+"""
+
+from __future__ import annotations
+
+import ast
+import inspect
+import json
+import textwrap
+
+# Wired into lazyexpr via jit_backend="js": a DSL kernel is transpiled here and run as a
+# plain per-block callable. Browser/Pyodide only (js_kernel imports `js` at call time).
+
+# Canonical signature order for index/shape symbols passed to the transpiled kernel.
+_INDEX_SYMBOL_ORDER = ("_i0", "_i1", "_i2", "_n0", "_n1", "_n2", "_ndim", "_flat_idx")
+# Same symbols as a set, for membership tests while scanning a kernel's names.
+_INDEX_SYMBOLS = set(_INDEX_SYMBOL_ORDER)
+
+# numpy/math function name -> JS Math.* name (numpy aliases included).
+# A call to any key here is emitted as `Math.<value>(...)`; names not listed are rejected.
+# NOTE(review): JS `Math.round` rounds exact halves toward +Infinity, whereas numpy/Python
+# round halves to even, so `round` can differ on x.5 inputs -- confirm this is acceptable.
+_MATH = {
+    "sin": "sin",
+    "cos": "cos",
+    "tan": "tan",
+    "asin": "asin",
+    "acos": "acos",
+    "atan": "atan",
+    "atan2": "atan2",
+    "arcsin": "asin",
+    "arccos": "acos",
+    "arctan": "atan",
+    "arctan2": "atan2",
+    "sinh": "sinh",
+    "cosh": "cosh",
+    "tanh": "tanh",
+    "exp": "exp",
+    "log": "log",
+    "log2": "log2",
+    "log10": "log10",
+    "sqrt": "sqrt",
+    "cbrt": "cbrt",
+    "pow": "pow",
+    "power": "pow",
+    "hypot": "hypot",
+    "floor": "floor",
+    "ceil": "ceil",
+    "trunc": "trunc",
+    "round": "round",
+    "abs": "abs",
+    "absolute": "abs",
+    "fabs": "abs",
+    "sign": "sign",
+    "min": "min",
+    "max": "max",
+    "minimum": "min",
+    "maximum": "max",
+}
+
+# Binary operators emitted as a JS infix operator.  `**`, `//` and `%` are absent on
+# purpose: the transpiler lowers them to Math.pow / Math.floor / pymod instead.
+_BIN = {
+    ast.Add: "+",
+    ast.Sub: "-",
+    ast.Mult: "*",
+    ast.Div: "/",
+    ast.BitAnd: "&",
+    ast.BitOr: "|",
+    ast.BitXor: "^",
+    ast.LShift: "<<",
+    ast.RShift: ">>",
+}
+# Augmented assignments that have a direct JS compound-assignment operator.
+_AUG = {
+    ast.Add: "+=",
+    ast.Sub: "-=",
+    ast.Mult: "*=",
+    ast.Div: "/=",
+    ast.BitAnd: "&=",
+    ast.BitOr: "|=",
+    ast.BitXor: "^=",
+    ast.LShift: "<<=",
+    ast.RShift: ">>=",
+}
+# Comparison operators; equality maps to JS strict (non-coercing) equality.
+_CMP = {
+    ast.Eq: "===",
+    ast.NotEq: "!==",
+    ast.Lt: "<",
+    ast.LtE: "<=",
+    ast.Gt: ">",
+    ast.GtE: ">=",
+}
+
+# Helper prepended to every generated module: `%` with Python's sign convention
+# (JS `%` keeps the sign of the dividend, so the remainder is shifted and reduced again).
+JS_PRELUDE = "const pymod = (a, b) => (((a % b) + b) % b);"
+
+
+class _DSLToJSError(Exception):
+    """Raised when a kernel (or one of its constructs) cannot be transpiled to JavaScript."""
+
+
+def _get_source(obj) -> str:
+    """Return the dedented DSL source text for *obj*.
+
+    Accepted inputs, in order of preference: an object with a non-``None``
+    ``dsl_source`` attribute, a source string, or a callable whose source
+    ``inspect.getsource`` can retrieve.
+
+    Raises:
+        _DSLToJSError: if *obj* is none of the above.
+    """
+    src = getattr(obj, "dsl_source", None)
+    if src is None:
+        # `dsl_source` may exist but be unset; fall through to the other forms
+        # instead of handing None to textwrap.dedent (which raises TypeError).
+        if isinstance(obj, str):
+            src = obj
+        elif callable(obj):
+            src = inspect.getsource(obj)
+        else:
+            raise _DSLToJSError(f"cannot get DSL source from {obj!r}")
+    return textwrap.dedent(src)
+
+
+class _Transpiler:
+    """Translate one DSL kernel ``ast.FunctionDef`` into the text of a JavaScript function.
+
+    Anything outside the supported subset raises ``_DSLToJSError`` (never a stray
+    ``AttributeError``/``KeyError``/``IndexError``).  Constructs that cannot be translated
+    faithfully -- chained comparisons, ``a = b = ...``, loop ``else`` clauses, keyword
+    arguments -- are rejected rather than partially translated.
+    """
+
+    # Python unary operator -> JS prefix operator.
+    _UNARY = {ast.Not: "!", ast.USub: "-", ast.UAdd: "+"}
+
+    def transpile(self, func: ast.FunctionDef):
+        """Transpile *func*.
+
+        Returns ``(js_source, param_names, index_symbols)``; ``index_symbols`` lists the
+        index/shape symbols the kernel references, in canonical order.
+        """
+        self.params = [a.arg for a in func.args.args]
+        used_index = self._collect_index_symbols(func)
+        hoist = self._hoist_names(func)
+        body = self._block(func.body, 1)
+        # Index/shape symbols (`_i0`, `_n1`, `_flat_idx`, ...) become extra trailing
+        # parameters; the runtime driver computes them per element (see _module_with_index).
+        sig = self.params + used_index
+        head = f"function {func.name}({', '.join(sig)}) {{\n"
+        # Locals get a single function-level `let`, so a name assigned inside a nested
+        # block is still visible after it.
+        decl = f"  let {', '.join(sorted(hoist))};\n" if hoist else ""
+        return head + decl + body + "}", list(self.params), used_index
+
+    # -- scope analysis -------------------------------------------------
+    def _collect_index_symbols(self, func):
+        """Return the index/shape symbols the kernel references, in canonical order."""
+        used = {
+            node.id for node in ast.walk(func) if isinstance(node, ast.Name) and node.id in _INDEX_SYMBOLS
+        }
+        return [s for s in _INDEX_SYMBOL_ORDER if s in used]
+
+    def _hoist_names(self, func):
+        """Return the local names needing a `let`: assigned names minus params and loop vars."""
+        assigned, fortargets = set(), set()
+        for node in ast.walk(func):
+            if isinstance(node, ast.Assign):
+                for t in node.targets:
+                    if isinstance(t, ast.Name):
+                        assigned.add(t.id)
+            elif isinstance(node, ast.AugAssign) and isinstance(node.target, ast.Name):
+                assigned.add(node.target.id)
+            elif isinstance(node, ast.For) and isinstance(node.target, ast.Name):
+                fortargets.add(node.target.id)
+        return assigned - set(self.params) - fortargets
+
+    @staticmethod
+    def _target_name(target, what):
+        """Return the identifier of a plain-name *target*; reject any other target kind."""
+        if not isinstance(target, ast.Name):
+            raise _DSLToJSError(f"unsupported {what} target: {type(target).__name__}")
+        return target.id
+
+    # -- statements -----------------------------------------------------
+    def _block(self, stmts, ind):
+        """Emit a statement list at indentation level *ind*."""
+        return "".join(self._stmt(s, ind) for s in stmts)
+
+    def _stmt(self, node, ind):
+        """Emit one statement (newline-terminated); raise for unsupported ones."""
+        pad = "  " * ind
+        if isinstance(node, ast.Assign):
+            if len(node.targets) != 1:
+                # Emitting only the first target would silently drop the others.
+                raise _DSLToJSError("chained assignment (a = b = ...) is not supported")
+            name = self._target_name(node.targets[0], "assignment")
+            return f"{pad}{name} = {self._expr(node.value)};\n"
+        if isinstance(node, ast.AugAssign):
+            return pad + self._augassign(node) + "\n"
+        if isinstance(node, ast.Return):
+            return f"{pad}return {self._expr(node.value)};\n"
+        if isinstance(node, ast.Expr):
+            return f"{pad}{self._expr(node.value)};\n"
+        if isinstance(node, ast.If):
+            return self._if(node, ind)
+        if isinstance(node, ast.For):
+            return self._for(node, ind)
+        if isinstance(node, ast.While):
+            if node.orelse:
+                raise _DSLToJSError("while/else is not supported")
+            return f"{pad}while ({self._expr(node.test)}) {{\n{self._block(node.body, ind + 1)}{pad}}}\n"
+        if isinstance(node, ast.Break):
+            return f"{pad}break;\n"
+        if isinstance(node, ast.Continue):
+            return f"{pad}continue;\n"
+        raise _DSLToJSError(f"unsupported statement: {type(node).__name__}")
+
+    def _augassign(self, node):
+        """Emit an augmented assignment; `**=`, `//=` and `%=` expand to their binop form."""
+        t = self._target_name(node.target, "augmented assignment")
+        val, op = self._expr(node.value), type(node.op)
+        if op in _AUG:
+            return f"{t} {_AUG[op]} {val};"
+        if op is ast.Pow:
+            return f"{t} = Math.pow({t}, {val});"
+        if op is ast.FloorDiv:
+            return f"{t} = Math.floor({t} / {val});"
+        if op is ast.Mod:
+            return f"{t} = pymod({t}, {val});"
+        raise _DSLToJSError(f"unsupported augmented op: {op.__name__}")
+
+    def _if(self, node, ind):
+        """Emit if/elif/else; an `elif` (a lone nested If in orelse) becomes `else if`."""
+        pad = "  " * ind
+        s = f"{pad}if ({self._expr(node.test)}) {{\n{self._block(node.body, ind + 1)}{pad}}}"
+        if node.orelse:
+            if len(node.orelse) == 1 and isinstance(node.orelse[0], ast.If):
+                s += " else " + self._if(node.orelse[0], ind).lstrip()
+            else:
+                s += f" else {{\n{self._block(node.orelse, ind + 1)}{pad}}}\n"
+                return s
+        return s + "\n"
+
+    def _for(self, node, ind):
+        """Emit `for v in range(...)` as a counted JS loop.
+
+        The loop test is `v > stop` when the step looks negative (see `_neg_literal`)
+        and `v < stop` otherwise.
+        """
+        pad = "  " * ind
+        it = node.iter
+        if not (isinstance(it, ast.Call) and isinstance(it.func, ast.Name) and it.func.id == "range"):
+            raise _DSLToJSError("only `for ... in range(...)` loops are supported")
+        if it.keywords or not 1 <= len(it.args) <= 3:
+            raise _DSLToJSError("range() takes 1 to 3 positional arguments")
+        if node.orelse:
+            raise _DSLToJSError("for/else is not supported")
+        var = self._target_name(node.target, "for-loop")
+        args = it.args
+        if len(args) == 1:
+            start, stop, step, stepnode = "0", self._expr(args[0]), "1", None
+        elif len(args) == 2:
+            start, stop, step, stepnode = self._expr(args[0]), self._expr(args[1]), "1", None
+        else:
+            start, stop, step, stepnode = (
+                self._expr(args[0]),
+                self._expr(args[1]),
+                self._expr(args[2]),
+                args[2],
+            )
+        cond = f"{var} > {stop}" if _neg_literal(stepnode) else f"{var} < {stop}"
+        return (
+            f"{pad}for (let {var} = {start}; {cond}; {var} += {step}) {{\n"
+            f"{self._block(node.body, ind + 1)}{pad}}}\n"
+        )
+
+    # -- expressions ----------------------------------------------------
+    def _expr(self, node):
+        """Emit one expression; raise for any node type outside the supported set."""
+        if isinstance(node, ast.Name):
+            return node.id
+        if isinstance(node, ast.Constant):
+            return _const(node.value)
+        if isinstance(node, ast.UnaryOp):
+            sym = self._UNARY.get(type(node.op))
+            if sym is None:
+                raise _DSLToJSError(f"unsupported unary op: {type(node.op).__name__}")
+            return f"({sym}{self._expr(node.operand)})"
+        if isinstance(node, ast.BinOp):
+            return self._binop(node)
+        if isinstance(node, ast.BoolOp):
+            sym = "&&" if isinstance(node.op, ast.And) else "||"
+            return "(" + f" {sym} ".join(self._expr(v) for v in node.values) + ")"
+        if isinstance(node, ast.Compare):
+            if len(node.ops) != 1:
+                # Reading only the first operator would change the kernel's meaning.
+                raise _DSLToJSError("chained comparisons are not supported")
+            op = _CMP.get(type(node.ops[0]))
+            if op is None:
+                raise _DSLToJSError(f"unsupported comparison: {type(node.ops[0]).__name__}")
+            return f"({self._expr(node.left)} {op} {self._expr(node.comparators[0])})"
+        if isinstance(node, ast.Call):
+            return self._call(node)
+        raise _DSLToJSError(f"unsupported expression: {type(node).__name__}")
+
+    def _binop(self, node):
+        """Emit a binary operation; `**`, `//` and `%` go through Math.pow/Math.floor/pymod."""
+        left, right, op = self._expr(node.left), self._expr(node.right), type(node.op)
+        if op is ast.Pow:
+            return f"Math.pow({left}, {right})"
+        if op is ast.FloorDiv:
+            return f"Math.floor({left} / {right})"
+        if op is ast.Mod:
+            return f"pymod({left}, {right})"
+        if op in _BIN:
+            return f"({left} {_BIN[op]} {right})"
+        raise _DSLToJSError(f"unsupported binary op: {op.__name__}")
+
+    def _call(self, node):
+        """Emit a call to `where`, `int`, `float`, `bool` or a `_MATH` function."""
+        name = self._call_name(node.func)
+        if node.keywords:
+            # Keyword arguments have no JS counterpart here; dropping them would be lossy.
+            raise _DSLToJSError(f"keyword arguments are not supported: {name}()")
+        args = [self._expr(a) for a in node.args]
+        if name == "where":
+            if len(args) != 3:
+                raise _DSLToJSError("where() needs 3 args: where(cond, a, b)")
+            return f"({args[0]} ? {args[1]} : {args[2]})"
+        if name in ("int", "float", "bool") and len(args) != 1:
+            raise _DSLToJSError(f"{name}() needs exactly 1 argument")
+        if name == "int":
+            return f"Math.trunc({args[0]})"
+        if name == "float":
+            return f"({args[0]})"
+        if name == "bool":
+            return f"(({args[0]}) != 0)"
+        if name == "range":
+            raise _DSLToJSError("range() is only valid as a for-loop iterator")
+        if name in _MATH:
+            return f"Math.{_MATH[name]}({', '.join(args)})"
+        raise _DSLToJSError(f"unsupported call: {name}()")
+
+    def _call_name(self, node):
+        """Return the called function's bare name (`f` or `np.f`/`numpy.f`/`math.f`)."""
+        if isinstance(node, ast.Name):
+            return node.id
+        if (
+            isinstance(node, ast.Attribute)
+            and isinstance(node.value, ast.Name)
+            and node.value.id in {"np", "numpy", "math"}
+        ):
+            return node.attr
+        raise _DSLToJSError("unsupported call target")
+
+
+def _neg_literal(node) -> bool:
+    """Tell whether a ``range()`` step node looks negative at transpile time.
+
+    True for a numeric constant below zero and for any unary-minus expression
+    (whatever its operand); False for everything else, including ``None``.
+    """
+    if isinstance(node, ast.UnaryOp):
+        return isinstance(node.op, ast.USub)
+    if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
+        return node.value < 0
+    return False
+
+
+def _const(v) -> str:
+    """Render a Python constant (bool, int, float or str) as a JavaScript literal.
+
+    Raises ``_DSLToJSError`` for any other constant type.
+    """
+    # bool comes first: it is an int subclass and would otherwise print as True/False.
+    if isinstance(v, bool):
+        return "true" if v else "false"
+    if isinstance(v, str):
+        return json.dumps(v)
+    if isinstance(v, int):
+        return str(v)
+    if isinstance(v, float):
+        if v != v:  # NaN is the only value unequal to itself
+            return "NaN"
+        if v in (float("inf"), float("-inf")):
+            return "Infinity" if v > 0 else "-Infinity"
+        return repr(v)
+    raise _DSLToJSError(f"unsupported constant: {v!r}")
+
+
+# Transpiling is a pure function of the kernel source, so memoize it: the same kernel is
+# typically re-transpiled on every lazyudf evaluation (e.g. an animation loop rebuilds the
+# expression each frame), and ast.parse + codegen is non-trivial under WASM.
+_TRANSPILE_CACHE: dict[str, tuple] = {}
+
+# module string -> the V8-compiled `__run` function (a Pyodide JsProxy). Populated only when
+# a kernel actually runs in-browser (see js_kernel's bridge); empty off-WASM.
+_RUN_CACHE: dict = {}
+
+
+def _transpile(kernel):
+    """Transpile *kernel* to JS, memoized on its source text.
+
+    Returns ``(js_source, params, index_symbols, func_name)``.  The first function
+    definition at module level of the source is the one transpiled; if there is none,
+    ``_DSLToJSError`` is raised.
+    """
+    src = _get_source(kernel)
+    try:
+        return _TRANSPILE_CACHE[src]
+    except KeyError:
+        pass  # first time this source is seen
+    for stmt in ast.parse(src).body:
+        if isinstance(stmt, ast.FunctionDef):
+            func = stmt
+            break
+    else:
+        raise _DSLToJSError("no function definition found in DSL source")
+    js_src, params, used_index = _Transpiler().transpile(func)
+    entry = _TRANSPILE_CACHE[src] = (js_src, params, used_index, func.name)
+    return entry
+
+
+def dsl_to_js(kernel):
+    """Transpile a DSL kernel to a JS function string.
+
+    Returns ``(js_source, param_names)``, i.e. the first two fields of ``_transpile``.
+    """
+    transpiled = _transpile(kernel)
+    return transpiled[0], transpiled[1]
+
+
+def _max_index_dim(used_index) -> int:
+    """Highest axis referenced by `_iK`/`_nK` symbols (-1 if only `_ndim`/`_flat_idx`/none)."""
+    highest = -1
+    for sym in used_index:
+        prefix, axis = sym[:2], sym[2:]
+        # `_ndim` also starts with "_n" but has a non-numeric tail, so it is skipped here.
+        if prefix in ("_i", "_n") and axis.isdigit():
+            highest = max(highest, int(axis))
+    return highest
+
+
+def _op_call_args(nparams):
+    """Return one JS argument expression per kernel parameter.
+
+    Each expression reads element ``i`` of operand ``k`` when ``isarr[k]`` is set and
+    passes the operand through unchanged otherwise.
+    """
+    exprs = []
+    for k in range(nparams):
+        exprs.append(f"(isarr[{k}] ? ops[{k}][i] : ops[{k}])")
+    return exprs
+
+
+def _module_with_index(kernel_js, fname, params, used_index) -> str:
+    """Build the JS module for a kernel that references index/shape symbols.
+
+    The generated driver is ``__run(ops, isarr, out, n, off, gshape, cshape)``: `off` is
+    the block's global start coordinate, `gshape` the whole-array shape and `cshape` the
+    block shape (all JS arrays).  For every element the driver unravels the flat position
+    into a local coordinate (C-order), derives each referenced symbol from it and passes
+    the symbols to the kernel as trailing arguments."""
+    lines = [
+        "function __run(ops, isarr, out, n, off, gshape, cshape) {",
+        "  const d = cshape.length;",
+        "  const loc = new Array(d);",
+        "  for (let i = 0; i < n; i++) {",
+        "    let rem = i;",
+        "    for (let k = d - 1; k >= 0; k--) { loc[k] = rem % cshape[k]; rem = (rem - loc[k]) / cshape[k]; }",
+    ]
+    # Per-axis symbols first: `_iK` is a global coordinate, `_nK` a global extent.
+    for sym in used_index:
+        if sym in ("_flat_idx", "_ndim"):
+            continue
+        axis = int(sym[2:])
+        if sym.startswith("_i"):
+            lines.append(f"    const {sym} = off[{axis}] + loc[{axis}];")
+        else:
+            lines.append(f"    const {sym} = gshape[{axis}];")
+    if "_ndim" in used_index:
+        lines.append("    const _ndim = d;")
+    if "_flat_idx" in used_index:
+        # Row-major flat index of the element within the whole array.
+        lines.append("    let _flat_idx = 0;")
+        lines.append(
+            "    for (let k = 0; k < d; k++) _flat_idx = _flat_idx * gshape[k] + (off[k] + loc[k]);"
+        )
+    kernel_args = ", ".join(_op_call_args(len(params)) + used_index)
+    lines += [f"    out[i] = {fname}({kernel_args});", "  }", "}"]
+    driver = "\n".join(lines)
+    return f"{JS_PRELUDE}\n{kernel_js}\n{driver}\nreturn __run;"
+
+
+def build_js_module(kernel, ndim: int | None = None) -> str:
+    """Assemble self-contained JS: prelude, kernel, and an element driver returning ``__run``.
+
+    A kernel with no index/shape symbols gets the flat ``__run(ops, isarr, out, n)``
+    driver and *ndim* is not consulted.  A kernel that references such symbols gets the
+    index-aware driver (:func:`_module_with_index`); it needs *ndim* (the output rank) so
+    the referenced axes can be checked, and raises ``_DSLToJSError`` when *ndim* is
+    ``None`` or an axis is out of range."""
+    kernel_js, params, used_index, fname = _transpile(kernel)
+    if not used_index:
+        per_elem = ", ".join(_op_call_args(len(params)))
+        driver = (
+            "function __run(ops, isarr, out, n) { "
+            f"for (let i = 0; i < n; i++) out[i] = {fname}({per_elem}); }}"
+        )
+        return f"{JS_PRELUDE}\n{kernel_js}\n{driver}\nreturn __run;"
+    if ndim is None:
+        raise _DSLToJSError("kernel uses index/shape symbols; the output ndim must be supplied")
+    top_axis = _max_index_dim(used_index)
+    if top_axis >= ndim:
+        raise _DSLToJSError(f"kernel references axis {top_axis} but the output is {ndim}-D")
+    return _module_with_index(kernel_js, fname, params, used_index)
+
+
+def js_kernel(kernel, shape=None):
+    """Return a lazyudf-compatible callable that runs the transpiled JS (Pyodide only).
+
+    *shape* is the whole-array output shape; it is required for kernels that use index/shape
+    symbols (so the driver knows the rank and global geometry) and ignored otherwise.
+
+    The JS module is built eagerly (so transpile errors surface here), but it is only
+    evaluated on the first call of the returned ``bridge(inputs, output, offset=None)``,
+    which imports Pyodide's ``js`` module.  The module text is exposed as
+    ``bridge.js_source``."""
+    ndim = len(shape) if shape is not None else None
+    _, _, used_index, _ = _transpile(kernel)  # cached
+    module = build_js_module(kernel, ndim=ndim)
+    uses_index = bool(used_index)
+    gshape = tuple(int(s) for s in shape) if shape is not None else None
+    run = None  # lazily created in-browser
+
+    def bridge(inputs, output, offset=None):
+        """Run the kernel over one block: marshal *inputs* to JS, fill *output* in place."""
+        nonlocal run
+        import numpy as np
+        from js import Array, Float64Array, Uint8Array  # Pyodide
+
+        if run is None:
+            # Reuse the V8-compiled function across lazyudf evaluations of the same kernel:
+            # the module is a pure function of (source, ndim), so js.eval need run only once
+            # per distinct module instead of once per frame.
+            run = _RUN_CACHE.get(module)
+            if run is None:
+                import js
+
+                run = js.eval(f"(function() {{ {module} }})()")
+                _RUN_CACHE[module] = run
+
+        n = int(output.size)
+        # Pass real JS Arrays, not Python lists: a Python list arrives in JS as a PyProxy,
+        # so each ops[k][i] in the hot loop would cross the Python<->JS boundary (~10x slower).
+        ops = Array.new()
+        isarr = Array.new()
+        for x in inputs:
+            if isinstance(x, np.ndarray) and x.ndim > 0:
+                # Array operands are flattened (C-order) and sent as float64.
+                ops.push(
+                    _to_jsf64(
+                        np.ascontiguousarray(x, dtype=np.float64).reshape(-1), Float64Array, Uint8Array
+                    )
+                )
+                isarr.push(True)
+            else:
+                # Scalars (including 0-d arrays) are passed as a single JS number.
+                ops.push(float(x))
+                isarr.push(False)
+        out_js = Float64Array.new(n)
+        if uses_index:
+            off = offset if offset is not None else (0,) * output.ndim
+            run(
+                ops,
+                isarr,
+                out_js,
+                n,
+                _to_jsint(off, Array),
+                _to_jsint(gshape, Array),
+                _to_jsint(output.shape, Array),
+            )
+        else:
+            run(ops, isarr, out_js, n)
+        # ponytail: per-block to_js()/to_bytes() copies; swap to a zero-copy HEAPF64 view
+        # onto WASM linear memory only if marshaling shows up as the bottleneck.
+        res = np.frombuffer(bytes(out_js.to_bytes()), dtype=np.float64)
+        # Assign through `output` itself rather than `output.reshape(-1)`: for a
+        # non-contiguous `output`, reshape returns a copy and the result would be lost.
+        output[...] = res.reshape(output.shape)
+        return output
+
+    bridge.js_source = module
+    return bridge
+
+
+def _to_jsf64(xf, Float64Array, Uint8Array):
+    """Copy the bytes of *xf* into a new JS buffer and return a Float64Array over it.
+
+    *Float64Array*/*Uint8Array* are the JS constructors, passed in by the caller."""
+    raw = xf.tobytes()
+    staging = Uint8Array.new(xf.nbytes)
+    staging.assign(raw)  # Pyodide TypedArray.assign(buffer) copies bytes in
+    return Float64Array.new(staging.buffer)
+
+
+def _to_jsint(seq, Array):
+    """Small geometry vector (offset/shape) -> a real JS Array of ints (avoids PyProxy)."""
+    js_vec = Array.new()
+    for as_int in map(int, seq):
+        js_vec.push(as_int)
+    return js_vec
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 5653beff6..ae67a366f 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -479,7 +479,8 @@ def compute(
 
             - ``strict_miniexpr`` (bool): controls whether miniexpr compilation/execution
               failures are raised instead of silently falling back to regular chunked eval
-              for non-DSL expressions.
+              for non-DSL expressions.  Setting it ``True`` also opts a DSL kernel out of the
+              WebAssembly prefer-js default, keeping it on miniexpr.
 
             - ``jit`` (bool | None): enable (``True``) or disable (``False``) JIT compilation
               of the expression via miniexpr.  When ``None`` (default), JIT is only used
@@ -488,9 +489,21 @@ def compute(
               kernels.
 
             - ``jit_backend`` (str | None): select the JIT compiler backend.  Valid
-              values are ``"tcc"`` (bundled Tiny C Compiler) and ``"cc"`` (system C
-              compiler, e.g. gcc or clang).  ``None`` (default) defers to the miniexpr
-              default (``"tcc"``).
+              values are ``"tcc"`` (bundled Tiny C Compiler), ``"cc"`` (system C
+              compiler, e.g. gcc or clang), and ``"js"`` (transpile the DSL kernel to
+              JavaScript; browser/Pyodide only — see below).  ``None`` (default) defers
+              to the miniexpr default (``"tcc"``), except under WebAssembly where — unless
+              ``jit=False`` — it *prefers* ``"js"`` for transpilable float DSL kernels and
+              falls back to miniexpr otherwise.  Since ``"js"`` is itself JIT-compiled by
+              the JS engine, ``jit=True`` prefers it too; force miniexpr with
+              ``jit_backend="tcc"``/``"cc"``.
+
+            - ``"js"`` backend (WebAssembly/Pyodide only): transpiles a
+              :func:`blosc2.dsl_kernel` to JavaScript so it runs at the browser engine's
+              optimized native speed.  It tends to beat the WASM miniexpr JIT (~2x) for
+              float kernels dominated by arithmetic and control flow, and is roughly a
+              wash for transcendental-heavy or trivial kernels.  Outside WebAssembly,
+              ``jit_backend="js"`` raises.  Forcing ``"tcc"``/``"cc"`` always uses miniexpr.
 
             - ``BLOSC_ME_JIT`` environment variable: when set to ``"1"``, ``"true"``,
               ``"on"``, ``"tcc"``, or ``"cc"``, it forces ``jit=True`` and overrides
@@ -1407,8 +1420,11 @@ def fill_chunk_operands(
 def _apply_jit_backend_pragma(expression: str, inputs: dict, jit_backend: str | None) -> str:
     if jit_backend is None:
         return expression
+    if jit_backend == "js":
+        # "js" is handled earlier (DSL kernels -> JS bridge); it never carries a C pragma.
+        return expression
     if jit_backend not in ("tcc", "cc"):
-        raise ValueError("jit_backend must be one of: None, 'tcc', 'cc'")
+        raise ValueError("jit_backend must be one of: None, 'tcc', 'cc', 'js'")
 
     pragma = f"# me:compiler={jit_backend}\n"
     stripped = expression.lstrip()
@@ -1432,6 +1448,96 @@ def _is_dsl_kernel_expression(expression) -> bool:
     return isinstance(expression, DSLKernel) and expression.dsl_source is not None
 
 
+def _as_js_udf(expression, shape=None):
+    """For jit_backend="js": transpile a DSL kernel to JS and return a plain per-block
+    callable (so the normal UDF path runs it). Browser/Pyodide only.
+
+    *shape* (the whole-array output shape) is forwarded to the transpiler so kernels using
+    index/shape symbols can reconstruct global coordinates per block."""
+    if not _is_dsl_kernel_expression(expression):
+        raise ValueError('jit_backend="js" requires a blosc2.dsl_kernel-decorated kernel')
+    if not blosc2.IS_WASM:
+        raise RuntimeError('jit_backend="js" is only available under WebAssembly/Pyodide')
+    from .dsl_js import js_kernel
+
+    return js_kernel(expression, shape=shape)
+
+
+def _js_dtypes_ok(operands, kwargs) -> bool:
+    """True only if the JS bridge (which computes in float64) is safe for these operands.
+
+    The output dtype must be floating: integer/complex *output* goes to miniexpr (the bridge
+    can't reproduce integer division/overflow/truncation semantics, and float64 can't hold
+    int64 exactly).  Given a floating output, integer *inputs* are fine -- the bridge converts
+    every operand to float64, which is exactly what miniexpr does when promoting integer inputs
+    for a float result (so any values above 2**53 lose precision identically).  Complex inputs
+    are rejected (the bridge is real-only)."""
+    dt = kwargs.get("dtype")
+    if dt is None:
+        # Inferred output: only safe when all operands are float (so the output is float too).
+        return all(
+            np.issubdtype(op.dtype, np.floating)
+            for op in operands.values()
+            if isinstance(op, blosc2.NDArray)
+        )
+    if not np.issubdtype(np.dtype(dt), np.floating):
+        return False
+    return all(
+        np.issubdtype(op.dtype, np.floating) or np.issubdtype(op.dtype, np.integer)
+        for op in operands.values()
+        if isinstance(op, blosc2.NDArray)
+    )
+
+
+def _maybe_js_backend(expression, jit, jit_backend, reduce_args, operands, kwargs, shape=None):
+    """Resolve the JS backend for a DSL kernel.
+
+    - ``jit_backend="js"`` (explicit): transpile to the JS bridge, or raise if it can't.
+    - ``jit_backend=None`` under WebAssembly, unless ``jit=False``: *prefer* JS (it is a
+      JIT too, and the fastest one here) for transpilable float DSL kernels, silently
+      falling back to miniexpr for anything it can't do (non-float dtypes, reductions, or
+      unsupported DSL constructs).  ``jit=True`` and ``jit=None`` both prefer JS; only
+      ``jit=False`` (interpreter), ``strict_miniexpr=True``, or an explicit ``jit_backend``
+      opts out.
+
+    *shape* is the whole-array output shape, forwarded to the transpiler for kernels that
+    use index/shape symbols (``_i0``/``_n0``/``_flat_idx``); without it such kernels fall
+    back to miniexpr.
+
+    Returns ``(expression, jit, jit_backend)`` — expression becomes a plain per-block
+    callable when JS is chosen, else everything passes through unchanged.
+    """
+    if jit_backend == "js":
+        if reduce_args:
+            raise ValueError('jit_backend="js" does not support reductions')
+        out_dtype = kwargs.get("dtype")
+        if out_dtype is not None and not np.issubdtype(np.dtype(out_dtype), np.floating):
+            # The JS bridge computes in float64 and cannot reproduce integer/complex output
+            # semantics (division/overflow/truncation); keep those on miniexpr.
+            raise ValueError(
+                'jit_backend="js" requires a floating-point output dtype '
+                f"(got {np.dtype(out_dtype)}); drop jit_backend to use miniexpr"
+            )
+        return _as_js_udf(expression, shape), None, None
+    prefer_js = (
+        jit is not False  # jit=True/None prefer the best JIT (js); only jit=False forces interpreter
+        and jit_backend is None
+        and not kwargs.get("strict_miniexpr")  # explicit strict_miniexpr=True keeps miniexpr
+        and blosc2.IS_WASM
+        and _is_dsl_kernel_expression(expression)
+        and operands  # at least one operand: the zero-input DSL path stays on miniexpr
+        and not reduce_args
+        and _js_dtypes_ok(operands, kwargs)
+    )
+    if not prefer_js:
+        return expression, jit, jit_backend
+    try:
+        bridge = _as_js_udf(expression, shape)  # transpiles; raises on any unsupported construct
+    except Exception:
+        return expression, jit, jit_backend  # fall back to miniexpr, no regression
+    return bridge, None, None
+
+
 def _format_dsl_parse_error_hint(expr_text: str, backend_msg: str):
     marker = "parse_error_pos="
     pos0 = backend_msg.find(marker)
@@ -2961,6 +3067,12 @@ def chunked_eval(
             operands = {**operands, **where}
 
         reduce_args = kwargs.pop("_reduce_args", {})
+        # Resolve the JS backend: explicit jit_backend="js", or prefer-js-with-fallback under
+        # WebAssembly when the user left jit_backend unset (see _maybe_js_backend).
+        expression, jit, jit_backend = _maybe_js_backend(
+            expression, jit, jit_backend, reduce_args, operands, kwargs, shape=shape
+        )
+
         fast_path = _validate_chunked_eval_inputs(operands, out, shape, reduce_args)
 
         # Activate last read cache for NDField instances
@@ -4814,9 +4926,13 @@ def myudf(inputs_tuple, output, offset):
     jit: bool or None, optional
         JIT policy for miniexpr-backed execution:
         ``None`` uses default behavior (currently, JIT is tried out), ``True`` prefers JIT, ``False`` disables JIT.
-    jit_backend: {"tcc", "cc"} or None, optional
-        JIT backend selection for miniexpr-backed execution:
-        ``None`` uses backend defaults (currently "tcc"), ``"tcc"`` forces libtcc, ``"cc"`` forces C compiler backend.
+    jit_backend: {"tcc", "cc", "js"} or None, optional
+        JIT backend selection. ``None`` uses backend defaults (miniexpr "tcc"), except under
+        WebAssembly where — unless ``jit=False`` — it *prefers* ``"js"`` for transpilable DSL
+        kernels with a floating-point output, falling back to miniexpr otherwise (``jit=True``
+        prefers ``"js"`` too, as the JS engine JIT-compiles it). ``"tcc"`` forces libtcc, ``"cc"``
+        forces the C compiler backend, and ``"js"`` transpiles a :func:`blosc2.dsl_kernel`
+        to JavaScript (browser/Pyodide only; raises elsewhere).
     kwargs: Any, optional
         Keyword arguments that are supported by the :func:`empty` constructor.
         These arguments will be used by the :meth:`LazyArray.__getitem__` and
diff --git a/tests/b2view/test_basics.py b/tests/b2view/test_basics.py
index 4e8a169e5..f4c12a41a 100644
--- a/tests/b2view/test_basics.py
+++ b/tests/b2view/test_basics.py
@@ -96,6 +96,20 @@ async def wait_for_table(pilot) -> None:
     raise AssertionError("data table never finished loading")
 
 
+async def wait_until(pilot, predicate, *, message="condition not met in time") -> None:
+    """Pump the event loop until *predicate* holds.
+
+    Setting ``Input.value`` posts an ``Input.Changed`` that rebuilds dependent widgets
+    asynchronously; a single ``pilot.pause()`` is not always enough on slower/loaded CI
+    (e.g. Windows), so poll until the resulting state settles.
+    """
+    for _ in range(100):
+        await pilot.pause()
+        if predicate():
+            return
+    raise AssertionError(message)
+
+
 async def focus_data_table(pilot) -> DataTable:
     table = pilot.app.query_one("#data-table", DataTable)
     table.focus()
@@ -655,11 +669,11 @@ async def submit_filter(expr: str) -> None:
 
         # Typing narrows the candidate list (substring, case-insensitive).
         app.screen.query_one("#colfilter-input", Input).value = "v1"
-        await pilot.pause()
+        await wait_until(pilot, lambda: sel.option_count == 10, message="list did not narrow")
         assert sel.option_count == 10  # v10..v19
         # Clear the filter again so the first column ('a') is reachable.
         app.screen.query_one("#colfilter-input", Input).value = ""
-        await pilot.pause()
+        await wait_until(pilot, lambda: sel.option_count == ncols, message="list did not reset")
 
         # ↓ moves focus into the list; Space unchecks the highlighted ('a');
         # Enter applies the remaining set.
diff --git a/tests/ndarray/test_dsl_js.py b/tests/ndarray/test_dsl_js.py
new file mode 100644
index 000000000..2fe8e98c2
--- /dev/null
+++ b/tests/ndarray/test_dsl_js.py
@@ -0,0 +1,326 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+"""Tests for the DSL -> JavaScript transpiler (blosc2.dsl_js).
+
+The transpiler itself is pure stdlib and runs anywhere. Where `node` is on PATH we also
+run the emitted JS and check it matches the Python kernel semantics element-by-element.
+"""
+
+import json
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import numpy as np
+import pytest
+
+import blosc2
+from blosc2.dsl_js import build_js_module, dsl_to_js
+
+# `blosc2.lazyexpr` (attribute) is the re-exported function, not the module; grab the module.
+lx = sys.modules["blosc2.lazyexpr"]
+
+
+# Same Newton kernel as the demo (scalar semantics), as a plain function.
+def newton_dsl(a, b, max_iter, relax):
+    za = a
+    zb = b
+    mif = float(max_iter)
+    it = mif
+    for k in range(max_iter):
+        a2 = za * za
+        b2 = zb * zb
+        fr = za * a2 - 3.0 * za * b2 - 1.0
+        fi = 3.0 * a2 * zb - zb * b2
+        dr = 3.0 * (a2 - b2)
+        di = 6.0 * za * zb
+        den = dr * dr + di * di + 0.000000000001
+        qr = relax * (fr * dr + fi * di) / den
+        qi = relax * (fi * dr - fr * di) / den
+        za = za - qr
+        zb = zb - qi
+        if qr * qr + qi * qi < 0.000001:
+            it = float(k)
+            break
+    d0 = (za - 1.0) * (za - 1.0) + zb * zb
+    d1 = (za + 0.5) * (za + 0.5) + (zb - 0.8660254) * (zb - 0.8660254)
+    d2 = (za + 0.5) * (za + 0.5) + (zb + 0.8660254) * (zb + 0.8660254)
+    root = 0.0
+    md = d0
+    if d1 < md:
+        md = d1
+        root = 1.0
+    if d2 < md:
+        root = 2.0
+    return root + 0.9 * (it / mif)
+
+
+# Exercises where(), int(), //, %, ** and an elif chain.
+def misc_dsl(x, y):
+    q = int(x) // 3
+    r = x % 7.0
+    s = where(r < 2.0, x**2.0, y**0.5)  # noqa: F821
+    out = q + r + s
+    if out > 10.0:
+        out = out - 10.0
+    elif out > 5.0:
+        out = out - 5.0
+    else:
+        out = out + 1.0
+    return out
+
+
+def _run_node(module, pts, scalars):
+    """Run the emitted JS over `pts` (list of input rows) and return the output list."""
+    node = shutil.which("node")
+    if not node:
+        pytest.skip("node not found; skipping JS numeric-equivalence check")
+    ncols = len(pts[0])
+    cols = "".join(f"const c{j} = Float64Array.from(pts.map(p => p[{j}]));\n" for j in range(ncols))
+    ops = ", ".join([f"c{j}" for j in range(ncols)] + [str(s) for s in scalars])
+    isarr = ", ".join(["true"] * ncols + ["false"] * len(scalars))
+    prog = f"""
+const __run = (function() {{ {module} }})();
+const pts = {json.dumps(pts)};
+const out = new Float64Array(pts.length);
+{cols}__run([{ops}], [{isarr}], out, pts.length);
+console.log(JSON.stringify(Array.from(out)));
+"""
+    # Write to a temp file rather than `node -e <program>`: a big inlined program (the
+    # points are JSON-embedded) overflows the Windows command-line length limit (WinError 206).
+    with tempfile.TemporaryDirectory() as d:
+        script = os.path.join(d, "dsl_js_check.js")
+        with open(script, "w", encoding="utf-8") as fh:
+            fh.write(prog)
+        res = subprocess.run([node, script], capture_output=True, text=True)
+    if res.returncode != 0:
+        raise AssertionError(f"node failed:\n{res.stderr}")
+    return json.loads(res.stdout)
+
+
+def test_transpile_structure():
+    js_src, params = dsl_to_js(newton_dsl)
+    assert params == ["a", "b", "max_iter", "relax"]
+    assert "function newton_dsl(a, b, max_iter, relax)" in js_src
+    assert "for (let k = 0; k < max_iter" in js_src
+    assert "Math.pow" not in js_src  # newton uses no ** -> no Math.pow expected
+    assert "break;" in js_src
+
+    misc_js, _ = dsl_to_js(misc_dsl)
+    assert "Math.pow" in misc_js  # **
+    assert "Math.floor" in misc_js  # //
+    assert "pymod(" in misc_js  # %
+    assert "? " in misc_js  # where()
+    assert "} else if " in misc_js
+
+    for src in (js_src, misc_js, build_js_module(newton_dsl)):
+        assert src.count("{") == src.count("}"), "unbalanced braces"
+
+
+def test_index_symbols_transpile():
+    # Index/shape symbols become trailing kernel params; the driver gains the geometry args.
+    def ramp(a):
+        return float(_i0) * _n1 + _i1  # noqa: F821
+
+    js_src, params = dsl_to_js(ramp)
+    assert params == ["a"]  # only the user input is reported as a param
+    assert "function ramp(a, _i0, _i1, _n1)" in js_src
+
+    mod = build_js_module(ramp, ndim=2)
+    assert "function __run(ops, isarr, out, n, off, gshape, cshape)" in mod
+    assert "const _i0 = off[0] + loc[0];" in mod
+    assert "const _n1 = gshape[1];" in mod
+
+    # flat_idx pulls in the global-flatten loop.
+    def flat(a):
+        return float(_flat_idx)  # noqa: F821
+
+    flat_mod = build_js_module(flat, ndim=2)
+    assert "_flat_idx = _flat_idx * gshape[k]" in flat_mod
+
+    # _ndim resolves to the runtime block rank (cshape.length).
+    def ndim_k(a):
+        return float(_ndim)  # noqa: F821
+
+    ndim_mod = build_js_module(ndim_k, ndim=3)
+    assert "function ndim_k(a, _ndim)" in ndim_mod
+    assert "const _ndim = d;" in ndim_mod
+
+
+def test_index_symbols_need_ndim_and_valid_axis():
+    def ramp(a):
+        return float(_i0) + _i1  # noqa: F821
+
+    # ndim is required to know the rank / validate the referenced axes.
+    with pytest.raises(Exception, match="ndim"):
+        build_js_module(ramp)
+    # axis 1 referenced but the output is 1-D -> rejected.
+    with pytest.raises(Exception, match="axis 1"):
+        build_js_module(ramp, ndim=1)
+
+
+def _run_node_index(module, gshape, off, cshape, ncols=1):
+    """Run an index-aware module over one block and return the (flat) output list."""
+    node = shutil.which("node")
+    if not node:
+        pytest.skip("node not found; skipping JS numeric-equivalence check")
+    n = int(np.prod(cshape))
+    ops = ", ".join([f"new Float64Array({n})"] * ncols)
+    isarr = ", ".join(["true"] * ncols)
+    prog = (
+        f"const __run = (function() {{ {module} }})();\n"
+        f"const out = new Float64Array({n});\n"
+        f"__run([{ops}], [{isarr}], out, {n}, "
+        f"{json.dumps(list(off))}, {json.dumps(list(gshape))}, {json.dumps(list(cshape))});\n"
+        "console.log(JSON.stringify(Array.from(out)));\n"
+    )
+    with tempfile.TemporaryDirectory() as d:
+        script = os.path.join(d, "dsl_js_idx.js")
+        with open(script, "w", encoding="utf-8") as fh:
+            fh.write(prog)
+        res = subprocess.run([node, script], capture_output=True, text=True)
+    if res.returncode != 0:
+        raise AssertionError(f"node failed:\n{res.stderr}")
+    return json.loads(res.stdout)
+
+
+@pytest.mark.skipif(blosc2.IS_WASM, reason="emscripten cannot spawn the node subprocess")
+def test_index_ramp_matches_numpy():
+    def ramp(a):
+        return float(_i0) * _n1 + _i1  # noqa: F821
+
+    gshape = (16, 9)
+    expected = np.arange(np.prod(gshape), dtype=np.float64).reshape(gshape)
+    mod = build_js_module(ramp, ndim=2)
+    # A non-origin block exercises the offset handling.
+    off, cshape = (8, 0), (8, 9)
+    got = np.array(_run_node_index(mod, gshape, off, cshape)).reshape(cshape)
+    np.testing.assert_array_equal(got, expected[8:16, :])
+
+
+@pytest.mark.skipif(blosc2.IS_WASM, reason="emscripten cannot spawn the node subprocess")
+def test_flat_idx_matches_numpy():
+    def flat(a):
+        return float(_flat_idx) * 2.0  # noqa: F821
+
+    gshape = (16, 9)
+    expected = np.arange(np.prod(gshape), dtype=np.float64).reshape(gshape) * 2.0
+    mod = build_js_module(flat, ndim=2)
+    off, cshape = (4, 3), (3, 4)
+    got = np.array(_run_node_index(mod, gshape, off, cshape)).reshape(cshape)
+    np.testing.assert_array_equal(got, expected[4:7, 3:7])
+
+
+@pytest.mark.skipif(blosc2.IS_WASM, reason="emscripten cannot spawn the node subprocess")
+def test_newton_matches_python():
+    w, h, max_iter, relax = 40, 30, 48, 1.37
+    pts = [[-1.7 + 3.4 * c / (w - 1), -1.1 + 2.2 * r / (h - 1)] for r in range(h) for c in range(w)]
+    py_vals = [newton_dsl(a, b, max_iter, relax) for a, b in pts]
+    js_vals = _run_node(build_js_module(newton_dsl), pts, [max_iter, relax])
+    maxdiff = max(abs(p - j) for p, j in zip(py_vals, js_vals, strict=True))
+    assert maxdiff < 1e-9, f"newton py-vs-js mismatch: maxdiff={maxdiff}"
+
+
+@pytest.mark.skipif(blosc2.IS_WASM, reason="emscripten cannot spawn the node subprocess")
+def test_misc_matches_python():
+    pts = [[3.5, 16.0], [1.2, 9.0], [-4.3, 25.0], [8.0, 4.0], [0.0, 100.0]]
+    ref = []
+    for x, y in pts:
+        q = int(x) // 3
+        r = x % 7.0
+        s = (x**2.0) if (r < 2.0) else (y**0.5)
+        o = q + r + s
+        o = o - 10.0 if o > 10.0 else (o - 5.0 if o > 5.0 else o + 1.0)
+        ref.append(o)
+    js_vals = _run_node(build_js_module(misc_dsl), pts, [])
+    mdiff = max(abs(p - j) for p, j in zip(ref, js_vals, strict=True))
+    assert mdiff < 1e-9, f"misc py-vs-js mismatch: maxdiff={mdiff}"
+
+
+# --- prefer-js-with-fallback backend selection (logic only; the bridge isn't *run* here, so
+# no real WASM is needed -- IS_WASM is monkeypatched and js_kernel only transpiles) ----------
+@blosc2.dsl_kernel
+def _add(a, b):
+    return a + b
+
+
+@blosc2.dsl_kernel
+def _idx(a):
+    return a + float(_i0)  # noqa: F821  index symbol -> needs the output shape to transpile
+
+
+def test_prefer_js_selection(monkeypatch):
+    monkeypatch.setattr(blosc2, "IS_WASM", True)
+    af = blosc2.asarray(np.ones((4, 4), dtype=np.float64))
+    ai = blosc2.asarray(np.ones((4, 4), dtype=np.int64))
+
+    def sel(jit, jit_backend, operands, kwargs, reduce_args=None):
+        return lx._maybe_js_backend(_add, jit, jit_backend, reduce_args or {}, operands, kwargs)
+
+    # jit=None and jit=True both prefer js (js is a JIT) -> swapped to a plain callable
+    for jit in (None, True):
+        expr, _, jb = sel(jit, None, {"a": af, "b": af}, {"dtype": np.float64})
+        assert callable(expr)
+        assert not lx._is_dsl_kernel_expression(expr)
+        assert jb is None
+
+    # jit=False (interpreter) opts out -> stays the DSL kernel for miniexpr
+    assert sel(False, None, {"a": af, "b": af}, {"dtype": np.float64})[0] is _add
+
+    # explicit jit_backend opts out too (here tcc would force miniexpr)
+    assert sel(True, "tcc", {"a": af, "b": af}, {"dtype": np.float64})[0] is _add
+
+    # explicit strict_miniexpr=True opts out (keep miniexpr); =False/absent does not
+    assert sel(None, None, {"a": af, "b": af}, {"dtype": np.float64, "strict_miniexpr": True})[0] is _add
+    expr, *_ = sel(None, None, {"a": af, "b": af}, {"dtype": np.float64, "strict_miniexpr": False})
+    assert callable(expr)
+    assert not lx._is_dsl_kernel_expression(expr)
+
+    # integer *output*, reductions -> fall back to miniexpr
+    assert sel(None, None, {"a": ai, "b": ai}, {"dtype": np.int64})[0] is _add
+    assert sel(None, None, {"a": af}, {}, reduce_args={"op": "sum"})[0] is _add
+
+    # zero-input DSL stays on miniexpr (the zero-input fast path needs the DSL kernel)
+    assert sel(None, None, {}, {"dtype": np.float64})[0] is _add
+
+    # integer *inputs* with a float output -> JS (the bridge float64-converts operands)
+    expr, *_ = sel(None, None, {"a": ai, "b": ai}, {"dtype": np.float64})
+    assert callable(expr)
+    assert not lx._is_dsl_kernel_expression(expr)
+
+
+def test_prefer_js_index_needs_shape(monkeypatch):
+    monkeypatch.setattr(blosc2, "IS_WASM", True)
+    af = blosc2.asarray(np.ones((4, 4), dtype=np.float64))
+    # Without a shape the transpiler can't size the index symbols -> fall back to miniexpr.
+    expr, _, _ = lx._maybe_js_backend(_idx, None, None, {}, {"a": af}, {"dtype": np.float64})
+    assert expr is _idx
+    # With the output shape supplied, the index kernel transpiles and JS is chosen.
+    expr, _, _ = lx._maybe_js_backend(_idx, None, None, {}, {"a": af}, {"dtype": np.float64}, shape=(4, 4))
+    assert callable(expr)
+    assert not lx._is_dsl_kernel_expression(expr)
+
+
+@pytest.mark.skipif(blosc2.IS_WASM, reason="this test asserts off-WASM behavior")
+def test_explicit_js_off_wasm_raises():
+    # jit_backend="js" is an explicit choice -> hard error off-WASM (not a silent fallback).
+    assert not blosc2.IS_WASM  # this test runs on a native build
+    with pytest.raises(RuntimeError, match="WebAssembly"):
+        lx._maybe_js_backend(_add, None, "js", {}, {}, {})
+
+
+def test_explicit_js_integer_output_raises():
+    # Integer/complex output is left to miniexpr; explicit jit_backend="js" must reject it
+    # (the float64 bridge can't reproduce integer semantics) rather than silently compute.
+    af = blosc2.asarray(np.ones((4, 4), dtype=np.float64))
+    with pytest.raises(ValueError, match="floating-point output"):
+        lx._maybe_js_backend(_add, None, "js", {}, {"a": af, "b": af}, {"dtype": np.int64})
+    with pytest.raises(ValueError, match="floating-point output"):
+        lx._maybe_js_backend(_add, None, "js", {}, {"a": af, "b": af}, {"dtype": np.complex128})
diff --git a/tests/ndarray/test_dsl_kernels.py b/tests/ndarray/test_dsl_kernels.py
index d09c479be..0439015cf 100644
--- a/tests/ndarray/test_dsl_kernels.py
+++ b/tests/ndarray/test_dsl_kernels.py
@@ -23,6 +23,46 @@
 clip = np.clip
 
 
+def _jit_backend_available():
+    """Whether the miniexpr runtime JIT backend is bundled in this build.
+
+    The JIT compiler is not shipped on every platform (e.g. the Windows wheels lack
+    it); there a DSL kernel still compiles and runs, but via the interpreter rather
+    than a JIT kernel.  Probe a trivial kernel once so the JIT-specific assertions can
+    be gated on the platform actually having a JIT backend."""
+    try:
+
+        @blosc2.dsl_kernel
+        def _probe(a):
+            return a + 1.0
+
+        return bool(blosc2.validate_dsl_jit(_probe, [np.float64], np.float64).get("jit"))
+    except Exception:
+        return False
+
+
+JIT_AVAILABLE = _jit_backend_available()
+
+
+def _expect_jit(status):
+    """Assert *status* compiled, and that it produced a runtime JIT kernel where the
+    platform actually has a JIT backend (see :func:`_jit_backend_available`)."""
+    assert status["compiled"]
+    if JIT_AVAILABLE:
+        assert status["jit"]
+
+
+@pytest.fixture(autouse=True)
+def _no_auto_js_backend(monkeypatch):
+    """Keep this module on the miniexpr/DSL path. Under WebAssembly the default prefers the
+    JS backend for float kernels, which would bypass ``_set_pref_expr`` and break the
+    miniexpr-specific assertions here. Stubbing the dtype gate to ``False`` disables only the
+    *auto* prefer-js (explicit ``jit_backend="js"`` still works, and is covered by
+    test_dsl_js.py / test_wasm_dsl_jit.py). No-op off WebAssembly (prefer-js never engages)."""
+    # `blosc2.lazyexpr` the attribute is the re-exported function; patch the actual module.
+    monkeypatch.setattr(sys.modules["blosc2.lazyexpr"], "_js_dtypes_ok", lambda *a, **k: False)
+
+
 def _make_arrays(shape=(8, 8), chunks=(4, 4), blocks=(2, 2)):
     a = np.linspace(0, 1, num=np.prod(shape), dtype=np.float32).reshape(shape)
     b = np.linspace(1, 2, num=np.prod(shape), dtype=np.float32).reshape(shape)
@@ -319,9 +359,9 @@ def wrapped_set_pref_expr(self, expression, inputs, fp_accuracy, aux_reduc=None,
     assert "_n1" in captured["expr"]
     assert "_i1" in captured["expr"]
     # ...and it JIT-compiles rather than silently running on the interpreter.
-    assert blosc2.validate_dsl_jit(kernel_index_ramp_float_cast, {"x": np.float32}, np.float32, shape=shape)[
-        "jit"
-    ]
+    _expect_jit(
+        blosc2.validate_dsl_jit(kernel_index_ramp_float_cast, {"x": np.float32}, np.float32, shape=shape)
+    )
 
 
 def test_dsl_kernel_index_symbols_int_cast_matches_expected_ramp():
@@ -452,9 +492,11 @@ def wrapped_set_pref_expr(self, expression, inputs, fp_accuracy, aux_reduc=None,
         assert "range(niter)" not in captured["expr"]
         assert "float(niter)" not in captured["expr"]
         # ...and the loop+scalar kernel JIT-compiles (the original G3 fallback shape).
-        assert blosc2.validate_dsl_jit(
-            kernel_loop_param, {"x": a2.dtype, "y": b2.dtype, "niter": niter}, a2.dtype, shape=(32, 32)
-        )["jit"]
+        _expect_jit(
+            blosc2.validate_dsl_jit(
+                kernel_loop_param, {"x": a2.dtype, "y": b2.dtype, "niter": niter}, a2.dtype, shape=(32, 32)
+            )
+        )
     finally:
         lazyexpr_mod.try_miniexpr = old_try_miniexpr
 
@@ -1086,8 +1128,7 @@ def simple(a, b):
 
     st = blosc2.validate_dsl_jit(simple, [np.float64, np.float64], np.float64)
     assert st["valid"]
-    assert st["compiled"]
-    assert st["jit"]
+    _expect_jit(st)
     assert st["status"] == "ME_COMPILE_SUCCESS"
 
     # A scalar param is inlined (passed as a value); a variable named 'out' (the
@@ -1097,7 +1138,7 @@ def simple(a, b):
         "def withk(a, b, niter):\n    out = a + b * float(niter)\n    return out\n", "withk"
     )
     st = blosc2.validate_dsl_jit(with_out, {"a": np.float64, "b": np.float64, "niter": 3}, np.float64)
-    assert st["jit"]
+    _expect_jit(st)
 
     # Invalid syntax -> not valid, nothing compiled.
     invalid = kernel_from_source("def k(a, b):\n    a = a - b\n    return a\n", "k")
diff --git a/tests/ndarray/test_wasm_dsl_jit.py b/tests/ndarray/test_wasm_dsl_jit.py
index 00df2fd64..1dee30e5e 100644
--- a/tests/ndarray/test_wasm_dsl_jit.py
+++ b/tests/ndarray/test_wasm_dsl_jit.py
@@ -16,21 +16,94 @@ def _wasm_kernel(x, y):
     return (x + y) * 1.5 - 0.25
 
 
-@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
-def test_wasm_dsl_tcc_jit_smoke():
-    assert getattr(blosc2, "_WASM_MINIEXPR_ENABLED", False)
+@blosc2.dsl_kernel  # integer *output* -> the float64 JS bridge is unsafe -> must fall back to miniexpr
+def _wasm_int_kernel(x, y):
+    return x * 2 + y * 3
+
+
+@blosc2.dsl_kernel  # integer *inputs*, float output -> JS is safe (bridge float64-converts operands)
+def _wasm_int_input_kernel(x, y):
+    return (x + y) * 0.5
+
+
+@blosc2.dsl_kernel  # index/shape symbols -> JS reconstructs global coords per block
+def _wasm_ramp_kernel(x):
+    return float(_i0) * _n1 + _i1  # noqa: F821
 
+
+def _wasm_grids():
     a_np = np.linspace(-1.0, 1.0, 64, dtype=np.float64).reshape(8, 8)
     b_np = np.linspace(0.0, 2.0, 64, dtype=np.float64).reshape(8, 8)
     a = blosc2.asarray(a_np, chunks=(4, 4), blocks=(2, 2))
     b = blosc2.asarray(b_np, chunks=(4, 4), blocks=(2, 2))
+    return a_np, b_np, a, b
+
 
+@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
+def test_wasm_dsl_tcc_jit_smoke():
+    assert getattr(blosc2, "_WASM_MINIEXPR_ENABLED", False)
+
+    a_np, b_np, a, b = _wasm_grids()
     expr = blosc2.lazyudf(_wasm_kernel, (a, b), dtype=np.float64)
     out = expr.compute(jit=True, jit_backend="tcc", strict_miniexpr=True)
     expected = (a_np + b_np) * 1.5 - 0.25
     np.testing.assert_allclose(out[...], expected, rtol=1e-6, atol=1e-8)
 
 
+# The tests below are the native-CI counterpart of bench/js-transpiler/dsl-js-node.mjs (which
+# checks the same paths via a micropip overlay): explicit js, the prefer-js default, and the
+# silent fallback to miniexpr when js can't run a kernel.
+@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
+def test_wasm_dsl_js_backend_smoke():
+    a_np, b_np, a, b = _wasm_grids()
+    out = blosc2.lazyudf(_wasm_kernel, (a, b), dtype=np.float64).compute(jit_backend="js")
+    np.testing.assert_allclose(out[...], (a_np + b_np) * 1.5 - 0.25, rtol=1e-6, atol=1e-8)
+
+
+@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
+def test_wasm_dsl_default_prefers_js():
+    # No jit/jit_backend -> under WASM this prefers js for a float kernel; just has to be correct.
+    a_np, b_np, a, b = _wasm_grids()
+    out = blosc2.lazyudf(_wasm_kernel, (a, b), dtype=np.float64).compute()
+    np.testing.assert_allclose(out[...], (a_np + b_np) * 1.5 - 0.25, rtol=1e-6, atol=1e-8)
+
+
+@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
+def test_wasm_dsl_default_falls_back_for_int():
+    # int dtype -> js bridge unsafe -> default must fall back to miniexpr with no error.
+    ai_np = np.arange(64, dtype=np.int64).reshape(8, 8)
+    bi_np = ai_np + 1
+    ai = blosc2.asarray(ai_np, chunks=(4, 4), blocks=(2, 2))
+    bi = blosc2.asarray(bi_np, chunks=(4, 4), blocks=(2, 2))
+    out = blosc2.lazyudf(_wasm_int_kernel, (ai, bi), dtype=np.int64).compute()
+    np.testing.assert_array_equal(out[...], ai_np * 2 + bi_np * 3)
+
+
+@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
+def test_wasm_dsl_int_inputs_float_output_via_js():
+    # Integer inputs with a float output: the default must prefer js and stay correct
+    # (the bridge converts every operand to float64, matching miniexpr's promotion).
+    ai_np = np.arange(64, dtype=np.int64).reshape(8, 8)
+    bi_np = ai_np + 1
+    ai = blosc2.asarray(ai_np, chunks=(4, 4), blocks=(2, 2))
+    bi = blosc2.asarray(bi_np, chunks=(4, 4), blocks=(2, 2))
+    out = blosc2.lazyudf(_wasm_int_input_kernel, (ai, bi), dtype=np.float64).compute()
+    np.testing.assert_allclose(out[...], (ai_np + bi_np) * 0.5, rtol=1e-6, atol=1e-8)
+
+
+@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
+def test_wasm_dsl_index_symbols_via_js():
+    # An index/shape-symbol ramp (multi-chunk, so per-block offsets are exercised) must
+    # transpile to JS and reproduce the global C-order ramp, both by default and explicitly.
+    shape = (8, 8)
+    x = blosc2.asarray(np.zeros(shape, dtype=np.float64), chunks=(4, 4), blocks=(2, 2))
+    expected = np.arange(np.prod(shape), dtype=np.float64).reshape(shape)
+    out_default = blosc2.lazyudf(_wasm_ramp_kernel, (x,), dtype=np.float64).compute()
+    np.testing.assert_array_equal(out_default[...], expected)
+    out_js = blosc2.lazyudf(_wasm_ramp_kernel, (x,), dtype=np.float64).compute(jit_backend="js")
+    np.testing.assert_array_equal(out_js[...], expected)
+
+
 @pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
 def test_wasm_string_predicates_strict_miniexpr():
     assert getattr(blosc2, "_WASM_MINIEXPR_ENABLED", False)