From 17380a4e2d2b1e8aff9decfedcfcd631c28086f0 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sun, 28 Jun 2026 12:26:22 +0200 Subject: [PATCH 01/13] Add DSL->JavaScript transpiler backend (jit_backend="js") Transpile a @blosc2.dsl_kernel (the bounded Python-ast subset DSLValidator accepts) to JavaScript, so kernels run at V8-optimized native speed in the browser/Pyodide. Wired into chunked_eval as a new backend alongside no-JIT and miniexpr-JIT: the kernel becomes a plain per-block UDF callable, so there are no compiled-code changes and the default path is untouched. Verified end-to-end under Pyodide (Newton 320x213, 24-frame sweep): correctness exact vs numpy, ~2x faster than the miniexpr JIT, ~8x over no-JIT. The bridge must hand the JS driver real JS Arrays (not Python lists), else the hot loop crosses the Python<->JS boundary per element (~10x slower). - src/blosc2/dsl_js.py: dsl_to_js(), build_js_module(), js_kernel() bridge - src/blosc2/lazyexpr.py: _as_js_udf() + jit_backend="js" swap in chunked_eval - tests/ndarray/test_dsl_js.py: transpiler + node numeric-equivalence tests - bench/js-transpiler/: headless Node+Pyodide bench, browser demo, README - plans/dsl-js.md: design and verified bench findings --- bench/js-transpiler/README.md | 52 ++++ bench/js-transpiler/dsl-js-node.mjs | 186 +++++++++++++ bench/js-transpiler/newton-dsl-js.html | 234 ++++++++++++++++ plans/dsl-js.md | 168 +++++++++++ src/blosc2/dsl_js.py | 372 +++++++++++++++++++++++++ src/blosc2/lazyexpr.py | 25 ++ tests/ndarray/test_dsl_js.py | 142 ++++++++++ 7 files changed, 1179 insertions(+) create mode 100644 bench/js-transpiler/README.md create mode 100644 bench/js-transpiler/dsl-js-node.mjs create mode 100644 bench/js-transpiler/newton-dsl-js.html create mode 100644 plans/dsl-js.md create mode 100644 src/blosc2/dsl_js.py create mode 100644 tests/ndarray/test_dsl_js.py diff --git a/bench/js-transpiler/README.md b/bench/js-transpiler/README.md new file mode 100644 index 
000000000..541868ee6 --- /dev/null +++ b/bench/js-transpiler/README.md @@ -0,0 +1,52 @@ +# DSL → JavaScript transpiler benches + +Benches/demos for `blosc2.dsl_js`, which transpiles a `@blosc2.dsl_kernel` to JavaScript so +kernels run at V8-optimized native speed in the browser/Pyodide (the `jit_backend="js"` +path). Design and findings: [`plans/dsl-js.md`](../../plans/dsl-js.md). + +Both run a Newton-fractal kernel (high arithmetic intensity + per-pixel early exit) and +compare backends. Run everything from the **repo root**. + +## Headless (Node + Pyodide) — `dsl-js-node.mjs` + +Integration test **and** perf bench, no browser. Installs the blosc2 wasm wheel from PyPI, +overlays this working tree's pure-Python (`src/blosc2/dsl_js.py` + `lazyexpr.py`) on top of +it before importing blosc2 — so the wired `jit_backend="js"` path is exercised **without +rebuilding a wheel**. Asserts `js` and miniexpr-JIT both match a numpy reference exactly, +then benches a 24-frame `relax` sweep. + +```sh +npm i # pulls pyodide@314 (see package.json) +node bench/js-transpiler/dsl-js-node.mjs # correctness + 24-frame bench +node bench/js-transpiler/dsl-js-node.mjs 48 # N frames +``` + +Needs network on first run (PyPI wheel via micropip). Exits non-zero on a correctness +mismatch, so it works as a smoke test. Typical output (Apple M-series, blosc2 4.6.0): + +``` +correctness vs numpy: js maxdiff=0.00e+0 jit maxdiff=0.00e+0 OK + jit_backend="js" : ~16 ms/frame + jit (miniexpr) : ~31 ms/frame -> js ~2x faster + no-JIT : ~130 ms/frame -> js ~8x faster +``` + +> The overlay pins `blosc2==4.6.0` to keep the compiled `blosc2_ext` ABI in step with the +> pure-Python we drop on top. Once these changes ship in a Pyodide-installable wheel, the +> overlay can go away. If the overlay import ever breaks on version skew, overlay all of +> `src/blosc2/*.py` (or bump the pin). 
+ +## Browser — `newton-dsl-js.html` + +Visual proof in a real browser: transpiles a real `@blosc2.dsl_kernel` under Pyodide, checks +the emitted JS against a numpy reference on the **same** inputs, and times it against a +hand-written JS kernel over the 24-frame sweep (ratio should sit near 1.00 — the transpiler +reaches hand-written-JS speed), then renders the fractal. + +```sh +python3 -m http.server # from the repo root +# open http://localhost:8000/bench/js-transpiler/newton-dsl-js.html and click Run +``` + +Serve from the repo root (not `file://`): the page fetches `/src/blosc2/dsl_js.py` (the +local transpiler, newer than the PyPI wheel) at a server-root-absolute path. diff --git a/bench/js-transpiler/dsl-js-node.mjs b/bench/js-transpiler/dsl-js-node.mjs new file mode 100644 index 000000000..c9f4c8863 --- /dev/null +++ b/bench/js-transpiler/dsl-js-node.mjs @@ -0,0 +1,186 @@ +// Headless integration test + perf bench for the DSL->JS backend (jit_backend="js"), +// using Pyodide-in-Node. Installs the blosc2 wasm wheel from PyPI, then OVERLAYS this +// working tree's pure-Python (src/blosc2/dsl_js.py + lazyexpr.py) on top of it before +// importing blosc2 -- so the wired path runs without waiting for a new wheel. +// +// npm i # pulls pyodide@314 (see package.json) +// node bench/js-transpiler/dsl-js-node.mjs # correctness + 24-frame bench +// node bench/js-transpiler/dsl-js-node.mjs 48 # bench with N frames +import { loadPyodide } from "pyodide"; +import { readFileSync } from "node:fs"; +import { fileURLToPath } from "node:url"; + +// Resolve paths from this file, so the harness runs from any CWD (repo root is ../../). +const ROOT = fileURLToPath(new URL("../../", import.meta.url)); +const NFRAMES = Number(process.argv[2]) || 24; + +// Kernel + bench live in a real module file: @blosc2.dsl_kernel runs inspect.getsource(), +// which needs the function to be file-backed (not exec'd from a string). 
+const PYSRC = String.raw` +import json, time +import numpy as np +import blosc2 + +WIDTH, HEIGHT, MAXITER = 320, 213, 48 +SPANX = 3.4 +ASPECT = HEIGHT / WIDTH +DTYPE = np.float64 + +@blosc2.dsl_kernel +def newton_dsl(a, b, max_iter, relax): + za = a + zb = b + mif = float(max_iter) + it = mif + for k in range(max_iter): + a2 = za * za + b2 = zb * zb + fr = za * a2 - 3.0 * za * b2 - 1.0 + fi = 3.0 * a2 * zb - zb * b2 + dr = 3.0 * (a2 - b2) + di = 6.0 * za * zb + den = dr * dr + di * di + 0.000000000001 + qr = relax * (fr * dr + fi * di) / den + qi = relax * (fi * dr - fr * di) / den + za = za - qr + zb = zb - qi + if qr * qr + qi * qi < 0.000001: + it = float(k) + break + d0 = (za - 1.0) * (za - 1.0) + zb * zb + d1 = (za + 0.5) * (za + 0.5) + (zb - 0.8660254) * (zb - 0.8660254) + d2 = (za + 0.5) * (za + 0.5) + (zb + 0.8660254) * (zb + 0.8660254) + root = 0.0 + md = d0 + if d1 < md: + md = d1 + root = 1.0 + if d2 < md: + root = 2.0 + return root + 0.9 * (it / mif) + +def newton_numpy(a, b, max_iter, relax): + za = a.copy(); zb = b.copy() + mif = float(max_iter) + it = np.full(a.shape, mif) + alive = np.ones(a.shape, dtype=bool) + for k in range(max_iter): + a2 = za * za; b2 = zb * zb + fr = za * a2 - 3.0 * za * b2 - 1.0; fi = 3.0 * a2 * zb - zb * b2 + dr = 3.0 * (a2 - b2); di = 6.0 * za * zb; den = dr * dr + di * di + 1e-12 + qr = relax * (fr * dr + fi * di) / den; qi = relax * (fi * dr - fr * di) / den + za = za - qr; zb = zb - qi + c = alive & ((qr * qr + qi * qi) < 1e-6); it[c] = k; alive &= ~c + if not alive.any(): + break + d0 = (za - 1.0) ** 2 + zb ** 2 + d1 = (za + 0.5) ** 2 + (zb - 0.8660254) ** 2 + d2 = (za + 0.5) ** 2 + (zb + 0.8660254) ** 2 + root = np.zeros(a.shape); md = d0.copy() + m = d1 < md; md = np.where(m, d1, md); root = np.where(m, 1.0, root) + m = d2 < md; root = np.where(m, 2.0, root) + return root + 0.9 * (it / mif) + +_x = np.linspace(-SPANX / 2, SPANX / 2, WIDTH, dtype=DTYPE) +_y = np.linspace(-SPANX * ASPECT / 2, SPANX * ASPECT / 2, 
HEIGHT, dtype=DTYPE) +A_NP, B_NP = np.meshgrid(_x, _y) +_chunks = (min(100, HEIGHT), min(150, WIDTH)) +_blocks = (max(1, _chunks[0] // 4), max(1, _chunks[1] // 3)) +_cp = blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=1) +A_B2 = blosc2.asarray(A_NP, chunks=_chunks, blocks=_blocks, cparams=_cp) +B_B2 = blosc2.asarray(B_NP, chunks=_chunks, blocks=_blocks, cparams=_cp) + +def run(relax, backend): + kw = {"dtype": DTYPE, "cparams": _cp} + if backend == "js": + kw["jit_backend"] = "js" + elif backend == "jit": + kw["jit"] = True + else: + kw["jit"] = False + return blosc2.lazyudf(newton_dsl, (A_B2, B_B2, MAXITER, relax), **kw)[:] + +def correctness(): + rx = 1.37 + ref = newton_numpy(A_NP, B_NP, MAXITER, rx) + return { + "diff_js": float(np.max(np.abs(run(rx, "js") - ref))), + "diff_jit": float(np.max(np.abs(run(rx, "jit") - ref))), + } + +def bench(nframes): + relaxes = [float(v) for v in np.linspace(1.0, 1.85, nframes)] + def sweep_ms(backend): + for r in relaxes[:2]: + run(r, backend) # warm + best = float("inf") + for _ in range(3): + t = time.perf_counter() + for r in relaxes: + run(r, backend) + best = min(best, (time.perf_counter() - t) * 1000) + return best + return {b: sweep_ms(b) for b in ("js", "jit", "nojit")} + +def debug_bridge(): + # Call the JS bridge directly (no lazyudf/blosc2 machinery) to isolate marshal+compile+compute. 
+ import blosc2.dsl_js as dj + bridge = dj.js_kernel(newton_dsl) + out = np.empty((HEIGHT, WIDTH), dtype=DTYPE) + inp = (A_NP, B_NP, MAXITER, 1.37) + t = time.perf_counter(); bridge(inp, out, 0); first = (time.perf_counter() - t) * 1000 + best = float("inf") + for _ in range(8): + t = time.perf_counter(); bridge(inp, out, 0); best = min(best, (time.perf_counter() - t) * 1000) + return {"first_ms": first, "warm_ms": best} + +def result(nframes): + out = correctness() + out["ms"] = bench(nframes) + out["bridge"] = debug_bridge() + out["nframes"] = nframes + return json.dumps(out) +`; + +const py = await loadPyodide(); +await py.loadPackage("micropip"); +// Pin to the release this tree is based on, to keep the C-extension ABI in step with the +// pure-Python we overlay. The compiled blosc2_ext comes from the wheel; only .py is ours. +await py.runPythonAsync(`import micropip; await micropip.install("blosc2==4.6.0")`); + +// find_spec does NOT import blosc2 -- so we can patch files before first import. +const pkgdir = await py.runPythonAsync( + `import importlib.util, os; os.path.dirname(importlib.util.find_spec("blosc2").origin)`, +); +for (const f of ["dsl_js.py", "lazyexpr.py"]) { + py.FS.writeFile(`${pkgdir}/${f}`, readFileSync(`${ROOT}src/blosc2/${f}`)); +} +await py.runPythonAsync(` +import sys, blosc2 +assert hasattr(sys.modules["blosc2.lazyexpr"], "_as_js_udf"), "overlay did not take" +`); +console.log("blosc2", await py.runPythonAsync("blosc2.__version__"), + "| Pyodide", py.version, "| frames", NFRAMES); + +py.FS.writeFile("/newton_bench.py", new TextEncoder().encode(PYSRC)); +const out = await py.runPythonAsync(` +import sys +if "/" not in sys.path: sys.path.insert(0, "/") +import newton_bench +newton_bench.result(${NFRAMES}) +`); + +const r = JSON.parse(out); +const ok = r.diff_js < 1e-9 && r.diff_jit < 1e-9; +console.log(`\ncorrectness vs numpy: js maxdiff=${r.diff_js.toExponential(2)} ` + + `jit maxdiff=${r.diff_jit.toExponential(2)} ${ok ? 
"OK" : "MISMATCH"}`); +const per = ms => `${ms.toFixed(0)} ms total (${(ms / r.nframes).toFixed(2)} ms/frame)`; +console.log(`\nperf (${r.nframes}-frame relax sweep, best of 3):`); +console.log(` jit_backend="js" : ${per(r.ms.js)}`); +console.log(` jit (miniexpr) : ${per(r.ms.jit)}`); +console.log(` no-JIT : ${per(r.ms.nojit)}`); +console.log(` js vs jit : ${(r.ms.jit / r.ms.js).toFixed(2)}x ` + + `js vs no-jit : ${(r.ms.nojit / r.ms.js).toFixed(2)}x`); +console.log(`\nbridge probe (direct call, no blosc2 machinery): ` + + `first=${r.bridge.first_ms.toFixed(1)} ms warm=${r.bridge.warm_ms.toFixed(1)} ms`); +if (!ok) process.exit(1); diff --git a/bench/js-transpiler/newton-dsl-js.html b/bench/js-transpiler/newton-dsl-js.html new file mode 100644 index 000000000..574390240 --- /dev/null +++ b/bench/js-transpiler/newton-dsl-js.html @@ -0,0 +1,234 @@ + + +blosc2 DSL → JavaScript transpiler (Pyodide) + + + +

blosc2 DSL → JavaScript transpiler

+

Takes a real @blosc2.dsl_kernel, transpiles it to JS with +dsl_js.build_js_module(), and runs the emitted JS in the browser. Correctness: +checked against a numpy reference on the same input arrays. Speed: a fair, warmed, +best-of-N comparison against a hand-written JS kernel over the full 24-frame relax +sweep (a total, to clear the browser timer-resolution floor) — the ratio should sit near 1.00, +i.e. the transpiler reaches hand-written-JS speed. See plans/dsl-js.md.

+ + loading Pyodide… +

+
+
+
diff --git a/plans/dsl-js.md b/plans/dsl-js.md
new file mode 100644
index 000000000..034a44536
--- /dev/null
+++ b/plans/dsl-js.md
@@ -0,0 +1,168 @@
+# Plan: Transpile blosc2 DSL kernels to JavaScript (browser/Pyodide accel)
+
+## Context
+
+In `newton-js-vs-numpy-vs-nojit-vs-jit.html`, the same Newton-fractal kernel runs four
+ways in the browser. Measured: JS 272 ms, numpy 1302 ms, blosc2 no-JIT 3023 ms, blosc2
+JIT 887 ms. Hand-written JavaScript is **~3.3× faster than the blosc2 WASM JIT** and ~11×
+faster than the no-JIT interpreter, because V8 JIT-compiles a fused scalar loop to
+optimized native code while blosc2's WASM JIT (tcc/miniexpr) does not.
+
+The same blosc2 DSL kernels (`@blosc2.dsl_kernel`) are written in a strict, bounded subset
+of Python that is already parsed via the stdlib `ast` module. So we can **transpile a DSL
+kernel to JavaScript** and run that JS in the browser, capturing the V8 speed win — without
+the user rewriting the kernel. This is browser/Pyodide-only by nature.
+
+### Why this shape (decisions settled)
+
+- **Single-threaded, per-block, via the existing `lazyudf` callable seam.** `lazyudf(func,
+  inputs)` already accepts a plain Python callable `func(inputs_tuple, output, offset)` and
+  drives it per-block through `chunked_eval`/`slices_eval`. Plugging a JS bridge in there
+  needs **zero changes to compiled code** (no `.pyx` edits, no rebuild) and handles
+  multi-input kernels (Newton takes `a`, `b`) correctly.
+- **Not a postfilter.** A postfilter is single-input / same-itemsize / 1:1 — wrong shape for
+  an N-input compute kernel. (Postfilter is the hook only for a future *transparent fused
+  read*; different feature.)
+- **No Web Worker pool / SharedArrayBuffer in the MVP.** That's a parallel-runtime project
+  (tiling driver, COOP/COEP headers, Atomics join) mostly *outside* blosc2. Single-threaded
+  JS already beats the JIT 3.3×; parallelism is deferred until measured need. Generalizing
+  the per-block bridge to **per-chunk** is the natural next rung.
+- **Shipped as `src/blosc2/dsl_js.py`** behind `jit_backend="js"` (a new backend alongside
+  no-JIT and miniexpr-JIT). Wiring is one swap in `chunked_eval`; no compiled-code changes.
+  Started as a repo-root prototype, graduated once benches confirmed the ~2× win.
+
+## Feasibility summary
+
+- **Grammar is bounded and known.** `DSLValidator` in `src/blosc2/dsl_kernel.py:265-492`
+  enumerates exactly the supported nodes: assign/augassign, if/elif/else, `for ... in
+  range()`, while, break, continue, return; binops `+ - * / // % ** & | ^ << >>`,
+  single comparisons, bool ops, unary `+ - not`, calls to `range/where/int/float/bool` and
+  `np.* / numpy.* / math.*`, name/constant. Ternary, chained compares, tuple-unpack, input
+  reassignment are rejected. A transpiler maps this set ~1:1 to JS.
+- **Kernel source is available:** `DSLKernel.dsl_source` (`src/blosc2/dsl_kernel.py:495+`),
+  dedented and ready to `ast.parse`.
+- **Scalar semantics:** the kernel reads like per-element scalar code (per-pixel `for`/`break`),
+  exactly like the hand-written `newtonJS` in the demo. So transliterate the DSL function to a
+  JS function with the **same signature**, then drive it element-by-element over each block.
+
+## Architecture
+
+```
+lazyudf(js_kernel(newton_dsl), (A_B2, B_B2, MAXITER, relax))[:]
+   -> chunked_eval / slices_eval  (existing, unchanged)
+       -> per block: bridge(inputs_tuple, output, offset)
+            marshal block numpy arrays -> JS typed arrays
+            run transpiled JS element loop
+            copy result back into `output`
+```
+
+`js_kernel(dsl_kernel)` returns the plain Python callable lazyudf expects. The transpiler runs
+in Python (pure stdlib `ast`), so it works inside Pyodide too.
+
+## Component 1 — transpiler (`dsl_to_js`)
+
+Walk the Python `ast` of `kernel.dsl_source` and emit a JS function with the same signature
+and body. Node mapping (mirror `DSLValidator`'s allowed set so we stay in lockstep):
+
+- **Assign**: first time a local name is seen → `let x = expr;`, later → `x = expr;` (seed the
+  "declared" set with the parameter names).
+- **AugAssign**: `+= -= *= /=` direct; expand `**= //= %=` to the binop form below.
+- **BinOp**: `+ - * /` direct; `**` → `Math.pow(a,b)`; `//` → `Math.floor(a/b)`;
+  `%` → `pymod(a,b)` helper `(((a%b)+b)%b)` (Python sign convention); `& | ^ << >>` → JS
+  bitwise (int32 coercion — fine for boolean masks, **ceiling:** real 64-bit int bitwise not
+  supported).
+- **BoolOp** `and`/`or` → `&&`/`||`. **UnaryOp** `+ - not` → `+ - !`.
+- **Compare** (single): `== != < <= > >=` → `=== !== < <= > >=`.
+- **Call**: `where(c,a,b)` → `(c ? a : b)`; `int(x)` → `Math.trunc(x)`; `float(x)` → `(x)`;
+  `bool(x)` → `((x)!=0)`; `np.*/numpy.*/math.*` → name table to `Math.*`
+  (`sin cos tan sqrt exp log abs floor ceil pow atan2 ...`); unknown name → raise.
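+- **For** `for k in range(a[,b[,c]])` → `for (let k=START; k<STOP; k+=STEP) { ... }`
+  (`k>STOP` when the step is a negative literal).
+
+## Component 2 — bridge (`js_kernel`)
+
+1. Transpile the kernel: `js_source, params = dsl_to_js(kernel)`.
+2. Wrap it in a small JS module — the kernel function plus an element-loop driver:
+   ```js
+   function __k(<params>) { <body> };
+   function __run(arrays, scalars, out, n) {
+     for (let i = 0; i < n; i++) out[i] = __k(/* per param: arrays[k][i] or scalars[k] */);
+   }
+   ```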
+3. In Pyodide, materialize it once: `import js; run = js.eval("(...)")` → JS function proxy.
+4. Return a callable `bridge(inputs_tuple, output, offset)` that, per block:
+   - splits inputs into array operands (→ `arr.to_js()` typed arrays) and scalars,
+   - calls `run(arrays, scalars, out_js, n)`,
+   - copies `out_js` back into `output`.
+
+`# ponytail: per-block to_js() copy; swap to a zero-copy HEAPF64 view onto WASM linear memory
+only if marshaling shows up as the bottleneck.`
+
+Outside Pyodide (no `js`), `js_kernel` still exposes `.js_source` for inspection/testing and
+raises if you try to *run* it.
+
+## Files (as shipped)
+
+- **`src/blosc2/dsl_js.py`** — `dsl_to_js()`, `build_js_module()`, `js_kernel()` bridge.
+- **`src/blosc2/lazyexpr.py`** — `_as_js_udf()` + the `jit_backend="js"` swap in `chunked_eval`.
+- **`tests/ndarray/test_dsl_js.py`** — transpiler + node-backed numeric-equivalence tests.
+- **`bench/js-transpiler/dsl-js-node.mjs`** — headless Pyodide-in-Node integration test + bench.
+- **`bench/js-transpiler/newton-dsl-js.html`** — browser demo (transpiled vs hand-written JS).
+- **`bench/js-transpiler/README.md`** — how to run both.
+
+## Verification
+
+1. **Transpiler tests** — `pytest tests/ndarray/test_dsl_js.py`: structure + index-symbol
+   rejection, and (when `node` is on PATH) run the emitted JS over a grid and assert it matches
+   the Python kernel to ~1e-9.
+2. **Headless wired path** — `node bench/js-transpiler/dsl-js-node.mjs`: overlays the local
+   pure-Python onto the PyPI wheel and drives the real `lazyudf(jit_backend="js")` path;
+   asserts `js`/JIT both match numpy exactly, then benches. Exits non-zero on mismatch.
+3. **Browser** — serve repo root, open `bench/js-transpiler/newton-dsl-js.html`, click Run.
+
+## Bench findings (verified)
+
+Measured on Apple M-series, blosc2 4.6.0 under Pyodide 314, Newton 320×213 / max_iter=48,
+24-frame `relax` sweep (`dsl-js-node.mjs`):
+
+| backend | ms/frame | vs `js` |
+|---|---|---|
+| `jit_backend="js"` | **~16** | — |
+| miniexpr JIT | ~31 | js **~2× faster** |
+| no-JIT interpreter | ~130 | js ~8× faster |
+
+Correctness exact: `js` and JIT both `maxdiff=0.00` vs numpy.
+
+**The PyProxy gotcha (the one real bug the headless harness caught).** The bridge must pass
+the per-call operands to the JS driver as real **JS `Array`s**, not Python lists. A Python
+list arrives in JS as a `PyProxy`, so every `ops[k][i]` in the hot inner loop crosses the
+Python↔JS boundary — still correct, but ~**10× slower** (140 vs 8 ms/frame for the direct
+bridge call). The browser demo never hit this because it built its arrays in JS. Fix:
+`Array.new()` + `.push(...)` in `js_kernel`'s bridge. With that, per-chunk marshaling is cheap
+(multi-chunk ≈ single-chunk), so the **per-chunk driver below is *not* needed** for speed.
+
+## Deferred (explicitly not built now)
+
+- **Whole-array / fewer-crossing driver** — per-chunk is already cheap after the PyProxy fix,
+  so this is only worth it if a future kernel shows marshaling-bound; the transpiler is unchanged.
+- **Web Worker pool + SharedArrayBuffer** — real multithreading; needs COOP/COEP and a tiling
+  driver. Build only if single-thread proves too slow for a real workload.
+- **Index/shape symbols** (`_i0`/`_n0`/`_flat_idx`) in the transpiler.
+- **Postfilter-based transparent fused read** — different feature (single-input), single-threaded.
+
+## Known ceilings / limitations
+
+- Browser/Pyodide-only.
+- 64-bit integer bitwise ops degrade to int32 (JS).
+- `%` follows Python sign convention via helper; large-magnitude float edge cases may differ.
+- `range()` assumes a positive step unless the step is a negative literal (or a unary-minus expression).
+- float64/float32 numeric kernels are the target; exotic dtypes untested.
diff --git a/src/blosc2/dsl_js.py b/src/blosc2/dsl_js.py
new file mode 100644
index 000000000..4446cf092
--- /dev/null
+++ b/src/blosc2/dsl_js.py
@@ -0,0 +1,372 @@
+"""Transpile a blosc2 DSL kernel to JavaScript, and run it from a lazyudf callable.
+
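+Browser/Pyodide-only payoff: V8 JIT-compiles the emitted scalar loop to optimized native
+code. In the Newton-fractal bench the wired backend runs ~2x faster than blosc2's miniexpr
+WASM JIT and ~8x faster than the no-JIT interpreter (hand-written JS measured ~3.3x / ~11x
+in an earlier browser demo). See plans/dsl-js.md.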
+
+Public API:
+    dsl_to_js(kernel)  -> (js_source, param_names)   # pure stdlib, runs anywhere
+    js_kernel(kernel)  -> callable for lazyudf(...)   # needs Pyodide `js` to *run*
+
+`kernel` may be a blosc2 DSLKernel (has .dsl_source), a plain function, or a source string.
+"""
+
+from __future__ import annotations
+
+import ast
+import inspect
+import json
+import textwrap
+
+# Wired into lazyexpr via jit_backend="js": a DSL kernel is transpiled here and run as a
+# plain per-block callable. Browser/Pyodide only (js_kernel imports `js` at call time).
+
+# DSL index/shape symbols; _Transpiler._reject_index_symbols refuses kernels that use them.
+_INDEX_SYMBOLS = {"_i0", "_i1", "_i2", "_n0", "_n1", "_n2", "_flat_idx"}
+
+# numpy/math function name -> JS Math.* name (numpy aliases included).
+# NOTE(review): JS Math.round rounds exact halves toward +Infinity while numpy rounds
+# half to even, so "round" may differ on .5 inputs -- confirm whether that matters.
+_MATH = {
+    "sin": "sin",
+    "cos": "cos",
+    "tan": "tan",
+    "asin": "asin",
+    "acos": "acos",
+    "atan": "atan",
+    "atan2": "atan2",
+    "arcsin": "asin",
+    "arccos": "acos",
+    "arctan": "atan",
+    "arctan2": "atan2",
+    "sinh": "sinh",
+    "cosh": "cosh",
+    "tanh": "tanh",
+    "exp": "exp",
+    "log": "log",
+    "log2": "log2",
+    "log10": "log10",
+    "sqrt": "sqrt",
+    "cbrt": "cbrt",
+    "pow": "pow",
+    "power": "pow",
+    "hypot": "hypot",
+    "floor": "floor",
+    "ceil": "ceil",
+    "trunc": "trunc",
+    "round": "round",
+    "abs": "abs",
+    "absolute": "abs",
+    "fabs": "abs",
+    "sign": "sign",
+    "min": "min",
+    "max": "max",
+    "minimum": "min",
+    "maximum": "max",
+}
+
+# Binary operators emitted as the same-meaning JS infix operator. `**`, `//` and `%` are
+# absent on purpose: _Transpiler._binop expands them to Math.pow / Math.floor / pymod.
+_BIN = {
+    ast.Add: "+",
+    ast.Sub: "-",
+    ast.Mult: "*",
+    ast.Div: "/",
+    ast.BitAnd: "&",
+    ast.BitOr: "|",
+    ast.BitXor: "^",
+    ast.LShift: "<<",
+    ast.RShift: ">>",
+}
+# Augmented-assignment counterparts of _BIN; `**=`, `//=` and `%=` are expanded by
+# _Transpiler._augassign instead.
+_AUG = {
+    ast.Add: "+=",
+    ast.Sub: "-=",
+    ast.Mult: "*=",
+    ast.Div: "/=",
+    ast.BitAnd: "&=",
+    ast.BitOr: "|=",
+    ast.BitXor: "^=",
+    ast.LShift: "<<=",
+    ast.RShift: ">>=",
+}
+# Comparison operators; equality maps to the strict (non-coercing) JS forms.
+_CMP = {
+    ast.Eq: "===",
+    ast.NotEq: "!==",
+    ast.Lt: "<",
+    ast.LtE: "<=",
+    ast.Gt: ">",
+    ast.GtE: ">=",
+}
+
+# Helper placed ahead of the kernel by build_js_module: a modulo whose result takes the
+# sign of the divisor (Python's convention), unlike the JS `%` operator.
+JS_PRELUDE = "const pymod = (a, b) => (((a % b) + b) % b);"
+
+
+class _DSLToJSError(Exception):
+    pass
+
+
+def _get_source(obj) -> str:
+    if hasattr(obj, "dsl_source"):
+        src = obj.dsl_source
+    elif isinstance(obj, str):
+        src = obj
+    elif callable(obj):
+        src = inspect.getsource(obj)
+    else:
+        raise _DSLToJSError(f"cannot get DSL source from {obj!r}")
+    return textwrap.dedent(src)
+
+
+class _Transpiler:
+    def transpile(self, func: ast.FunctionDef):
+        self.params = [a.arg for a in func.args.args]
+        self._reject_index_symbols(func)
+        hoist = self._hoist_names(func)
+        body = self._block(func.body, 1)
+        head = f"function {func.name}({', '.join(self.params)}) {{\n"
+        decl = f"  let {', '.join(sorted(hoist))};\n" if hoist else ""
+        return head + decl + body + "}", list(self.params)
+
+    # -- scope analysis -------------------------------------------------
+    def _reject_index_symbols(self, func):
+        for node in ast.walk(func):
+            if isinstance(node, ast.Name) and node.id in _INDEX_SYMBOLS:
+                raise _DSLToJSError(
+                    f"index/shape symbol '{node.id}' is not supported yet (MVP); "
+                    "see plans/dsl-js.md 'Deferred'"
+                )
+
+    def _hoist_names(self, func):
+        assigned, fortargets = set(), set()
+        for node in ast.walk(func):
+            if isinstance(node, ast.Assign):
+                for t in node.targets:
+                    if isinstance(t, ast.Name):
+                        assigned.add(t.id)
+            elif isinstance(node, ast.AugAssign) and isinstance(node.target, ast.Name):
+                assigned.add(node.target.id)
+            elif isinstance(node, ast.For) and isinstance(node.target, ast.Name):
+                fortargets.add(node.target.id)
+        return assigned - set(self.params) - fortargets
+
+    # -- statements -----------------------------------------------------
+    def _block(self, stmts, ind):
+        return "".join(self._stmt(s, ind) for s in stmts)
+
+    def _stmt(self, node, ind):
+        pad = "  " * ind
+        if isinstance(node, ast.Assign):
+            return f"{pad}{node.targets[0].id} = {self._expr(node.value)};\n"
+        if isinstance(node, ast.AugAssign):
+            return pad + self._augassign(node) + "\n"
+        if isinstance(node, ast.Return):
+            return f"{pad}return {self._expr(node.value)};\n"
+        if isinstance(node, ast.Expr):
+            return f"{pad}{self._expr(node.value)};\n"
+        if isinstance(node, ast.If):
+            return self._if(node, ind)
+        if isinstance(node, ast.For):
+            return self._for(node, ind)
+        if isinstance(node, ast.While):
+            return f"{pad}while ({self._expr(node.test)}) {{\n{self._block(node.body, ind + 1)}{pad}}}\n"
+        if isinstance(node, ast.Break):
+            return f"{pad}break;\n"
+        if isinstance(node, ast.Continue):
+            return f"{pad}continue;\n"
+        raise _DSLToJSError(f"unsupported statement: {type(node).__name__}")
+
+    def _augassign(self, node):
+        t, val, op = node.target.id, self._expr(node.value), type(node.op)
+        if op in _AUG:
+            return f"{t} {_AUG[op]} {val};"
+        if op is ast.Pow:
+            return f"{t} = Math.pow({t}, {val});"
+        if op is ast.FloorDiv:
+            return f"{t} = Math.floor({t} / {val});"
+        if op is ast.Mod:
+            return f"{t} = pymod({t}, {val});"
+        raise _DSLToJSError(f"unsupported augmented op: {op.__name__}")
+
+    def _if(self, node, ind):
+        pad = "  " * ind
+        s = f"{pad}if ({self._expr(node.test)}) {{\n{self._block(node.body, ind + 1)}{pad}}}"
+        if node.orelse:
+            if len(node.orelse) == 1 and isinstance(node.orelse[0], ast.If):
+                s += " else " + self._if(node.orelse[0], ind).lstrip()
+            else:
+                s += f" else {{\n{self._block(node.orelse, ind + 1)}{pad}}}\n"
+                return s
+        return s + "\n"
+
+    def _for(self, node, ind):
+        pad = "  " * ind
+        var = node.target.id
+        args = node.iter.args
+        if len(args) == 1:
+            start, stop, step, stepnode = "0", self._expr(args[0]), "1", None
+        elif len(args) == 2:
+            start, stop, step, stepnode = self._expr(args[0]), self._expr(args[1]), "1", None
+        else:
+            start, stop, step, stepnode = (
+                self._expr(args[0]),
+                self._expr(args[1]),
+                self._expr(args[2]),
+                args[2],
+            )
+        cond = f"{var} > {stop}" if _neg_literal(stepnode) else f"{var} < {stop}"
+        return (
+            f"{pad}for (let {var} = {start}; {cond}; {var} += {step}) {{\n"
+            f"{self._block(node.body, ind + 1)}{pad}}}\n"
+        )
+
+    # -- expressions ----------------------------------------------------
+    def _expr(self, node):
+        if isinstance(node, ast.Name):
+            return node.id
+        if isinstance(node, ast.Constant):
+            return _const(node.value)
+        if isinstance(node, ast.UnaryOp):
+            sym = {ast.Not: "!", ast.USub: "-", ast.UAdd: "+"}[type(node.op)]
+            return f"({sym}{self._expr(node.operand)})"
+        if isinstance(node, ast.BinOp):
+            return self._binop(node)
+        if isinstance(node, ast.BoolOp):
+            sym = "&&" if isinstance(node.op, ast.And) else "||"
+            return "(" + f" {sym} ".join(self._expr(v) for v in node.values) + ")"
+        if isinstance(node, ast.Compare):
+            op = _CMP[type(node.ops[0])]
+            return f"({self._expr(node.left)} {op} {self._expr(node.comparators[0])})"
+        if isinstance(node, ast.Call):
+            return self._call(node)
+        raise _DSLToJSError(f"unsupported expression: {type(node).__name__}")
+
+    def _binop(self, node):
+        left, right, op = self._expr(node.left), self._expr(node.right), type(node.op)
+        if op is ast.Pow:
+            return f"Math.pow({left}, {right})"
+        if op is ast.FloorDiv:
+            return f"Math.floor({left} / {right})"
+        if op is ast.Mod:
+            return f"pymod({left}, {right})"
+        if op in _BIN:
+            return f"({left} {_BIN[op]} {right})"
+        raise _DSLToJSError(f"unsupported binary op: {op.__name__}")
+
+    def _call(self, node):
+        name = self._call_name(node.func)
+        args = [self._expr(a) for a in node.args]
+        if name == "where":
+            if len(args) != 3:
+                raise _DSLToJSError("where() needs 3 args: where(cond, a, b)")
+            return f"({args[0]} ? {args[1]} : {args[2]})"
+        if name == "int":
+            return f"Math.trunc({args[0]})"
+        if name == "float":
+            return f"({args[0]})"
+        if name == "bool":
+            return f"(({args[0]}) != 0)"
+        if name == "range":
+            raise _DSLToJSError("range() is only valid as a for-loop iterator")
+        if name in _MATH:
+            return f"Math.{_MATH[name]}({', '.join(args)})"
+        raise _DSLToJSError(f"unsupported call: {name}()")
+
+    def _call_name(self, node):
+        if isinstance(node, ast.Name):
+            return node.id
+        if (
+            isinstance(node, ast.Attribute)
+            and isinstance(node.value, ast.Name)
+            and node.value.id in {"np", "numpy", "math"}
+        ):
+            return node.attr
+        raise _DSLToJSError("unsupported call target")
+
+
+def _neg_literal(node) -> bool:
+    if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
+        return node.value < 0
+    return isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.USub)
+
+
+def _const(v) -> str:
+    if isinstance(v, bool):
+        return "true" if v else "false"
+    if isinstance(v, int):
+        return str(v)
+    if isinstance(v, float):
+        if v != v:
+            return "NaN"
+        if v == float("inf"):
+            return "Infinity"
+        if v == float("-inf"):
+            return "-Infinity"
+        return repr(v)
+    if isinstance(v, str):
+        return json.dumps(v)
+    raise _DSLToJSError(f"unsupported constant: {v!r}")
+
+
+def dsl_to_js(kernel):
+    """Transpile a DSL kernel to a JS function string. Returns (js_source, param_names)."""
+    tree = ast.parse(_get_source(kernel))
+    func = next((n for n in tree.body if isinstance(n, ast.FunctionDef)), None)
+    if func is None:
+        raise _DSLToJSError("no function definition found in DSL source")
+    return _Transpiler().transpile(func)
+
+
+def build_js_module(kernel) -> str:
+    """Self-contained JS: prelude + kernel + an `__run(ops, isarr, out, n)` element driver."""
+    kernel_js, params = dsl_to_js(kernel)
+    fname = ast.parse(_get_source(kernel)).body[0].name
+    call_args = ", ".join(f"(isarr[{k}] ? ops[{k}][i] : ops[{k}])" for k in range(len(params)))
+    driver = (
+        f"function __run(ops, isarr, out, n) {{ "
+        f"for (let i = 0; i < n; i++) out[i] = {fname}({call_args}); }}"
+    )
+    return f"{JS_PRELUDE}\n{kernel_js}\n{driver}\nreturn __run;"
+
+
+def js_kernel(kernel):
+    """Return a lazyudf-compatible callable that runs the transpiled JS (Pyodide only)."""
+    module = build_js_module(kernel)
+    run = None  # lazily created in-browser
+
+    def bridge(inputs, output, offset=None):
+        nonlocal run
+        import numpy as np
+        from js import Array, Float64Array, Uint8Array  # Pyodide
+
+        if run is None:
+            import js
+
+            run = js.eval(f"(function() {{ {module} }})()")
+
+        n = int(output.size)
+        # Pass real JS Arrays, not Python lists: a Python list arrives in JS as a PyProxy,
+        # so each ops[k][i] in the hot loop would cross the Python<->JS boundary (~10x slower).
+        ops = Array.new()
+        isarr = Array.new()
+        for x in inputs:
+            if isinstance(x, np.ndarray) and x.ndim > 0:
+                ops.push(
+                    _to_jsf64(
+                        np.ascontiguousarray(x, dtype=np.float64).reshape(-1), Float64Array, Uint8Array
+                    )
+                )
+                isarr.push(True)
+            else:
+                ops.push(float(x))
+                isarr.push(False)
+        out_js = Float64Array.new(n)
+        run(ops, isarr, out_js, n)
+        # ponytail: per-block to_js()/to_bytes() copies; swap to a zero-copy HEAPF64 view
+        # onto WASM linear memory only if marshaling shows up as the bottleneck.
+        res = np.frombuffer(bytes(out_js.to_bytes()), dtype=np.float64)
+        output.reshape(-1)[:] = res
+        return output
+
+    bridge.js_source = module
+    return bridge
+
+
+def _to_jsf64(xf, Float64Array, Uint8Array):
+    u8 = Uint8Array.new(xf.nbytes)
+    u8.assign(xf.tobytes())  # Pyodide TypedArray.assign(buffer) copies bytes in
+    return Float64Array.new(u8.buffer)
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 5653beff6..821894f41 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1432,6 +1432,28 @@ def _is_dsl_kernel_expression(expression) -> bool:
     return isinstance(expression, DSLKernel) and expression.dsl_source is not None
 
 
+def _as_js_udf(expression):
+    """For jit_backend="js": transpile a DSL kernel to JS and return a plain per-block
+    callable (so the normal UDF path runs it). Browser/Pyodide only."""
+    if not _is_dsl_kernel_expression(expression):
+        raise ValueError('jit_backend="js" requires a blosc2.dsl_kernel-decorated kernel')
+    if not blosc2.IS_WASM:
+        raise RuntimeError('jit_backend="js" is only available under WebAssembly/Pyodide')
+    from .dsl_js import js_kernel
+
+    return js_kernel(expression)
+
+
+def _maybe_js_backend(expression, jit, jit_backend, reduce_args):
+    """For jit_backend="js", swap the DSL kernel for its JS bridge (a plain per-block
+    callable) and disable miniexpr/backend-pragma. Otherwise a no-op passthrough."""
+    if jit_backend != "js":
+        return expression, jit, jit_backend
+    if reduce_args:
+        raise ValueError('jit_backend="js" does not support reductions')
+    return _as_js_udf(expression), None, None
+
+
 def _format_dsl_parse_error_hint(expr_text: str, backend_msg: str):
     marker = "parse_error_pos="
     pos0 = backend_msg.find(marker)
@@ -2961,6 +2983,9 @@ def chunked_eval(
             operands = {**operands, **where}
 
         reduce_args = kwargs.pop("_reduce_args", {})
+        # jit_backend="js": swap the DSL kernel for its JS bridge (a plain per-block callable).
+        expression, jit, jit_backend = _maybe_js_backend(expression, jit, jit_backend, reduce_args)
+
         fast_path = _validate_chunked_eval_inputs(operands, out, shape, reduce_args)
 
         # Activate last read cache for NDField instances
diff --git a/tests/ndarray/test_dsl_js.py b/tests/ndarray/test_dsl_js.py
new file mode 100644
index 000000000..162181dca
--- /dev/null
+++ b/tests/ndarray/test_dsl_js.py
@@ -0,0 +1,142 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team 
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+"""Tests for the DSL -> JavaScript transpiler (blosc2.dsl_js).
+
+The transpiler itself is pure stdlib and runs anywhere. Where `node` is on PATH we also
+run the emitted JS and check it matches the Python kernel semantics element-by-element.
+"""
+
+import json
+import shutil
+import subprocess
+
+import pytest
+
+from blosc2.dsl_js import build_js_module, dsl_to_js
+
+
+# Same Newton kernel as the demo (scalar semantics), as a plain function: relaxed Newton on z**3 - 1.
+def newton_dsl(a, b, max_iter, relax):
+    za = a  # z = za + i*zb starts at the input point (a, b)
+    zb = b
+    mif = float(max_iter)
+    it = mif  # iteration count reported when the loop never converges
+    for k in range(max_iter):
+        a2 = za * za
+        b2 = zb * zb
+        fr = za * a2 - 3.0 * za * b2 - 1.0  # Re(z**3 - 1)
+        fi = 3.0 * a2 * zb - zb * b2  # Im(z**3 - 1)
+        dr = 3.0 * (a2 - b2)  # Re(3 * z**2), the derivative
+        di = 6.0 * za * zb  # Im(3 * z**2)
+        den = dr * dr + di * di + 0.000000000001  # |f'(z)|**2 plus a tiny epsilon (no division by zero)
+        qr = relax * (fr * dr + fi * di) / den  # Re(relax * f / f')
+        qi = relax * (fi * dr - fr * di) / den  # Im(relax * f / f')
+        za = za - qr
+        zb = zb - qi
+        if qr * qr + qi * qi < 0.000001:  # squared step length is tiny: record k and stop early
+            it = float(k)
+            break
+    d0 = (za - 1.0) * (za - 1.0) + zb * zb  # squared distance to the root 1
+    d1 = (za + 0.5) * (za + 0.5) + (zb - 0.8660254) * (zb - 0.8660254)  # ... to -0.5 + 0.8660254i
+    d2 = (za + 0.5) * (za + 0.5) + (zb + 0.8660254) * (zb + 0.8660254)  # ... to -0.5 - 0.8660254i
+    root = 0.0
+    md = d0  # smallest squared distance seen so far
+    if d1 < md:
+        md = d1
+        root = 1.0
+    if d2 < md:
+        root = 2.0
+    return root + 0.9 * (it / mif)  # nearest-root index plus an iteration shade in [0, 0.9]
+
+
+# Exercises where(), int(), //, %, ** and an elif chain.
+def misc_dsl(x, y):
+    q = int(x) // 3  # int() conversion followed by floor division
+    r = x % 7.0  # float modulo
+    s = where(r < 2.0, x**2.0, y**0.5)  # noqa: F821
+    out = q + r + s
+    if out > 10.0:  # three-way branch: the if / elif / else chain under test
+        out = out - 10.0
+    elif out > 5.0:
+        out = out - 5.0
+    else:
+        out = out + 1.0
+    return out
+
+
+def _run_node(module, pts, scalars):
+    """Run the emitted JS over `pts` (list of input rows) and return the output list."""
+    node = shutil.which("node")
+    if not node:
+        pytest.skip("node not found; skipping JS numeric-equivalence check")
+    ncols = len(pts[0])
+    cols = "".join(f"const c{j} = Float64Array.from(pts.map(p => p[{j}]));\n" for j in range(ncols))
+    ops = ", ".join([f"c{j}" for j in range(ncols)] + [json.dumps(s) for s in scalars])  # JS literals
+    isarr = ", ".join(["true"] * ncols + ["false"] * len(scalars))
+    prog = f"""
+const __run = (function() {{ {module} }})();
+const pts = {json.dumps(pts)};
+const out = new Float64Array(pts.length);
+{cols}__run([{ops}], [{isarr}], out, pts.length);
+console.log(JSON.stringify(Array.from(out)));
+"""
+    res = subprocess.run([node, "-e", prog], capture_output=True, text=True, timeout=120)
+    if res.returncode != 0:
+        raise AssertionError(f"node failed:\n{res.stderr}")
+    return json.loads(res.stdout)
+
+
+def test_transpile_structure():
+    js_src, params = dsl_to_js(newton_dsl)
+    assert params == ["a", "b", "max_iter", "relax"]
+    assert "function newton_dsl(a, b, max_iter, relax)" in js_src
+    assert "for (let k = 0; k < max_iter" in js_src
+    assert "Math.pow" not in js_src  # newton uses no ** -> no Math.pow expected
+    assert "break;" in js_src
+    # misc_dsl exercises the constructs newton_dsl lacks.
+    misc_js, _ = dsl_to_js(misc_dsl)
+    assert "Math.pow" in misc_js  # **
+    assert "Math.floor" in misc_js  # //
+    assert "pymod(" in misc_js  # %
+    assert "? " in misc_js  # where()
+    assert "} else if " in misc_js
+    # Cheap well-formedness check on every emitted source.
+    for src in (js_src, misc_js, build_js_module(newton_dsl)):
+        assert src.count("{") == src.count("}"), "unbalanced braces"
+
+
+def test_index_symbol_rejected():
+    def uses_index(a):  # references _i0, which dsl_to_js is expected to reject
+        return a + _i0  # noqa: F821
+
+    with pytest.raises(Exception, match="index/shape symbol"):
+        dsl_to_js(uses_index)
+
+
+def test_newton_matches_python():
+    w, h, max_iter, relax = 40, 30, 48, 1.37  # 40x30 grid of sample points
+    pts = [[-1.7 + 3.4 * c / (w - 1), -1.1 + 2.2 * r / (h - 1)] for r in range(h) for c in range(w)]
+    py_vals = [newton_dsl(a, b, max_iter, relax) for a, b in pts]  # kernel run as plain Python
+    js_vals = _run_node(build_js_module(newton_dsl), pts, [max_iter, relax])
+    maxdiff = max(abs(p - j) for p, j in zip(py_vals, js_vals, strict=True))
+    assert maxdiff < 1e-9, f"newton py-vs-js mismatch: maxdiff={maxdiff}"
+
+
+def test_misc_matches_python():
+    pts = [[3.5, 16.0], [1.2, 9.0], [-4.3, 25.0], [8.0, 4.0], [0.0, 100.0]]
+    ref = []  # plain-Python reference mirroring misc_dsl statement by statement
+    for x, y in pts:
+        q = int(x) // 3
+        r = x % 7.0
+        s = (x**2.0) if (r < 2.0) else (y**0.5)  # where() spelled as a conditional expression
+        o = q + r + s
+        o = o - 10.0 if o > 10.0 else (o - 5.0 if o > 5.0 else o + 1.0)  # the if / elif / else chain
+        ref.append(o)
+    js_vals = _run_node(build_js_module(misc_dsl), pts, [])
+    mdiff = max(abs(p - j) for p, j in zip(ref, js_vals, strict=True))
+    assert mdiff < 1e-9, f"misc py-vs-js mismatch: maxdiff={mdiff}"

From b7084ca8638be2b56f40cd864c5c27638a387b1f Mon Sep 17 00:00:00 2001
From: Francesc Alted 
Date: Sun, 28 Jun 2026 13:02:24 +0200
Subject: [PATCH 02/13] Experiment with a new worker pool bench

---
 bench/js-transpiler/README.md             |  34 ++++++
 bench/js-transpiler/worker-pool-bench.mjs | 126 ++++++++++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 bench/js-transpiler/worker-pool-bench.mjs

diff --git a/bench/js-transpiler/README.md b/bench/js-transpiler/README.md
index 541868ee6..57016c81b 100644
--- a/bench/js-transpiler/README.md
+++ b/bench/js-transpiler/README.md
@@ -50,3 +50,37 @@ python3 -m http.server          # from the repo root
 
 Serve from the repo root (not `file://`): the page fetches `/src/blosc2/dsl_js.py` (the
 local transpiler, newer than the PyPI wheel) at a server-root-absolute path.
+
+## Multithreading ceiling — `worker-pool-bench.mjs`
+
+Throwaway exploration of *how fast the transpiled kernel could go* with real JS
+multithreading: pure Node (`worker_threads` + `SharedArrayBuffer`), **no Pyodide, no
+blosc2**. Same Newton kernel, partitioned across a persistent worker pool with an Atomics
+barrier; reports speedup vs single-thread for 1/2/4/N workers.
+
+```sh
+node bench/js-transpiler/worker-pool-bench.mjs
+```
+
+Findings (Apple M2, 4 performance + 4 efficiency cores; laptop numbers vary ±10–15% with
+thermal/P-vs-E scheduling, so treat these as representative, not exact):
+
+| workers | ms/frame | speedup |
+|---|---|---|
+| single-thread | ~11.3 | 1.0× |
+| ×2 | ~5.9 | ~1.9× (~94% eff) |
+| ×4 | ~3.1 | ~3.5× (~88% eff) |
+| ×8 | ~2.3 | ~4.8× (~60% eff — E-cores) |
+
+- The worker mechanism is ~free (×1 ≈ 1.0×); scaling is near-linear up to the performance
+  core count. The ×8 drop-off is the M2's efficiency cores, not overhead.
+- **Load balancing is essential.** Contiguous row-bands regress badly (×4 fell to 1.48×)
+  because the per-pixel early-`break` makes some bands all-max-iter and others trivial.
+  **Striped** rows (worker `i` → rows `i, i+nw, …`) fix it — that's what the bench uses.
+
+Why this stays a *headroom* result, not a shipped feature: it measures **pure compute**.
+The real `jit_backend="js"` path also pays ~8 ms/frame of blosc2 decompress/compress that
+does not parallelize this way, plus Pyodide orchestration is single-threaded — so realistic
+end-to-end gain is a fraction of 5×. And a browser integration needs pure-JS workers (not the
+Pyodide bridge), a SharedArrayBuffer, COOP/COEP cross-origin isolation, and a path to get
+decompressed chunks into shared memory. See "Deferred" in [`plans/dsl-js.md`](../../plans/dsl-js.md).
diff --git a/bench/js-transpiler/worker-pool-bench.mjs b/bench/js-transpiler/worker-pool-bench.mjs
new file mode 100644
index 000000000..a9e928d7b
--- /dev/null
+++ b/bench/js-transpiler/worker-pool-bench.mjs
@@ -0,0 +1,126 @@
+// Throwaway: how fast can the transpiled Newton kernel go with real JS multithreading?
+// Pure Node (worker_threads + SharedArrayBuffer), no Pyodide, no blosc2 -- isolates the
+// compute ceiling and per-dispatch overhead a Web Worker pool would hit in a browser.
+// The kernel is the same scalar loop dsl_js emits; hand-written here to avoid Pyodide.
+//
+//   node bench/js-transpiler/worker-pool-bench.mjs
+import { Worker, isMainThread, workerData } from "node:worker_threads";
+import os from "node:os";
+import { performance } from "node:perf_hooks";
+
+const WIDTH = 320, HEIGHT = 213, MAXITER = 48, NFRAMES = 24, SPANX = 3.4;
+const ASPECT = HEIGHT / WIDTH;
+const N = WIDTH * HEIGHT;
+
+// Striped rows (rowStart, rowStep): worker i does rows i, i+nw, ... so the per-pixel
+// early-exit work spreads evenly across workers instead of clumping in contiguous bands.
+function newtonBand(A, B, OUT, rowStart, rowStep, H, W, maxIter, relax) {
+  for (let row = rowStart; row < H; row += rowStep) {
+    for (let col = 0; col < W; col++) {
+      const i = row * W + col;             // flat row-major index of this pixel
+      let za = A[i], zb = B[i], it = maxIter;   // z = za + i*zb; `it` stays maxIter without convergence
+      for (let k = 0; k < maxIter; k++) {
+        const a2 = za * za, b2 = zb * zb;
+        const fr = za * a2 - 3 * za * b2 - 1, fi = 3 * a2 * zb - zb * b2;   // f(z) = z^3 - 1
+        const dr = 3 * (a2 - b2), di = 6 * za * zb, den = dr * dr + di * di + 1e-12;   // f'(z) = 3z^2
+        const qr = relax * (fr * dr + fi * di) / den, qi = relax * (fi * dr - fr * di) / den;
+        za -= qr; zb -= qi;                // relaxed Newton step: z -= relax * f/f'
+        if (qr * qr + qi * qi < 1e-6) { it = k; break; }   // step is tiny: converged at iteration k
+      }
+      const d0 = (za - 1) * (za - 1) + zb * zb;   // squared distances to the three cube roots of 1
+      const d1 = (za + 0.5) * (za + 0.5) + (zb - 0.8660254) * (zb - 0.8660254);
+      const d2 = (za + 0.5) * (za + 0.5) + (zb + 0.8660254) * (zb + 0.8660254);
+      let root = 0, md = d0;
+      if (d1 < md) { md = d1; root = 1; }
+      if (d2 < md) { root = 2; }
+      OUT[i] = root + 0.9 * (it / maxIter);   // nearest-root index plus an iteration shade
+    }
+  }
+}
+
+// ctrl: Int32[ gen, done ].  params: Float64[ relax, maxIter ].
+if (!isMainThread) {
+  const { ctrlSab, paramsSab, aSab, bSab, outSab, rowStart, rowStep, W } = workerData;
+  const ctrl = new Int32Array(ctrlSab), params = new Float64Array(paramsSab);
+  const A = new Float64Array(aSab), B = new Float64Array(bSab), OUT = new Float64Array(outSab);
+  let gen = 0;                           // last generation value this worker has observed
+  for (;;) {
+    Atomics.wait(ctrl, 0, gen);          // block until main bumps the generation
+    gen = Atomics.load(ctrl, 0);         // remember it so the next wait blocks again
+    if (gen < 0) break;                  // shutdown
+    newtonBand(A, B, OUT, rowStart, rowStep, HEIGHT, W, params[1] | 0, params[0]);
+    Atomics.add(ctrl, 1, 1);             // signal this band done
+    Atomics.notify(ctrl, 1);             // wake main, which waits on the done counter
+  }
+} else {
+  main();                                // function declaration below is hoisted
+}
+
+function buildGrid() {
+  const aSab = new SharedArrayBuffer(N * 8), bSab = new SharedArrayBuffer(N * 8),   // 8 bytes per float64
+        outSab = new SharedArrayBuffer(N * 8);
+  const A = new Float64Array(aSab), B = new Float64Array(bSab);
+  const x0 = -SPANX / 2, dx = SPANX / (WIDTH - 1);   // x spans [-SPANX/2, SPANX/2] over the columns
+  const y0 = -SPANX * ASPECT / 2, dy = SPANX * ASPECT / (HEIGHT - 1);   // y span scaled by the aspect ratio
+  for (let r = 0; r < HEIGHT; r++)
+    for (let c = 0; c < WIDTH; c++) { A[r * WIDTH + c] = x0 + dx * c; B[r * WIDTH + c] = y0 + dy * r; }
+  return { aSab, bSab, outSab };   // the buffers (not the views) are handed to workers
+}
+
+function timeBest(fn, runs) {   // best-of-`runs` wall time of fn() in milliseconds
+  for (let w = 0; w < 2; w++) fn();      // warm V8 / workers
+  let best = Infinity;                   // fastest run seen so far
+  for (let r = 0; r < runs; r++) { const t = performance.now(); fn(); best = Math.min(best, performance.now() - t); }
+  return best;
+}
+
+async function benchPool(nw, sabs, relaxes) {
+  const ctrlSab = new SharedArrayBuffer(8), paramsSab = new SharedArrayBuffer(16);   // 2 x int32, 2 x float64
+  const ctrl = new Int32Array(ctrlSab), params = new Float64Array(paramsSab);
+  params[1] = MAXITER;                   // params = [relax, maxIter]; relax is set per frame
+  const workers = [];
+  for (let i = 0; i < nw; i++) {
+    workers.push(new Worker(new URL(import.meta.url), {   // each worker loads this same file
+      workerData: { ...sabs, ctrlSab, paramsSab, rowStart: i, rowStep: nw, W: WIDTH },
+    }));
+  }
+  const frame = (relax) => {
+    Atomics.store(ctrl, 1, 0);           // reset the done counter
+    params[0] = relax;
+    Atomics.add(ctrl, 0, 1);             // bump the generation the workers wait on
+    Atomics.notify(ctrl, 0, nw);         // wake up to nw workers blocked on ctrl[0]
+    let d;                               // barrier: wait until all bands reported done
+    while ((d = Atomics.load(ctrl, 1)) < nw) Atomics.wait(ctrl, 1, d);
+  };
+  const sweep = () => { for (const rx of relaxes) frame(rx); };
+  const best = timeBest(sweep, 5);
+  Atomics.store(ctrl, 0, -1); Atomics.notify(ctrl, 0, nw);   // shutdown
+  await Promise.all(workers.map((w) => w.terminate()));
+  return best;                           // best sweep time in ms, as returned by timeBest
+}
+
+async function main() {
+  // Time a single-thread baseline, then each pool size, checking pool output against it.
+  const cores = os.cpus().length;
+  const sabs = buildGrid();
+  const OUT = new Float64Array(sabs.outSab);
+  const relaxes = Array.from({ length: NFRAMES }, (_, i) => 1.0 + (1.85 - 1.0) * i / (NFRAMES - 1));
+
+  // Single-thread baseline on the main thread (no worker overhead at all).
+  const A = new Float64Array(sabs.aSab), B = new Float64Array(sabs.bSab);
+  const tSingle = timeBest(() => { for (const rx of relaxes) newtonBand(A, B, OUT, 0, 1, HEIGHT, WIDTH, MAXITER, rx); }, 5);
+  const ref = Float64Array.from(OUT);    // last frame (relax=1.85), for correctness check
+  console.log(`Newton ${WIDTH}x${HEIGHT}, max_iter=${MAXITER}, ${NFRAMES}-frame sweep | cores=${cores}`);
+  const per = (ms) => `${ms.toFixed(0)} ms total (${(ms / NFRAMES).toFixed(2)} ms/frame)`;
+  console.log(`\nsingle-thread (main): ${per(tSingle)}`);
+
+  const counts = [...new Set([1, 2, 4, cores])].filter((n) => n >= 1 && n <= cores * 2).sort((a, b) => a - b);
+  for (const nw of counts) {
+    OUT.fill(NaN);                       // poison: stale results from the previous run must not pass
+    const t = await benchPool(nw, sabs, relaxes);
+    let maxdiff = 0;                     // Math.max propagates NaN, so an unwritten pixel prints NaN
+    for (let i = 0; i < N; i++) maxdiff = Math.max(maxdiff, Math.abs(OUT[i] - ref[i]));
+    const sp = tSingle / t;
+    console.log(`pool x${String(nw).padStart(2)} : ${per(t)}  | speedup ${sp.toFixed(2)}x` +
+                `  eff ${(100 * sp / nw).toFixed(0)}%  | maxdiff ${maxdiff.toExponential(1)}`);
+  }
+}

From 964de907aae5783b22992dfbfb64d0f30e7faa53 Mon Sep 17 00:00:00 2001
From: Francesc Alted 
Date: Sun, 28 Jun 2026 13:36:21 +0200
Subject: [PATCH 03/13] More experiments for different numerical kernels

---
 bench/js-transpiler/README.md       |  31 ++++--
 bench/js-transpiler/dsl-js-node.mjs | 162 +++++++++++++++-------------
 plans/dsl-js.md                     |   7 ++
 3 files changed, 119 insertions(+), 81 deletions(-)

diff --git a/bench/js-transpiler/README.md b/bench/js-transpiler/README.md
index 57016c81b..30fe18a0b 100644
--- a/bench/js-transpiler/README.md
+++ b/bench/js-transpiler/README.md
@@ -17,20 +17,37 @@ then benches a 24-frame `relax` sweep.
 
 ```sh
 npm i                                         # pulls pyodide@314 (see package.json)
-node bench/js-transpiler/dsl-js-node.mjs       # correctness + 24-frame bench
-node bench/js-transpiler/dsl-js-node.mjs 48    # N frames
+node bench/js-transpiler/dsl-js-node.mjs       # correctness + kernel sweep, 12 reps
+node bench/js-transpiler/dsl-js-node.mjs 24    # N reps
 ```
 
 Needs network on first run (PyPI wheel via micropip). Exits non-zero on a correctness
-mismatch, so it works as a smoke test. Typical output (Apple M-series, blosc2 4.6.0):
+mismatch, so it works as a smoke test. It benches four kernel shapes so the js-vs-JIT ratio
+can be read against the kernel, not generalized from one. Representative (Apple M2, 4.6.0):
 
 ```
-correctness vs numpy:  js maxdiff=0.00e+0  jit maxdiff=0.00e+0  OK
-  jit_backend="js" : ~16 ms/frame
-  jit (miniexpr)   : ~31 ms/frame      -> js ~2x faster
-  no-JIT           : ~130 ms/frame     -> js ~8x faster
+  kernel    js     jit    nojit   js/jit  js/nojit
+  newton    12.1   26.1   114.6   2.15x    9.44x     arithmetic + branches + early-exit
+  deepar    22.5   50.2    53.1   2.23x    2.36x     deep pure-arithmetic loop
+  deep     120.1  143.1   104.4   1.19x    0.87x     deep loop, libm sin every iter
+  trans      5.0    5.0     6.8   1.00x    1.34x     transcendental-heavy
+  poly       3.4    3.0     3.6   0.87x    1.05x     light, branch-free
 ```
 
+**The takeaway: there is no single "js is N× the JIT" number — it depends on what the kernel
+is bottlenecked on.**
+
+- **Arithmetic / control-flow bound** (newton, deepar) → V8's optimizing JIT beats blosc2's
+  miniexpr WASM codegen by **~2×**. This is the sweet spot.
+- **Transcendental bound** (trans, deep) → **~1×**: time is spent in `sin`/`exp`/`log` (libm),
+  which costs about the same whoever runs the loop — `nojit` even edges `js` on `deep`.
+- **Light / trivial** (poly) → **<1×**: the kernel does almost no compute, so the blosc2
+  pipeline + per-call JS marshaling dominate, and `js` can be *slightly slower* than the JIT.
+
+So the honest generalization is qualitative: transpiling to JS wins (~2×, single-threaded)
+for **compute-bound float kernels dominated by arithmetic and control flow**, and is roughly
+a wash for transcendental-bound or trivial kernels.
+
 > The overlay pins `blosc2==4.6.0` to keep the compiled `blosc2_ext` ABI in step with the
 > pure-Python we drop on top. Once these changes ship in a Pyodide-installable wheel, the
 > overlay can go away. If the overlay import ever breaks on version skew, overlay all of
diff --git a/bench/js-transpiler/dsl-js-node.mjs b/bench/js-transpiler/dsl-js-node.mjs
index c9f4c8863..05f607941 100644
--- a/bench/js-transpiler/dsl-js-node.mjs
+++ b/bench/js-transpiler/dsl-js-node.mjs
@@ -3,18 +3,22 @@
 // working tree's pure-Python (src/blosc2/dsl_js.py + lazyexpr.py) on top of it before
 // importing blosc2 -- so the wired path runs without waiting for a new wheel.
 //
+// Benches a spread of kernel shapes to show how the js-vs-JIT ratio depends on the kernel:
+// branchy + early-exit (newton), branch-free light (poly), transcendental-heavy (trans),
+// deep no-exit loop (deep). Reports js / jit / no-jit per kernel.
+//
 //   npm i                                         # pulls pyodide@314 (see package.json)
-//   node bench/js-transpiler/dsl-js-node.mjs      # correctness + 24-frame bench
-//   node bench/js-transpiler/dsl-js-node.mjs 48   # bench with N frames
+//   node bench/js-transpiler/dsl-js-node.mjs       # correctness + bench, 12 reps
+//   node bench/js-transpiler/dsl-js-node.mjs 24    # N reps
 import { loadPyodide } from "pyodide";
 import { readFileSync } from "node:fs";
 import { fileURLToPath } from "node:url";
 
 // Resolve paths from this file, so the harness runs from any CWD (repo root is ../../).
 const ROOT = fileURLToPath(new URL("../../", import.meta.url));
-const NFRAMES = Number(process.argv[2]) || 24;
+const NFRAMES = Number(process.argv[2]) || 12;
 
-// Kernel + bench live in a real module file: @blosc2.dsl_kernel runs inspect.getsource(),
+// Kernels + bench live in a real module file: @blosc2.dsl_kernel runs inspect.getsource(),
 // which needs the function to be file-backed (not exec'd from a string).
 const PYSRC = String.raw`
 import json, time
@@ -26,7 +30,8 @@ SPANX = 3.4
 ASPECT = HEIGHT / WIDTH
 DTYPE = np.float64
 
-@blosc2.dsl_kernel
+# --- kernels spanning the cost/control-flow spectrum -------------------------------------
+@blosc2.dsl_kernel  # branchy, deep, per-pixel early exit
 def newton_dsl(a, b, max_iter, relax):
     za = a
     zb = b
@@ -59,27 +64,34 @@ def newton_dsl(a, b, max_iter, relax):
         root = 2.0
     return root + 0.9 * (it / mif)
 
-def newton_numpy(a, b, max_iter, relax):
-    za = a.copy(); zb = b.copy()
-    mif = float(max_iter)
-    it = np.full(a.shape, mif)
-    alive = np.ones(a.shape, dtype=bool)
-    for k in range(max_iter):
-        a2 = za * za; b2 = zb * zb
-        fr = za * a2 - 3.0 * za * b2 - 1.0; fi = 3.0 * a2 * zb - zb * b2
-        dr = 3.0 * (a2 - b2); di = 6.0 * za * zb; den = dr * dr + di * di + 1e-12
-        qr = relax * (fr * dr + fi * di) / den; qi = relax * (fi * dr - fr * di) / den
-        za = za - qr; zb = zb - qi
-        c = alive & ((qr * qr + qi * qi) < 1e-6); it[c] = k; alive &= ~c
-        if not alive.any():
-            break
-    d0 = (za - 1.0) ** 2 + zb ** 2
-    d1 = (za + 0.5) ** 2 + (zb - 0.8660254) ** 2
-    d2 = (za + 0.5) ** 2 + (zb + 0.8660254) ** 2
-    root = np.zeros(a.shape); md = d0.copy()
-    m = d1 < md; md = np.where(m, d1, md); root = np.where(m, 1.0, root)
-    m = d2 < md; root = np.where(m, 2.0, root)
-    return root + 0.9 * (it / mif)
+@blosc2.dsl_kernel  # light, branch-free, vectorizable arithmetic
+def poly_dsl(a, b):
+    a2 = a * a
+    b2 = b * b
+    return a2 * a - 3.0 * a * b2 + 2.0 * b2 * b - a + 0.5 * b
+
+@blosc2.dsl_kernel  # transcendental-heavy (exercises each engine's libm). miniexpr wants
+def trans_dsl(a, b):  # bare sin/cos/... (not np.sin); the transpiler maps both to Math.*
+    msq = a * a + b * b
+    sc = sin(a * 3.0) * cos(b * 2.0)
+    ex = exp(msq * -0.5)
+    return sc + ex + sqrt(msq + 1.0) + log(msq + 2.0)
+
+@blosc2.dsl_kernel  # deep fixed loop, transcendental-bound (libm sin every iter)
+def deep_dsl(a, b):
+    acc = a
+    for k in range(64):
+        acc = acc * 0.99 + sin(acc + b)
+    return acc
+
+@blosc2.dsl_kernel  # deep fixed loop, pure arithmetic (no libm, no branches); contractive
+def deepar_dsl(a, b):
+    acc = a * 0.1
+    t = b * 0.1
+    for k in range(64):
+        t = t * 0.5 - acc * 0.25 + 0.1
+        acc = acc * 0.5 + t * 0.25 + 0.1
+    return acc + t
 
 _x = np.linspace(-SPANX / 2, SPANX / 2, WIDTH, dtype=DTYPE)
 _y = np.linspace(-SPANX * ASPECT / 2, SPANX * ASPECT / 2, HEIGHT, dtype=DTYPE)
@@ -90,7 +102,16 @@ _cp = blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=1)
 A_B2 = blosc2.asarray(A_NP, chunks=_chunks, blocks=_blocks, cparams=_cp)
 B_B2 = blosc2.asarray(B_NP, chunks=_chunks, blocks=_blocks, cparams=_cp)
 
-def run(relax, backend):
+# (name, kernel, operand tuple). Fixed inputs -> each rep does identical work.
+KERNELS = [
+    ("newton", newton_dsl, (A_B2, B_B2, MAXITER, 1.37)),
+    ("poly",   poly_dsl,   (A_B2, B_B2)),
+    ("trans",  trans_dsl,  (A_B2, B_B2)),
+    ("deep",   deep_dsl,   (A_B2, B_B2)),
+    ("deepar", deepar_dsl, (A_B2, B_B2)),
+]
+
+def run(func, ops, backend):
     kw = {"dtype": DTYPE, "cparams": _cp}
     if backend == "js":
         kw["jit_backend"] = "js"
@@ -98,32 +119,19 @@ def run(relax, backend):
         kw["jit"] = True
     else:
         kw["jit"] = False
-    return blosc2.lazyudf(newton_dsl, (A_B2, B_B2, MAXITER, relax), **kw)[:]
-
-def correctness():
-    rx = 1.37
-    ref = newton_numpy(A_NP, B_NP, MAXITER, rx)
-    return {
-        "diff_js": float(np.max(np.abs(run(rx, "js") - ref))),
-        "diff_jit": float(np.max(np.abs(run(rx, "jit") - ref))),
-    }
-
-def bench(nframes):
-    relaxes = [float(v) for v in np.linspace(1.0, 1.85, nframes)]
-    def sweep_ms(backend):
-        for r in relaxes[:2]:
-            run(r, backend)            # warm
-        best = float("inf")
-        for _ in range(3):
-            t = time.perf_counter()
-            for r in relaxes:
-                run(r, backend)
-            best = min(best, (time.perf_counter() - t) * 1000)
-        return best
-    return {b: sweep_ms(b) for b in ("js", "jit", "nojit")}
+    return blosc2.lazyudf(func, ops, **kw)[:]
+
+def bench(func, ops, backend, reps):
+    run(func, ops, backend)            # warm
+    best = float("inf")
+    for _ in range(3):
+        t = time.perf_counter()
+        for _ in range(reps):
+            run(func, ops, backend)
+        best = min(best, (time.perf_counter() - t) * 1000 / reps)
+    return best
 
 def debug_bridge():
-    # Call the JS bridge directly (no lazyudf/blosc2 machinery) to isolate marshal+compile+compute.
     import blosc2.dsl_js as dj
     bridge = dj.js_kernel(newton_dsl)
     out = np.empty((HEIGHT, WIDTH), dtype=DTYPE)
@@ -134,12 +142,17 @@ def debug_bridge():
         t = time.perf_counter(); bridge(inp, out, 0); best = min(best, (time.perf_counter() - t) * 1000)
     return {"first_ms": first, "warm_ms": best}
 
-def result(nframes):
-    out = correctness()
-    out["ms"] = bench(nframes)
-    out["bridge"] = debug_bridge()
-    out["nframes"] = nframes
-    return json.dumps(out)
+def result(reps):
+    import math
+    kernels = []
+    for name, func, ops in KERNELS:
+        rj = run(func, ops, "js")
+        rjit = run(func, ops, "jit")
+        diff = float(np.max(np.abs(rj - rjit)))
+        diff = diff if math.isfinite(diff) else 1e30   # keep JSON valid; flags as mismatch
+        ms = {b: bench(func, ops, b, reps) for b in ("js", "jit", "nojit")}
+        kernels.append({"name": name, "ms": ms, "diff": diff})
+    return json.dumps({"kernels": kernels, "bridge": debug_bridge(), "reps": reps})
 `;
 
 const py = await loadPyodide();
@@ -160,27 +173,28 @@ import sys, blosc2
 assert hasattr(sys.modules["blosc2.lazyexpr"], "_as_js_udf"), "overlay did not take"
 `);
 console.log("blosc2", await py.runPythonAsync("blosc2.__version__"),
-            "| Pyodide", py.version, "| frames", NFRAMES);
+            "| Pyodide", py.version, "| reps", NFRAMES);
 
-py.FS.writeFile("/newton_bench.py", new TextEncoder().encode(PYSRC));
+py.FS.writeFile("/kernel_bench.py", new TextEncoder().encode(PYSRC));
 const out = await py.runPythonAsync(`
 import sys
 if "/" not in sys.path: sys.path.insert(0, "/")
-import newton_bench
-newton_bench.result(${NFRAMES})
+import kernel_bench
+kernel_bench.result(${NFRAMES})
 `);
 
 const r = JSON.parse(out);
-const ok = r.diff_js < 1e-9 && r.diff_jit < 1e-9;
-console.log(`\ncorrectness vs numpy:  js maxdiff=${r.diff_js.toExponential(2)}  ` +
-            `jit maxdiff=${r.diff_jit.toExponential(2)}  ${ok ? "OK" : "MISMATCH"}`);
-const per = ms => `${ms.toFixed(0)} ms total (${(ms / r.nframes).toFixed(2)} ms/frame)`;
-console.log(`\nperf (${r.nframes}-frame relax sweep, best of 3):`);
-console.log(`  jit_backend="js" : ${per(r.ms.js)}`);
-console.log(`  jit (miniexpr)   : ${per(r.ms.jit)}`);
-console.log(`  no-JIT           : ${per(r.ms.nojit)}`);
-console.log(`  js vs jit  : ${(r.ms.jit / r.ms.js).toFixed(2)}x   ` +
-            `js vs no-jit : ${(r.ms.nojit / r.ms.js).toFixed(2)}x`);
-console.log(`\nbridge probe (direct call, no blosc2 machinery):  ` +
-            `first=${r.bridge.first_ms.toFixed(1)} ms  warm=${r.bridge.warm_ms.toFixed(1)} ms`);
-if (!ok) process.exit(1);
+const fmt = (x, w) => String(x).padStart(w);
+const bad = r.kernels.filter((k) => k.diff > 1e-5);
+console.log(`\ncorrectness (js vs JIT maxdiff): ${bad.length ? "MISMATCH " + bad.map((k) => k.name) : "OK"}`);
+console.log("\nper-kernel bench (ms/frame, lower is better):");
+console.log("  kernel    js     jit    nojit   js/jit  js/nojit  diff");
+for (const k of r.kernels) {
+  const { js, jit, nojit } = k.ms;
+  console.log(
+    `  ${k.name.padEnd(7)} ${fmt(js.toFixed(1), 6)} ${fmt(jit.toFixed(1), 6)} ${fmt(nojit.toFixed(1), 7)}` +
+    `  ${fmt((jit / js).toFixed(2) + "x", 6)} ${fmt((nojit / js).toFixed(2) + "x", 8)}  ${k.diff.toExponential(1)}`,
+  );
+}
+console.log(`\nnewton bridge probe (no blosc2 machinery): first=${r.bridge.first_ms.toFixed(1)} ms  warm=${r.bridge.warm_ms.toFixed(1)} ms`);
+if (bad.length) process.exit(1);
diff --git a/plans/dsl-js.md b/plans/dsl-js.md
index 034a44536..58e3b597d 100644
--- a/plans/dsl-js.md
+++ b/plans/dsl-js.md
@@ -142,6 +142,13 @@ Measured on Apple M-series, blosc2 4.6.0 under Pyodide 314, Newton 320×213 / ma
 
 Correctness exact: `js` and JIT both `maxdiff=0.00` vs numpy.
 
+**The ~2× is kernel-dependent, not a flat rule** (kernel sweep in `dsl-js-node.mjs`, see
+`bench/js-transpiler/README.md`). The js-vs-JIT win tracks what the kernel is bottlenecked on:
+**arithmetic / control-flow** bound → ~2× (newton 2.15×, a deep pure-arithmetic loop 2.23×);
+**transcendental** bound (`sin`/`exp`/`log`) → ~1× (libm cost is engine-independent; `nojit`
+can even edge `js`); **light / trivial** → <1× (blosc2 pipeline + marshaling dominate, `js`
+slightly loses). So JS helps for compute-bound float kernels heavy on arithmetic and branches.
+
 **The PyProxy gotcha (the one real bug the headless harness caught).** The bridge must pass
 the per-call operands to the JS driver as real **JS `Array`s**, not Python lists. A Python
 list arrives in JS as a `PyProxy`, so every `ops[k][i]` in the hot inner loop crosses the

From a306a006e0514906e428382bc4b6b86107369089 Mon Sep 17 00:00:00 2001
From: Francesc Alted 
Date: Mon, 29 Jun 2026 08:04:20 +0200
Subject: [PATCH 04/13] Default to the JS DSL backend under WebAssembly
 (prefer-js with fallback)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Under WASM, a float/transpilable/non-reduction DSL kernel now auto-routes to
jit_backend="js" unless the user opts out; anything JS can't do (non-float
dtypes, reductions, unsupported constructs) silently falls back to miniexpr,
so there's no regression. Since JS is itself a JIT, jit=True prefers it too —
only jit=False, strict_miniexpr=True, or an explicit jit_backend opts out
(force miniexpr with jit_backend="tcc"/"cc").

- lazyexpr.py: _maybe_js_backend prefer-js logic + _js_dtypes_ok gating; "js"
  documented and listed for the jit_backend param.
- test_dsl_kernels.py: autouse fixture keeps this miniexpr-semantics module on
  miniexpr (the prefer-js default would bypass its _set_pref_expr assertions).
- test_wasm_dsl_jit.py: native-CI coverage for explicit js, the prefer-js
  default, and the int fallback (counterpart of the node overlay harness).
- bench/js-transpiler: kernel sweep (newton/poly/trans/deep/deepar) showing the
  js-vs-tcc win is kernel-dependent (~2x arithmetic, ~1x transcendental/light).
---
 bench/js-transpiler/README.md       | 26 +++++---
 bench/js-transpiler/dsl-js-node.mjs | 65 ++++++++++++++------
 plans/dsl-js.md                     | 16 +++--
 src/blosc2/lazyexpr.py              | 95 +++++++++++++++++++++++------
 tests/ndarray/test_dsl_js.py        | 65 ++++++++++++++++++++
 tests/ndarray/test_dsl_kernels.py   | 11 ++++
 tests/ndarray/test_wasm_dsl_jit.py  | 44 ++++++++++++-
 7 files changed, 273 insertions(+), 49 deletions(-)

diff --git a/bench/js-transpiler/README.md b/bench/js-transpiler/README.md
index 30fe18a0b..e41de2674 100644
--- a/bench/js-transpiler/README.md
+++ b/bench/js-transpiler/README.md
@@ -22,18 +22,28 @@ node bench/js-transpiler/dsl-js-node.mjs 24    # N reps
 ```
 
 Needs network on first run (PyPI wheel via micropip). Exits non-zero on a correctness
-mismatch, so it works as a smoke test. It benches four kernel shapes so the js-vs-JIT ratio
-can be read against the kernel, not generalized from one. Representative (Apple M2, 4.6.0):
+mismatch *or* a broken default fallback, so it works as a smoke test. It benches five kernel
+shapes so the js-vs-tcc ratio can be read against the kernel, not generalized from one. The
+`default` column (no `jit`/`jit_backend` set) shows the prefer-js-with-fallback default, and
+it also checks that an int kernel and an index-symbol kernel fall back cleanly to miniexpr.
+Representative (Apple M2, 4.6.0):
 
 ```
-  kernel    js     jit    nojit   js/jit  js/nojit
-  newton    12.1   26.1   114.6   2.15x    9.44x     arithmetic + branches + early-exit
-  deepar    22.5   50.2    53.1   2.23x    2.36x     deep pure-arithmetic loop
-  deep     120.1  143.1   104.4   1.19x    0.87x     deep loop, libm sin every iter
-  trans      5.0    5.0     6.8   1.00x    1.34x     transcendental-heavy
-  poly       3.4    3.0     3.6   0.87x    1.05x     light, branch-free
+default fallback (no jit_backend): int=ok index-symbol=ok  -> falls back cleanly
+  kernel   default      js     tcc   nojit  js/tcc
+  newton      12.0    11.4    23.9   104.5   2.10x
+  poly         3.0     2.9     2.7     3.2   0.91x
+  trans        4.3     4.3     4.4     5.7   1.01x
+  deep       116.0   116.8   143.4   103.3   1.23x
+  deepar      22.3    22.0    50.4    52.7   2.29x
 ```
 
+Columns: `default` = prefer-js-with-fallback, `js` = forced `jit_backend="js"`, `tcc` =
+miniexpr JIT (`jit_backend="tcc"`), `nojit` = miniexpr interpreter (`jit=False`). `default ≈
+js` here (all float + transpilable) → prefer-js engaged. Note `jit=True` *also* prefers js
+(it's a JIT); to force miniexpr use `jit_backend="tcc"`/`"cc"`, and `jit=False` selects the
+interpreter — that's what the `tcc`/`nojit` columns pin.
+
 **The takeaway: there is no single "js is N× the JIT" number — it depends on what the kernel
 is bottlenecked on.**
 
diff --git a/bench/js-transpiler/dsl-js-node.mjs b/bench/js-transpiler/dsl-js-node.mjs
index 05f607941..dedb0a10d 100644
--- a/bench/js-transpiler/dsl-js-node.mjs
+++ b/bench/js-transpiler/dsl-js-node.mjs
@@ -93,6 +93,15 @@ def deepar_dsl(a, b):
         acc = acc * 0.5 + t * 0.25 + 0.1
     return acc + t
 
+# Fallback-path kernels (used only by fallback_check, not the float sweep):
+@blosc2.dsl_kernel  # int output/operands -> default must fall back to miniexpr (float64 bridge unsafe)
+def int_dsl(a, b):
+    return a * 2 + b * 3
+
+@blosc2.dsl_kernel  # index symbol -> transpiler rejects -> default must fall back to miniexpr
+def idx_dsl(a):
+    return a + float(_i0)  # noqa: F821
+
 _x = np.linspace(-SPANX / 2, SPANX / 2, WIDTH, dtype=DTYPE)
 _y = np.linspace(-SPANX * ASPECT / 2, SPANX * ASPECT / 2, HEIGHT, dtype=DTYPE)
 A_NP, B_NP = np.meshgrid(_x, _y)
@@ -111,14 +120,16 @@ KERNELS = [
     ("deepar", deepar_dsl, (A_B2, B_B2)),
 ]
 
-def run(func, ops, backend):
-    kw = {"dtype": DTYPE, "cparams": _cp}
+def run(func, ops, backend, dtype=DTYPE):
+    kw = {"dtype": dtype, "cparams": _cp}
     if backend == "js":
         kw["jit_backend"] = "js"
-    elif backend == "jit":
+    elif backend == "tcc":
         kw["jit"] = True
-    else:
+        kw["jit_backend"] = "tcc"   # miniexpr JIT, TinyCC backend (explicit)
+    elif backend == "nojit":
         kw["jit"] = False
+    # "default": pass nothing -> under WASM this prefers js, falling back to miniexpr.
     return blosc2.lazyudf(func, ops, **kw)[:]
 
 def bench(func, ops, backend, reps):
@@ -142,17 +153,33 @@ def debug_bridge():
         t = time.perf_counter(); bridge(inp, out, 0); best = min(best, (time.perf_counter() - t) * 1000)
     return {"first_ms": first, "warm_ms": best}
 
+def fallback_check():
+    # Default backend must transparently fall back to miniexpr where js can't go, with no
+    # error and the same result. int dtype -> dtype-gated; index symbol -> transpiler rejects.
+    Ai = blosc2.asarray((A_NP * 10).astype(np.int64), chunks=_chunks, blocks=_blocks, cparams=_cp)
+    Bi = blosc2.asarray((B_NP * 10).astype(np.int64), chunks=_chunks, blocks=_blocks, cparams=_cp)
+    int_def = run(int_dsl, (Ai, Bi), "default", dtype=np.int64)
+    int_tcc = run(int_dsl, (Ai, Bi), "tcc", dtype=np.int64)
+    idx_def = run(idx_dsl, (A_B2,), "default")
+    idx_tcc = run(idx_dsl, (A_B2,), "tcc")
+    return {
+        "int_ok": bool(np.array_equal(int_def, int_tcc)),
+        "idx_ok": bool(np.allclose(idx_def, idx_tcc)),
+    }
+
 def result(reps):
     import math
     kernels = []
     for name, func, ops in KERNELS:
         rj = run(func, ops, "js")
-        rjit = run(func, ops, "jit")
-        diff = float(np.max(np.abs(rj - rjit)))
+        rtcc = run(func, ops, "tcc")
+        diff = float(np.max(np.abs(rj - rtcc)))
         diff = diff if math.isfinite(diff) else 1e30   # keep JSON valid; flags as mismatch
-        ms = {b: bench(func, ops, b, reps) for b in ("js", "jit", "nojit")}
+        ms = {b: bench(func, ops, b, reps) for b in ("default", "js", "tcc", "nojit")}
         kernels.append({"name": name, "ms": ms, "diff": diff})
-    return json.dumps({"kernels": kernels, "bridge": debug_bridge(), "reps": reps})
+    return json.dumps({
+        "kernels": kernels, "bridge": debug_bridge(), "fallback": fallback_check(), "reps": reps,
+    })
 `;
 
 const py = await loadPyodide();
@@ -186,15 +213,19 @@ kernel_bench.result(${NFRAMES})
 const r = JSON.parse(out);
 const fmt = (x, w) => String(x).padStart(w);
 const bad = r.kernels.filter((k) => k.diff > 1e-5);
-console.log(`\ncorrectness (js vs JIT maxdiff): ${bad.length ? "MISMATCH " + bad.map((k) => k.name) : "OK"}`);
-console.log("\nper-kernel bench (ms/frame, lower is better):");
-console.log("  kernel    js     jit    nojit   js/jit  js/nojit  diff");
+console.log(`\ncorrectness (js vs tcc maxdiff): ${bad.length ? "MISMATCH " + bad.map((k) => k.name) : "OK"}`);
+const fb = r.fallback;
+const fbOk = fb.int_ok && fb.idx_ok;
+console.log(`default fallback (no jit_backend): int=${fb.int_ok ? "ok" : "FAIL"} ` +
+            `index-symbol=${fb.idx_ok ? "ok" : "FAIL"}  -> ${fbOk ? "falls back cleanly" : "BROKEN"}`);
+console.log("\nper-kernel bench (ms/frame, lower is better; 'default' = prefer-js w/ fallback,");
+console.log("'tcc' = miniexpr JIT, 'nojit' = miniexpr interpreter):");
+const cols = ["default", "js", "tcc", "nojit", "js/tcc"];
+console.log("  " + "kernel".padEnd(8) + cols.map((c) => fmt(c, 8)).join(""));
 for (const k of r.kernels) {
-  const { js, jit, nojit } = k.ms;
-  console.log(
-    `  ${k.name.padEnd(7)} ${fmt(js.toFixed(1), 6)} ${fmt(jit.toFixed(1), 6)} ${fmt(nojit.toFixed(1), 7)}` +
-    `  ${fmt((jit / js).toFixed(2) + "x", 6)} ${fmt((nojit / js).toFixed(2) + "x", 8)}  ${k.diff.toExponential(1)}`,
-  );
+  const { default: def, js, tcc, nojit } = k.ms;
+  const cells = [def, js, tcc, nojit].map((v) => v.toFixed(1)).concat((tcc / js).toFixed(2) + "x");
+  console.log("  " + k.name.padEnd(8) + cells.map((c) => fmt(c, 8)).join(""));
 }
 console.log(`\nnewton bridge probe (no blosc2 machinery): first=${r.bridge.first_ms.toFixed(1)} ms  warm=${r.bridge.warm_ms.toFixed(1)} ms`);
-if (bad.length) process.exit(1);
+if (bad.length || !fbOk) process.exit(1);
diff --git a/plans/dsl-js.md b/plans/dsl-js.md
index 58e3b597d..996e44594 100644
--- a/plans/dsl-js.md
+++ b/plans/dsl-js.md
@@ -27,9 +27,17 @@ the user rewriting the kernel. This is browser/Pyodide-only by nature.
   (tiling driver, COOP/COEP headers, Atomics join) mostly *outside* blosc2. Single-threaded
   JS already beats the JIT 3.3×; parallelism is deferred until measured need. Generalizing
   the per-block bridge to **per-chunk** is the natural next rung.
-- **Shipped as `src/blosc2/dsl_js.py`** behind `jit_backend="js"` (a new backend alongside
-  no-JIT and miniexpr-JIT). Wiring is one swap in `chunked_eval`; no compiled-code changes.
-  Started as a repo-root prototype, graduated once benches confirmed the ~2× win.
+- **Shipped as `src/blosc2/dsl_js.py`**, a new `jit_backend="js"` alongside no-JIT and
+  miniexpr-JIT. Wiring is one swap in `chunked_eval`; no compiled-code changes. Started as a
+  repo-root prototype, graduated once benches confirmed the ~2× win.
+- **Default under WebAssembly is prefer-js-with-fallback.** Unless `jit=False`, a float,
+  transpilable, non-reduction DSL kernel auto-routes to js; anything js can't do
+  (integer/complex dtypes, reductions, unsupported DSL constructs) *silently falls back to
+  miniexpr*, so there is no regression. Because js is itself a JIT (the JS engine compiles
+  it), `jit=True` prefers it too — only `jit=False` (interpreter) or an explicit
+  `jit_backend` opts out; force miniexpr with `jit_backend="tcc"`/`"cc"` (see
+  `_maybe_js_backend`, `_js_dtypes_ok`). Off-WASM, `jit_backend="js"` raises; the default is
+  unchanged (miniexpr).
 
 ## Feasibility summary
 
@@ -143,7 +151,7 @@ Measured on Apple M-series, blosc2 4.6.0 under Pyodide 314, Newton 320×213 / ma
 Correctness exact: `js` and JIT both `maxdiff=0.00` vs numpy.
 
 **The ~2× is kernel-dependent, not a flat rule** (kernel sweep in `dsl-js-node.mjs`, see
-`bench/js-transpiler/README.md`). The js-vs-JIT win tracks what the kernel is bottlenecked on:
+`bench/js-transpiler/README.md`). The js-vs-tcc (miniexpr JIT) win tracks what the kernel is bottlenecked on:
 **arithmetic / control-flow** bound → ~2× (newton 2.15×, a deep pure-arithmetic loop 2.23×);
 **transcendental** bound (`sin`/`exp`/`log`) → ~1× (libm cost is engine-independent; `nojit`
 can even edge `js`); **light / trivial** → <1× (blosc2 pipeline + marshaling dominate, `js`
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 821894f41..be9edb11e 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -479,7 +479,8 @@ def compute(
 
             - ``strict_miniexpr`` (bool): controls whether miniexpr compilation/execution
               failures are raised instead of silently falling back to regular chunked eval
-              for non-DSL expressions.
+              for non-DSL expressions.  Setting it ``True`` also opts a DSL kernel out of the
+              WebAssembly prefer-js default, keeping it on miniexpr.
 
             - ``jit`` (bool | None): enable (``True``) or disable (``False``) JIT compilation
               of the expression via miniexpr.  When ``None`` (default), JIT is only used
@@ -488,9 +489,21 @@ def compute(
               kernels.
 
             - ``jit_backend`` (str | None): select the JIT compiler backend.  Valid
-              values are ``"tcc"`` (bundled Tiny C Compiler) and ``"cc"`` (system C
-              compiler, e.g. gcc or clang).  ``None`` (default) defers to the miniexpr
-              default (``"tcc"``).
+              values are ``"tcc"`` (bundled Tiny C Compiler), ``"cc"`` (system C
+              compiler, e.g. gcc or clang), and ``"js"`` (transpile the DSL kernel to
+              JavaScript; browser/Pyodide only — see below).  ``None`` (default) defers
+              to the miniexpr default (``"tcc"``), except under WebAssembly where — unless
+              ``jit=False`` — it *prefers* ``"js"`` for transpilable float DSL kernels and
+              falls back to miniexpr otherwise.  Since ``"js"`` is itself JIT-compiled by
+              the JS engine, ``jit=True`` prefers it too; force miniexpr with
+              ``jit_backend="tcc"``/``"cc"``.
+
+            - ``"js"`` backend (WebAssembly/Pyodide only): transpiles a
+              :func:`blosc2.dsl_kernel` to JavaScript so it runs at the browser engine's
+              optimized native speed.  It tends to beat the WASM miniexpr JIT (~2x) for
+              float kernels dominated by arithmetic and control flow, and is roughly a
+              wash for transcendental-heavy or trivial kernels.  Outside WebAssembly,
+              ``jit_backend="js"`` raises.  Forcing ``"tcc"``/``"cc"`` always uses miniexpr.
 
             - ``BLOSC_ME_JIT`` environment variable: when set to ``"1"``, ``"true"``,
               ``"on"``, ``"tcc"``, or ``"cc"``, it forces ``jit=True`` and overrides
@@ -1407,8 +1420,11 @@ def fill_chunk_operands(
 def _apply_jit_backend_pragma(expression: str, inputs: dict, jit_backend: str | None) -> str:
     if jit_backend is None:
         return expression
+    if jit_backend == "js":
+        # "js" is handled earlier (DSL kernels -> JS bridge); it never carries a C pragma.
+        return expression
     if jit_backend not in ("tcc", "cc"):
-        raise ValueError("jit_backend must be one of: None, 'tcc', 'cc'")
+        raise ValueError("jit_backend must be one of: None, 'tcc', 'cc', 'js'")
 
     pragma = f"# me:compiler={jit_backend}\n"
     stripped = expression.lstrip()
@@ -1444,14 +1460,52 @@ def _as_js_udf(expression):
     return js_kernel(expression)
 
 
-def _maybe_js_backend(expression, jit, jit_backend, reduce_args):
-    """For jit_backend="js", swap the DSL kernel for its JS bridge (a plain per-block
-    callable) and disable miniexpr/backend-pragma. Otherwise a no-op passthrough."""
-    if jit_backend != "js":
+def _js_dtypes_ok(operands, kwargs) -> bool:
+    """True only if the JS bridge (which computes in float64) is safe for these operands:
+    floating-point NDArray inputs and a floating/inferred output dtype. Integer/complex go
+    to miniexpr instead (float64 can't represent int64 exactly)."""
+    dt = kwargs.get("dtype")
+    if dt is not None and not np.issubdtype(np.dtype(dt), np.floating):
+        return False
+    return all(
+        np.issubdtype(op.dtype, np.floating) for op in operands.values() if isinstance(op, blosc2.NDArray)
+    )
+
+
+def _maybe_js_backend(expression, jit, jit_backend, reduce_args, operands, kwargs):
+    """Resolve the JS backend for a DSL kernel.
+
+    - ``jit_backend="js"`` (explicit): transpile to the JS bridge, or raise if it can't.
+    - ``jit_backend=None`` under WebAssembly, unless ``jit=False``: *prefer* JS (it is a
+      JIT too, and the fastest one here) for transpilable float DSL kernels, silently
+      falling back to miniexpr for anything it can't do (non-float dtypes, reductions, or
+      unsupported DSL constructs).  ``jit=True`` and ``jit=None`` both prefer JS; only
+      ``jit=False`` (interpreter), ``strict_miniexpr=True``, or an explicit ``jit_backend``
+      opts out.
+
+    Returns ``(expression, jit, jit_backend)`` — expression becomes a plain per-block
+    callable when JS is chosen, else everything passes through unchanged.
+    """
+    if jit_backend == "js":
+        if reduce_args:
+            raise ValueError('jit_backend="js" does not support reductions')
+        return _as_js_udf(expression), None, None
+    prefer_js = (
+        jit is not False  # jit=True/None prefer the best JIT (js); only jit=False forces interpreter
+        and jit_backend is None
+        and not kwargs.get("strict_miniexpr")  # explicit strict_miniexpr=True keeps miniexpr
+        and blosc2.IS_WASM
+        and _is_dsl_kernel_expression(expression)
+        and not reduce_args
+        and _js_dtypes_ok(operands, kwargs)
+    )
+    if not prefer_js:
         return expression, jit, jit_backend
-    if reduce_args:
-        raise ValueError('jit_backend="js" does not support reductions')
-    return _as_js_udf(expression), None, None
+    try:
+        bridge = _as_js_udf(expression)  # transpiles; raises on any unsupported construct
+    except Exception:
+        return expression, jit, jit_backend  # fall back to miniexpr, no regression
+    return bridge, None, None
 
 
 def _format_dsl_parse_error_hint(expr_text: str, backend_msg: str):
@@ -2983,8 +3037,11 @@ def chunked_eval(
             operands = {**operands, **where}
 
         reduce_args = kwargs.pop("_reduce_args", {})
-        # jit_backend="js": swap the DSL kernel for its JS bridge (a plain per-block callable).
-        expression, jit, jit_backend = _maybe_js_backend(expression, jit, jit_backend, reduce_args)
+        # Resolve the JS backend: explicit jit_backend="js", or prefer-js-with-fallback under
+        # WebAssembly when the user left jit_backend unset (see _maybe_js_backend).
+        expression, jit, jit_backend = _maybe_js_backend(
+            expression, jit, jit_backend, reduce_args, operands, kwargs
+        )
 
         fast_path = _validate_chunked_eval_inputs(operands, out, shape, reduce_args)
 
@@ -4839,9 +4896,13 @@ def myudf(inputs_tuple, output, offset):
     jit: bool or None, optional
         JIT policy for miniexpr-backed execution:
         ``None`` uses default behavior (currently, JIT is tried out), ``True`` prefers JIT, ``False`` disables JIT.
-    jit_backend: {"tcc", "cc"} or None, optional
-        JIT backend selection for miniexpr-backed execution:
-        ``None`` uses backend defaults (currently "tcc"), ``"tcc"`` forces libtcc, ``"cc"`` forces C compiler backend.
+    jit_backend: {"tcc", "cc", "js"} or None, optional
+        JIT backend selection. ``None`` uses backend defaults (miniexpr "tcc"), except under
+        WebAssembly where — unless ``jit=False`` — it *prefers* ``"js"`` for transpilable
+        float DSL kernels and falls back to miniexpr otherwise (``jit=True`` prefers ``"js"``
+        too, since it is JIT-compiled by the JS engine). ``"tcc"`` forces libtcc, ``"cc"``
+        forces the C compiler backend, and ``"js"`` transpiles a :func:`blosc2.dsl_kernel`
+        to JavaScript (browser/Pyodide only; raises elsewhere).
     kwargs: Any, optional
         Keyword arguments that are supported by the :func:`empty` constructor.
         These arguments will be used by the :meth:`LazyArray.__getitem__` and
diff --git a/tests/ndarray/test_dsl_js.py b/tests/ndarray/test_dsl_js.py
index 162181dca..e09e9fd5c 100644
--- a/tests/ndarray/test_dsl_js.py
+++ b/tests/ndarray/test_dsl_js.py
@@ -14,11 +14,17 @@
 import json
 import shutil
 import subprocess
+import sys
 
+import numpy as np
 import pytest
 
+import blosc2
 from blosc2.dsl_js import build_js_module, dsl_to_js
 
+# `blosc2.lazyexpr` (attribute) is the re-exported function, not the module; grab the module.
+lx = sys.modules["blosc2.lazyexpr"]
+
 
 # Same Newton kernel as the demo (scalar semantics), as a plain function.
 def newton_dsl(a, b, max_iter, relax):
@@ -140,3 +146,62 @@ def test_misc_matches_python():
     js_vals = _run_node(build_js_module(misc_dsl), pts, [])
     mdiff = max(abs(p - j) for p, j in zip(ref, js_vals, strict=True))
     assert mdiff < 1e-9, f"misc py-vs-js mismatch: maxdiff={mdiff}"
+
+
+# --- prefer-js-with-fallback backend selection (logic only; the bridge isn't *run* here, so
+# no real WASM is needed -- IS_WASM is monkeypatched and js_kernel only transpiles) ----------
+@blosc2.dsl_kernel
+def _add(a, b):
+    return a + b
+
+
+@blosc2.dsl_kernel
+def _idx(a):
+    return a + float(_i0)  # noqa: F821  index symbol -> transpiler rejects
+
+
+def test_prefer_js_selection(monkeypatch):
+    monkeypatch.setattr(blosc2, "IS_WASM", True)
+    af = blosc2.asarray(np.ones((4, 4), dtype=np.float64))
+    ai = blosc2.asarray(np.ones((4, 4), dtype=np.int64))
+
+    def sel(jit, jit_backend, operands, kwargs, reduce_args=None):
+        return lx._maybe_js_backend(_add, jit, jit_backend, reduce_args or {}, operands, kwargs)
+
+    # jit=None and jit=True both prefer js (js is a JIT) -> swapped to a plain callable
+    for jit in (None, True):
+        expr, _, jb = sel(jit, None, {"a": af, "b": af}, {"dtype": np.float64})
+        assert callable(expr)
+        assert not lx._is_dsl_kernel_expression(expr)
+        assert jb is None
+
+    # jit=False (interpreter) opts out -> stays the DSL kernel for miniexpr
+    assert sel(False, None, {"a": af, "b": af}, {"dtype": np.float64})[0] is _add
+
+    # explicit jit_backend opts out too (here tcc would force miniexpr)
+    assert sel(True, "tcc", {"a": af, "b": af}, {"dtype": np.float64})[0] is _add
+
+    # explicit strict_miniexpr=True opts out (keep miniexpr); =False/absent does not
+    assert sel(None, None, {"a": af, "b": af}, {"dtype": np.float64, "strict_miniexpr": True})[0] is _add
+    expr, *_ = sel(None, None, {"a": af, "b": af}, {"dtype": np.float64, "strict_miniexpr": False})
+    assert callable(expr)
+    assert not lx._is_dsl_kernel_expression(expr)
+
+    # integer dtype, reductions -> fall back to miniexpr
+    assert sel(None, None, {"a": ai, "b": ai}, {"dtype": np.int64})[0] is _add
+    assert sel(None, None, {"a": af}, {}, reduce_args={"op": "sum"})[0] is _add
+
+
+def test_prefer_js_falls_back_on_untranspilable(monkeypatch):
+    monkeypatch.setattr(blosc2, "IS_WASM", True)
+    af = blosc2.asarray(np.ones((4, 4), dtype=np.float64))
+    # _idx uses an index symbol the transpiler rejects -> default must fall back, not raise.
+    expr, _, _ = lx._maybe_js_backend(_idx, None, None, {}, {"a": af}, {"dtype": np.float64})
+    assert expr is _idx
+
+
+def test_explicit_js_off_wasm_raises():
+    # jit_backend="js" is an explicit choice -> hard error off-WASM (not a silent fallback).
+    assert not blosc2.IS_WASM  # this test runs on a native build
+    with pytest.raises(RuntimeError, match="WebAssembly"):
+        lx._maybe_js_backend(_add, None, "js", {}, {}, {})
diff --git a/tests/ndarray/test_dsl_kernels.py b/tests/ndarray/test_dsl_kernels.py
index d09c479be..99d45d058 100644
--- a/tests/ndarray/test_dsl_kernels.py
+++ b/tests/ndarray/test_dsl_kernels.py
@@ -23,6 +23,17 @@
 clip = np.clip
 
 
+@pytest.fixture(autouse=True)
+def _no_auto_js_backend(monkeypatch):
+    """Keep this module on the miniexpr/DSL path. Under WebAssembly the default prefers the
+    JS backend for float kernels, which would bypass ``_set_pref_expr`` and break the
+    miniexpr-specific assertions here. Stubbing the dtype gate to ``False`` disables only the
+    *auto* prefer-js (explicit ``jit_backend="js"`` still works, and is covered by
+    test_dsl_js.py / test_wasm_dsl_jit.py). No-op off WebAssembly (prefer-js never engages)."""
+    # `blosc2.lazyexpr` the attribute is the re-exported function; patch the actual module.
+    monkeypatch.setattr(sys.modules["blosc2.lazyexpr"], "_js_dtypes_ok", lambda *a, **k: False)
+
+
 def _make_arrays(shape=(8, 8), chunks=(4, 4), blocks=(2, 2)):
     a = np.linspace(0, 1, num=np.prod(shape), dtype=np.float32).reshape(shape)
     b = np.linspace(1, 2, num=np.prod(shape), dtype=np.float32).reshape(shape)
diff --git a/tests/ndarray/test_wasm_dsl_jit.py b/tests/ndarray/test_wasm_dsl_jit.py
index 00df2fd64..28c988da4 100644
--- a/tests/ndarray/test_wasm_dsl_jit.py
+++ b/tests/ndarray/test_wasm_dsl_jit.py
@@ -16,21 +16,59 @@ def _wasm_kernel(x, y):
     return (x + y) * 1.5 - 0.25
 
 
-@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
-def test_wasm_dsl_tcc_jit_smoke():
-    assert getattr(blosc2, "_WASM_MINIEXPR_ENABLED", False)
+@blosc2.dsl_kernel  # integer kernel -> the float64 JS bridge is unsafe -> must fall back to miniexpr
+def _wasm_int_kernel(x, y):
+    return x * 2 + y * 3
 
+
+def _wasm_grids():
     a_np = np.linspace(-1.0, 1.0, 64, dtype=np.float64).reshape(8, 8)
     b_np = np.linspace(0.0, 2.0, 64, dtype=np.float64).reshape(8, 8)
     a = blosc2.asarray(a_np, chunks=(4, 4), blocks=(2, 2))
     b = blosc2.asarray(b_np, chunks=(4, 4), blocks=(2, 2))
+    return a_np, b_np, a, b
+
+
+@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
+def test_wasm_dsl_tcc_jit_smoke():
+    assert getattr(blosc2, "_WASM_MINIEXPR_ENABLED", False)
 
+    a_np, b_np, a, b = _wasm_grids()
     expr = blosc2.lazyudf(_wasm_kernel, (a, b), dtype=np.float64)
     out = expr.compute(jit=True, jit_backend="tcc", strict_miniexpr=True)
     expected = (a_np + b_np) * 1.5 - 0.25
     np.testing.assert_allclose(out[...], expected, rtol=1e-6, atol=1e-8)
 
 
+# The next three are the native-CI counterpart of bench/js-transpiler/dsl-js-node.mjs (which
+# checks the same paths via a micropip overlay): explicit js, the prefer-js default, and the
+# silent fallback to miniexpr when js can't run a kernel.
+@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
+def test_wasm_dsl_js_backend_smoke():
+    a_np, b_np, a, b = _wasm_grids()
+    out = blosc2.lazyudf(_wasm_kernel, (a, b), dtype=np.float64).compute(jit_backend="js")
+    np.testing.assert_allclose(out[...], (a_np + b_np) * 1.5 - 0.25, rtol=1e-6, atol=1e-8)
+
+
+@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
+def test_wasm_dsl_default_prefers_js():
+    # No jit/jit_backend -> under WASM this prefers js for a float kernel; just has to be correct.
+    a_np, b_np, a, b = _wasm_grids()
+    out = blosc2.lazyudf(_wasm_kernel, (a, b), dtype=np.float64).compute()
+    np.testing.assert_allclose(out[...], (a_np + b_np) * 1.5 - 0.25, rtol=1e-6, atol=1e-8)
+
+
+@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
+def test_wasm_dsl_default_falls_back_for_int():
+    # int dtype -> js bridge unsafe -> default must fall back to miniexpr with no error.
+    ai_np = np.arange(64, dtype=np.int64).reshape(8, 8)
+    bi_np = ai_np + 1
+    ai = blosc2.asarray(ai_np, chunks=(4, 4), blocks=(2, 2))
+    bi = blosc2.asarray(bi_np, chunks=(4, 4), blocks=(2, 2))
+    out = blosc2.lazyudf(_wasm_int_kernel, (ai, bi), dtype=np.int64).compute()
+    np.testing.assert_array_equal(out[...], ai_np * 2 + bi_np * 3)
+
+
 @pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
 def test_wasm_string_predicates_strict_miniexpr():
     assert getattr(blosc2, "_WASM_MINIEXPR_ENABLED", False)

From 434118fdc4a428dd5f542ac3e0eb3884b58b0d42 Mon Sep 17 00:00:00 2001
From: Francesc Alted 
Date: Mon, 29 Jun 2026 08:29:20 +0200
Subject: [PATCH 05/13] Fix a Windows issue in the node test runner

---
 tests/ndarray/test_dsl_js.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/ndarray/test_dsl_js.py b/tests/ndarray/test_dsl_js.py
index e09e9fd5c..17bbf77c7 100644
--- a/tests/ndarray/test_dsl_js.py
+++ b/tests/ndarray/test_dsl_js.py
@@ -12,9 +12,11 @@
 """
 
 import json
+import os
 import shutil
 import subprocess
 import sys
+import tempfile
 
 import numpy as np
 import pytest
@@ -91,7 +93,13 @@ def _run_node(module, pts, scalars):
 {cols}__run([{ops}], [{isarr}], out, pts.length);
 console.log(JSON.stringify(Array.from(out)));
 """
-    res = subprocess.run([node, "-e", prog], capture_output=True, text=True)
+    # Write to a temp file rather than `node -e `: a big inlined program (the points
+    # are JSON-embedded) overflows the Windows command-line length limit (WinError 206).
+    with tempfile.TemporaryDirectory() as d:
+        script = os.path.join(d, "dsl_js_check.js")
+        with open(script, "w", encoding="utf-8") as fh:
+            fh.write(prog)
+        res = subprocess.run([node, script], capture_output=True, text=True)
     if res.returncode != 0:
         raise AssertionError(f"node failed:\n{res.stderr}")
     return json.loads(res.stdout)

From 831a5fefc99a0f62045558681e08743e4aec056a Mon Sep 17 00:00:00 2001
From: Francesc Alted 
Date: Mon, 29 Jun 2026 10:56:02 +0200
Subject: [PATCH 06/13] Tests spawning node via subprocess now skip on WASM
 because emscripten can't do that

---
 tests/ndarray/test_dsl_js.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/ndarray/test_dsl_js.py b/tests/ndarray/test_dsl_js.py
index 17bbf77c7..965e63ff2 100644
--- a/tests/ndarray/test_dsl_js.py
+++ b/tests/ndarray/test_dsl_js.py
@@ -132,6 +132,7 @@ def uses_index(a):
         dsl_to_js(uses_index)
 
 
+@pytest.mark.skipif(blosc2.IS_WASM, reason="emscripten cannot spawn the node subprocess")
 def test_newton_matches_python():
     w, h, max_iter, relax = 40, 30, 48, 1.37
     pts = [[-1.7 + 3.4 * c / (w - 1), -1.1 + 2.2 * r / (h - 1)] for r in range(h) for c in range(w)]
@@ -141,6 +142,7 @@ def test_newton_matches_python():
     assert maxdiff < 1e-9, f"newton py-vs-js mismatch: maxdiff={maxdiff}"
 
 
+@pytest.mark.skipif(blosc2.IS_WASM, reason="emscripten cannot spawn the node subprocess")
 def test_misc_matches_python():
     pts = [[3.5, 16.0], [1.2, 9.0], [-4.3, 25.0], [8.0, 4.0], [0.0, 100.0]]
     ref = []
@@ -208,6 +210,7 @@ def test_prefer_js_falls_back_on_untranspilable(monkeypatch):
     assert expr is _idx
 
 
+@pytest.mark.skipif(blosc2.IS_WASM, reason="this test asserts off-WASM behavior")
 def test_explicit_js_off_wasm_raises():
     # jit_backend="js" is an explicit choice -> hard error off-WASM (not a silent fallback).
     assert not blosc2.IS_WASM  # this test runs on a native build

From fc8694f1475975e0b4eb8db9fb329a9c4ee00190 Mon Sep 17 00:00:00 2001
From: Francesc Alted 
Date: Mon, 29 Jun 2026 11:02:34 +0200
Subject: [PATCH 07/13] Gate JIT-specific test assertions: the miniexpr
 runtime JIT backend isn't bundled in the Windows wheels

---
 tests/ndarray/test_dsl_kernels.py | 48 +++++++++++++++++++++++++------
 1 file changed, 39 insertions(+), 9 deletions(-)

diff --git a/tests/ndarray/test_dsl_kernels.py b/tests/ndarray/test_dsl_kernels.py
index 99d45d058..0439015cf 100644
--- a/tests/ndarray/test_dsl_kernels.py
+++ b/tests/ndarray/test_dsl_kernels.py
@@ -23,6 +23,35 @@
 clip = np.clip
 
 
+def _jit_backend_available():
+    """Whether the miniexpr runtime JIT backend is bundled in this build.
+
+    The JIT compiler is not shipped on every platform (e.g. the Windows wheels lack
+    it); there a DSL kernel still compiles and runs, but via the interpreter rather
+    than a JIT kernel.  Probe a trivial kernel once so the JIT-specific assertions can
+    be gated on the platform actually having a JIT backend."""
+    try:
+
+        @blosc2.dsl_kernel
+        def _probe(a):
+            return a + 1.0
+
+        return bool(blosc2.validate_dsl_jit(_probe, [np.float64], np.float64).get("jit"))
+    except Exception:
+        return False
+
+
+JIT_AVAILABLE = _jit_backend_available()
+
+
+def _expect_jit(status):
+    """Assert *status* compiled, and that it produced a runtime JIT kernel where the
+    platform actually has a JIT backend (see :func:`_jit_backend_available`)."""
+    assert status["compiled"]
+    if JIT_AVAILABLE:
+        assert status["jit"]
+
+
 @pytest.fixture(autouse=True)
 def _no_auto_js_backend(monkeypatch):
     """Keep this module on the miniexpr/DSL path. Under WebAssembly the default prefers the
@@ -330,9 +359,9 @@ def wrapped_set_pref_expr(self, expression, inputs, fp_accuracy, aux_reduc=None,
     assert "_n1" in captured["expr"]
     assert "_i1" in captured["expr"]
     # ...and it JIT-compiles rather than silently running on the interpreter.
-    assert blosc2.validate_dsl_jit(kernel_index_ramp_float_cast, {"x": np.float32}, np.float32, shape=shape)[
-        "jit"
-    ]
+    _expect_jit(
+        blosc2.validate_dsl_jit(kernel_index_ramp_float_cast, {"x": np.float32}, np.float32, shape=shape)
+    )
 
 
 def test_dsl_kernel_index_symbols_int_cast_matches_expected_ramp():
@@ -463,9 +492,11 @@ def wrapped_set_pref_expr(self, expression, inputs, fp_accuracy, aux_reduc=None,
         assert "range(niter)" not in captured["expr"]
         assert "float(niter)" not in captured["expr"]
         # ...and the loop+scalar kernel JIT-compiles (the original G3 fallback shape).
-        assert blosc2.validate_dsl_jit(
-            kernel_loop_param, {"x": a2.dtype, "y": b2.dtype, "niter": niter}, a2.dtype, shape=(32, 32)
-        )["jit"]
+        _expect_jit(
+            blosc2.validate_dsl_jit(
+                kernel_loop_param, {"x": a2.dtype, "y": b2.dtype, "niter": niter}, a2.dtype, shape=(32, 32)
+            )
+        )
     finally:
         lazyexpr_mod.try_miniexpr = old_try_miniexpr
 
@@ -1097,8 +1128,7 @@ def simple(a, b):
 
     st = blosc2.validate_dsl_jit(simple, [np.float64, np.float64], np.float64)
     assert st["valid"]
-    assert st["compiled"]
-    assert st["jit"]
+    _expect_jit(st)
     assert st["status"] == "ME_COMPILE_SUCCESS"
 
     # A scalar param is inlined (passed as a value); a variable named 'out' (the
@@ -1108,7 +1138,7 @@ def simple(a, b):
         "def withk(a, b, niter):\n    out = a + b * float(niter)\n    return out\n", "withk"
     )
     st = blosc2.validate_dsl_jit(with_out, {"a": np.float64, "b": np.float64, "niter": 3}, np.float64)
-    assert st["jit"]
+    _expect_jit(st)
 
     # Invalid syntax -> not valid, nothing compiled.
     invalid = kernel_from_source("def k(a, b):\n    a = a - b\n    return a\n", "k")

From cfc9406e7a840c76d368af5a09fffaba2d875e87 Mon Sep 17 00:00:00 2001
From: Francesc Alted 
Date: Mon, 29 Jun 2026 12:09:47 +0200
Subject: [PATCH 08/13] =?UTF-8?q?Expand=20DSL=E2=86=92JS=20transpiler:=20i?=
 =?UTF-8?q?ndex/shape=20symbols,=20integer=20inputs,=20caching?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add P1 (index/shape symbols _i0/_n0/_flat_idx) by emitting them as trailing
kernel params and reconstructing per-block global coords in the driver, and
P2 (integer inputs with floating output, matching miniexpr's float promotion).
Memoize transpile + js.eval so repeated evaluations don't re-parse/re-compile,
which closes most of the gap on light kernels. Update tests, the node bench
(new P1/P2 kernels, streamed rows), and plans/dsl-js-coverage.md.
---
 bench/js-transpiler/dsl-js-node.mjs | 108 +++++++++++------
 src/blosc2/dsl_js.py                | 180 +++++++++++++++++++++++-----
 src/blosc2/lazyexpr.py              |  46 +++++--
 tests/ndarray/test_dsl_js.py        | 106 ++++++++++++++--
 tests/ndarray/test_wasm_dsl_jit.py  |  37 +++++-
 5 files changed, 393 insertions(+), 84 deletions(-)

diff --git a/bench/js-transpiler/dsl-js-node.mjs b/bench/js-transpiler/dsl-js-node.mjs
index dedb0a10d..67bb7384e 100644
--- a/bench/js-transpiler/dsl-js-node.mjs
+++ b/bench/js-transpiler/dsl-js-node.mjs
@@ -93,15 +93,33 @@ def deepar_dsl(a, b):
         acc = acc * 0.5 + t * 0.25 + 0.1
     return acc + t
 
-# Fallback-path kernels (used only by fallback_check, not the float sweep):
-@blosc2.dsl_kernel  # int output/operands -> default must fall back to miniexpr (float64 bridge unsafe)
+@blosc2.dsl_kernel  # P1: index/shape symbols -> per-element global coords (radial gradient)
+def idxgrad_dsl(a):
+    dx = float(_i0) - _n0 * 0.5  # noqa: F821
+    dy = float(_i1) - _n1 * 0.5  # noqa: F821
+    return a + sqrt(dx * dx + dy * dy)  # noqa: F821
+
+@blosc2.dsl_kernel  # P2: integer inputs, float output (the bridge float64-converts the operands)
+def intmix_dsl(a, b):
+    return sqrt(a * a + b * b) * 0.25 + (a - b)  # noqa: F821
+
+# Path-coverage kernels (used only by path_check, not the float sweep):
+@blosc2.dsl_kernel  # int *output* -> default must fall back to miniexpr (float64 bridge unsafe)
 def int_dsl(a, b):
     return a * 2 + b * 3
 
-@blosc2.dsl_kernel  # index symbol -> transpiler rejects -> default must fall back to miniexpr
+@blosc2.dsl_kernel  # int *inputs*, float output -> JS is safe (bridge float64-converts operands)
+def intin_dsl(a, b):
+    return (a + b) * 0.5
+
+@blosc2.dsl_kernel  # index/shape symbols -> JS reconstructs global coords per block
 def idx_dsl(a):
     return a + float(_i0)  # noqa: F821
 
+@blosc2.dsl_kernel  # expm1() is valid DSL/miniexpr but not in the transpiler's JS function map -> falls back
+def unsup_dsl(a, b):
+    return expm1(a + b) * 0.5  # noqa: F821
+
 _x = np.linspace(-SPANX / 2, SPANX / 2, WIDTH, dtype=DTYPE)
 _y = np.linspace(-SPANX * ASPECT / 2, SPANX * ASPECT / 2, HEIGHT, dtype=DTYPE)
 A_NP, B_NP = np.meshgrid(_x, _y)
@@ -110,6 +128,11 @@ _blocks = (max(1, _chunks[0] // 4), max(1, _chunks[1] // 3))
 _cp = blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=1)
 A_B2 = blosc2.asarray(A_NP, chunks=_chunks, blocks=_blocks, cparams=_cp)
 B_B2 = blosc2.asarray(B_NP, chunks=_chunks, blocks=_blocks, cparams=_cp)
+# Small-magnitude integer operands for the P2 (int in / float out) kernels.
+AI_NP = (A_NP * 10).astype(np.int64)
+BI_NP = (B_NP * 10).astype(np.int64)
+AI_B2 = blosc2.asarray(AI_NP, chunks=_chunks, blocks=_blocks, cparams=_cp)
+BI_B2 = blosc2.asarray(BI_NP, chunks=_chunks, blocks=_blocks, cparams=_cp)
 
 # (name, kernel, operand tuple). Fixed inputs -> each rep does identical work.
 KERNELS = [
@@ -118,6 +141,8 @@ KERNELS = [
     ("trans",  trans_dsl,  (A_B2, B_B2)),
     ("deep",   deep_dsl,   (A_B2, B_B2)),
     ("deepar", deepar_dsl, (A_B2, B_B2)),
+    ("idxgrad", idxgrad_dsl, (A_B2,)),         # P1: index/shape symbols (float out)
+    ("intmix",  intmix_dsl,  (AI_B2, BI_B2)),  # P2: integer inputs, float out
 ]
 
 def run(func, ops, backend, dtype=DTYPE):
@@ -153,33 +178,41 @@ def debug_bridge():
         t = time.perf_counter(); bridge(inp, out, 0); best = min(best, (time.perf_counter() - t) * 1000)
     return {"first_ms": first, "warm_ms": best}
 
-def fallback_check():
-    # Default backend must transparently fall back to miniexpr where js can't go, with no
-    # error and the same result. int dtype -> dtype-gated; index symbol -> transpiler rejects.
-    Ai = blosc2.asarray((A_NP * 10).astype(np.int64), chunks=_chunks, blocks=_blocks, cparams=_cp)
-    Bi = blosc2.asarray((B_NP * 10).astype(np.int64), chunks=_chunks, blocks=_blocks, cparams=_cp)
-    int_def = run(int_dsl, (Ai, Bi), "default", dtype=np.int64)
-    int_tcc = run(int_dsl, (Ai, Bi), "tcc", dtype=np.int64)
-    idx_def = run(idx_dsl, (A_B2,), "default")
+def path_check():
+    # The default backend must agree with miniexpr (tcc) on every kernel, whether it runs
+    # the kernel through JS (index symbols, int inputs+float out) or transparently falls back
+    # to miniexpr where JS can't go (int output, unsupported constructs) -- with no error.
+    int_def = run(int_dsl, (AI_B2, BI_B2), "default", dtype=np.int64)  # -> falls back to miniexpr
+    int_tcc = run(int_dsl, (AI_B2, BI_B2), "tcc", dtype=np.int64)
+    intin_def = run(intin_dsl, (AI_B2, BI_B2), "default")              # -> JS (int in, float out)
+    intin_tcc = run(intin_dsl, (AI_B2, BI_B2), "tcc")
+    idx_def = run(idx_dsl, (A_B2,), "default")                        # -> JS (index symbols)
     idx_tcc = run(idx_dsl, (A_B2,), "tcc")
+    unsup_def = run(unsup_dsl, (A_B2, B_B2), "default")               # -> falls back to miniexpr
+    unsup_tcc = run(unsup_dsl, (A_B2, B_B2), "tcc")
     return {
         "int_ok": bool(np.array_equal(int_def, int_tcc)),
+        "intin_ok": bool(np.allclose(intin_def, intin_tcc)),
         "idx_ok": bool(np.allclose(idx_def, idx_tcc)),
+        "unsup_ok": bool(np.allclose(unsup_def, unsup_tcc)),
     }
 
-def result(reps):
+def kernel_names():
+    return json.dumps([name for name, _f, _o in KERNELS])
+
+def bench_kernel(i, reps):
+    # One kernel at a time so the driver can print each row as soon as it is computed.
     import math
-    kernels = []
-    for name, func, ops in KERNELS:
-        rj = run(func, ops, "js")
-        rtcc = run(func, ops, "tcc")
-        diff = float(np.max(np.abs(rj - rtcc)))
-        diff = diff if math.isfinite(diff) else 1e30   # keep JSON valid; flags as mismatch
-        ms = {b: bench(func, ops, b, reps) for b in ("default", "js", "tcc", "nojit")}
-        kernels.append({"name": name, "ms": ms, "diff": diff})
-    return json.dumps({
-        "kernels": kernels, "bridge": debug_bridge(), "fallback": fallback_check(), "reps": reps,
-    })
+    name, func, ops = KERNELS[i]
+    rj = run(func, ops, "js")
+    rtcc = run(func, ops, "tcc")
+    diff = float(np.max(np.abs(rj - rtcc)))
+    diff = diff if math.isfinite(diff) else 1e30   # keep JSON valid; flags as mismatch
+    ms = {b: bench(func, ops, b, reps) for b in ("default", "js", "tcc", "nojit")}
+    return json.dumps({"name": name, "ms": ms, "diff": diff})
+
+def summary():
+    return json.dumps({"bridge": debug_bridge(), "paths": path_check()})
 `;
 
 const py = await loadPyodide();
@@ -203,29 +236,36 @@ console.log("blosc2", await py.runPythonAsync("blosc2.__version__"),
             "| Pyodide", py.version, "| reps", NFRAMES);
 
 py.FS.writeFile("/kernel_bench.py", new TextEncoder().encode(PYSRC));
-const out = await py.runPythonAsync(`
+await py.runPythonAsync(`
 import sys
 if "/" not in sys.path: sys.path.insert(0, "/")
 import kernel_bench
-kernel_bench.result(${NFRAMES})
 `);
 
-const r = JSON.parse(out);
 const fmt = (x, w) => String(x).padStart(w);
-const bad = r.kernels.filter((k) => k.diff > 1e-5);
-console.log(`\ncorrectness (js vs tcc maxdiff): ${bad.length ? "MISMATCH " + bad.map((k) => k.name) : "OK"}`);
-const fb = r.fallback;
-const fbOk = fb.int_ok && fb.idx_ok;
-console.log(`default fallback (no jit_backend): int=${fb.int_ok ? "ok" : "FAIL"} ` +
-            `index-symbol=${fb.idx_ok ? "ok" : "FAIL"}  -> ${fbOk ? "falls back cleanly" : "BROKEN"}`);
+const names = JSON.parse(await py.runPythonAsync("kernel_bench.kernel_names()"));
+
+// Stream the table: print each row as soon as its kernel finishes benchmarking.
 console.log("\nper-kernel bench (ms/frame, lower is better; 'default' = prefer-js w/ fallback,");
 console.log("'tcc' = miniexpr JIT, 'nojit' = miniexpr interpreter):");
 const cols = ["default", "js", "tcc", "nojit", "js/tcc"];
 console.log("  " + "kernel".padEnd(8) + cols.map((c) => fmt(c, 8)).join(""));
-for (const k of r.kernels) {
+const bad = [];
+for (let i = 0; i < names.length; i++) {
+  const k = JSON.parse(await py.runPythonAsync(`kernel_bench.bench_kernel(${i}, ${NFRAMES})`));
   const { default: def, js, tcc, nojit } = k.ms;
   const cells = [def, js, tcc, nojit].map((v) => v.toFixed(1)).concat((tcc / js).toFixed(2) + "x");
   console.log("  " + k.name.padEnd(8) + cells.map((c) => fmt(c, 8)).join(""));
+  if (k.diff > 1e-5) bad.push(k.name);
 }
-console.log(`\nnewton bridge probe (no blosc2 machinery): first=${r.bridge.first_ms.toFixed(1)} ms  warm=${r.bridge.warm_ms.toFixed(1)} ms`);
+
+const s = JSON.parse(await py.runPythonAsync("kernel_bench.summary()"));
+console.log(`\ncorrectness (js vs tcc maxdiff): ${bad.length ? "MISMATCH " + bad : "OK"}`);
+const fb = s.paths;
+const fbOk = fb.int_ok && fb.intin_ok && fb.idx_ok && fb.unsup_ok;
+console.log(`default backend (no jit_backend) vs miniexpr: ` +
+            `int-out=${fb.int_ok ? "ok" : "FAIL"} int-in=${fb.intin_ok ? "ok" : "FAIL"} ` +
+            `index-symbol=${fb.idx_ok ? "ok" : "FAIL"} unsupported=${fb.unsup_ok ? "ok" : "FAIL"}` +
+            `  -> ${fbOk ? "all paths agree" : "BROKEN"}`);
+console.log(`\nnewton bridge probe (no blosc2 machinery): first=${s.bridge.first_ms.toFixed(1)} ms  warm=${s.bridge.warm_ms.toFixed(1)} ms`);
 if (bad.length || !fbOk) process.exit(1);
diff --git a/src/blosc2/dsl_js.py b/src/blosc2/dsl_js.py
index 4446cf092..e270ce528 100644
--- a/src/blosc2/dsl_js.py
+++ b/src/blosc2/dsl_js.py
@@ -5,10 +5,16 @@
 interpreter ~11x. See plans/dsl-js.md.
 
 Public API:
-    dsl_to_js(kernel)  -> (js_source, param_names)   # pure stdlib, runs anywhere
-    js_kernel(kernel)  -> callable for lazyudf(...)   # needs Pyodide `js` to *run*
+    dsl_to_js(kernel)         -> (js_source, param_names)  # pure stdlib, runs anywhere
+    build_js_module(k, ndim)  -> js_source string          # ndim needed for index symbols
+    js_kernel(kernel, shape)  -> callable for lazyudf(...)  # needs Pyodide `js` to *run*
 
 `kernel` may be a blosc2 DSLKernel (has .dsl_source), a plain function, or a source string.
+
+Kernels may use index/shape symbols (`_i0`/`_i1`/.., `_n0`/.., `_flat_idx`); they become
+trailing kernel parameters and the runtime driver reconstructs each element's global
+coordinate per block. That requires the output rank, so `build_js_module`/`js_kernel` need
+`ndim`/`shape`; without them, index-symbol kernels raise (and the caller falls back).
 """
 
 from __future__ import annotations
@@ -21,7 +27,9 @@
 # Wired into lazyexpr via jit_backend="js": a DSL kernel is transpiled here and run as a
 # plain per-block callable. Browser/Pyodide only (js_kernel imports `js` at call time).
 
-_INDEX_SYMBOLS = {"_i0", "_i1", "_i2", "_n0", "_n1", "_n2", "_flat_idx"}
+# Canonical signature order for index/shape symbols passed to the transpiled kernel.
+_INDEX_SYMBOL_ORDER = ("_i0", "_i1", "_i2", "_n0", "_n1", "_n2", "_flat_idx")
+_INDEX_SYMBOLS = set(_INDEX_SYMBOL_ORDER)
 
 # numpy/math function name -> JS Math.* name (numpy aliases included).
 _MATH = {
@@ -115,21 +123,23 @@ def _get_source(obj) -> str:
 class _Transpiler:
     def transpile(self, func: ast.FunctionDef):
         self.params = [a.arg for a in func.args.args]
-        self._reject_index_symbols(func)
+        used_index = self._collect_index_symbols(func)
         hoist = self._hoist_names(func)
         body = self._block(func.body, 1)
-        head = f"function {func.name}({', '.join(self.params)}) {{\n"
+        # Index/shape symbols (`_i0`, `_n1`, `_flat_idx`, ...) become extra trailing
+        # parameters; the runtime driver computes them per element (see _module_with_index).
+        sig = self.params + used_index
+        head = f"function {func.name}({', '.join(sig)}) {{\n"
         decl = f"  let {', '.join(sorted(hoist))};\n" if hoist else ""
-        return head + decl + body + "}", list(self.params)
+        return head + decl + body + "}", list(self.params), used_index
 
     # -- scope analysis -------------------------------------------------
-    def _reject_index_symbols(self, func):
-        for node in ast.walk(func):
-            if isinstance(node, ast.Name) and node.id in _INDEX_SYMBOLS:
-                raise _DSLToJSError(
-                    f"index/shape symbol '{node.id}' is not supported yet (MVP); "
-                    "see plans/dsl-js.md 'Deferred'"
-                )
+    def _collect_index_symbols(self, func):
+        """Return the index/shape symbols the kernel references, in canonical order."""
+        used = {
+            node.id for node in ast.walk(func) if isinstance(node, ast.Name) and node.id in _INDEX_SYMBOLS
+        }
+        return [s for s in _INDEX_SYMBOL_ORDER if s in used]
 
     def _hoist_names(self, func):
         assigned, fortargets = set(), set()
@@ -302,20 +312,101 @@ def _const(v) -> str:
     raise _DSLToJSError(f"unsupported constant: {v!r}")
 
 
-def dsl_to_js(kernel):
-    """Transpile a DSL kernel to a JS function string. Returns (js_source, param_names)."""
-    tree = ast.parse(_get_source(kernel))
+# Transpiling is a pure function of the kernel source, so memoize it: the same kernel is
+# typically re-transpiled on every lazyudf evaluation (e.g. an animation loop rebuilds the
+# expression each frame), and ast.parse + codegen is non-trivial under WASM.
+_TRANSPILE_CACHE: dict[str, tuple] = {}
+
+# module string -> the V8-compiled `__run` function (a Pyodide JsProxy). Populated only when
+# a kernel actually runs in-browser (see js_kernel's bridge); empty off-WASM.
+_RUN_CACHE: dict = {}
+
+
+def _transpile(kernel):
+    """Transpile *kernel* to JS. Returns (js_source, params, index_symbols, func_name)."""
+    src = _get_source(kernel)
+    hit = _TRANSPILE_CACHE.get(src)
+    if hit is not None:
+        return hit
+    tree = ast.parse(src)
     func = next((n for n in tree.body if isinstance(n, ast.FunctionDef)), None)
     if func is None:
         raise _DSLToJSError("no function definition found in DSL source")
-    return _Transpiler().transpile(func)
+    js_src, params, used_index = _Transpiler().transpile(func)
+    result = (js_src, params, used_index, func.name)
+    _TRANSPILE_CACHE[src] = result
+    return result
+
+
+def dsl_to_js(kernel):
+    """Transpile a DSL kernel to a JS function string. Returns (js_source, param_names)."""
+    js_src, params, _used, _name = _transpile(kernel)
+    return js_src, params
+
+
+def _max_index_dim(used_index) -> int:
+    """Highest axis referenced by `_iK`/`_nK` symbols (-1 if only `_flat_idx`/none)."""
+    dims = [int(s[2:]) for s in used_index if s != "_flat_idx"]
+    return max(dims) if dims else -1
+
+
+def _op_call_args(nparams):
+    return [f"(isarr[{k}] ? ops[{k}][i] : ops[{k}])" for k in range(nparams)]
+
+
+def _module_with_index(kernel_js, fname, params, used_index) -> str:
+    """Driver for kernels that use index/shape symbols.
+
+    Signature ``__run(ops, isarr, out, n, off, gshape, cshape)``: `off` is the block's
+    global start coord, `gshape` the whole-array shape, `cshape` the block shape (all JS
+    arrays).  Each element's local coord is unravelled from its flat position (C-order),
+    then the referenced symbols are derived and passed as trailing kernel args."""
+    decls = []
+    for s in used_index:
+        if s == "_flat_idx":
+            continue
+        k = int(s[2:])
+        rhs = f"off[{k}] + loc[{k}]" if s.startswith("_i") else f"gshape[{k}]"
+        decls.append(f"    const {s} = {rhs};")
+    if "_flat_idx" in used_index:
+        decls.append("    let _flat_idx = 0;")
+        decls.append(
+            "    for (let k = 0; k < d; k++) _flat_idx = _flat_idx * gshape[k] + (off[k] + loc[k]);"
+        )
+    call = f"{fname}({', '.join(_op_call_args(len(params)) + used_index)})"
+    driver = "\n".join(
+        [
+            "function __run(ops, isarr, out, n, off, gshape, cshape) {",
+            "  const d = cshape.length;",
+            "  const loc = new Array(d);",
+            "  for (let i = 0; i < n; i++) {",
+            "    let rem = i;",
+            "    for (let k = d - 1; k >= 0; k--) { loc[k] = rem % cshape[k]; rem = (rem - loc[k]) / cshape[k]; }",
+            *decls,
+            f"    out[i] = {call};",
+            "  }",
+            "}",
+        ]
+    )
+    return f"{JS_PRELUDE}\n{kernel_js}\n{driver}\nreturn __run;"
 
 
-def build_js_module(kernel) -> str:
-    """Self-contained JS: prelude + kernel + an `__run(ops, isarr, out, n)` element driver."""
-    kernel_js, params = dsl_to_js(kernel)
-    fname = ast.parse(_get_source(kernel)).body[0].name
-    call_args = ", ".join(f"(isarr[{k}] ? ops[{k}][i] : ops[{k}])" for k in range(len(params)))
+def build_js_module(kernel, ndim: int | None = None) -> str:
+    """Self-contained JS: prelude + kernel + a runtime element driver returning ``__run``.
+
+    Kernels without index/shape symbols get a flat ``__run(ops, isarr, out, n)`` driver.
+    Kernels that use `_i0`/`_n0`/`_flat_idx` get the index-aware driver (see
+    :func:`_module_with_index`) and require *ndim* (the output rank) so the referenced
+    axes can be validated; ``ndim=None`` raises for such kernels."""
+    kernel_js, params, used_index, fname = _transpile(kernel)
+    if used_index:
+        if ndim is None:
+            raise _DSLToJSError("kernel uses index/shape symbols; the output ndim must be supplied")
+        max_dim = _max_index_dim(used_index)
+        if max_dim >= ndim:
+            raise _DSLToJSError(f"kernel references axis {max_dim} but the output is {ndim}-D")
+        return _module_with_index(kernel_js, fname, params, used_index)
+    call_args = ", ".join(_op_call_args(len(params)))
     driver = (
         f"function __run(ops, isarr, out, n) {{ "
         f"for (let i = 0; i < n; i++) out[i] = {fname}({call_args}); }}"
@@ -323,9 +414,16 @@ def build_js_module(kernel) -> str:
     return f"{JS_PRELUDE}\n{kernel_js}\n{driver}\nreturn __run;"
 
 
-def js_kernel(kernel):
-    """Return a lazyudf-compatible callable that runs the transpiled JS (Pyodide only)."""
-    module = build_js_module(kernel)
+def js_kernel(kernel, shape=None):
+    """Return a lazyudf-compatible callable that runs the transpiled JS (Pyodide only).
+
+    *shape* is the whole-array output shape; it is required for kernels that use index/shape
+    symbols (so the driver knows the rank and global geometry) and ignored otherwise."""
+    ndim = len(shape) if shape is not None else None
+    _, _, used_index, _ = _transpile(kernel)  # cached
+    module = build_js_module(kernel, ndim=ndim)
+    uses_index = bool(used_index)
+    gshape = tuple(int(s) for s in shape) if shape is not None else None
     run = None  # lazily created in-browser
 
     def bridge(inputs, output, offset=None):
@@ -334,9 +432,15 @@ def bridge(inputs, output, offset=None):
         from js import Array, Float64Array, Uint8Array  # Pyodide
 
         if run is None:
-            import js
+            # Reuse the V8-compiled function across lazyudf evaluations of the same kernel:
+            # the module is a pure function of (source, ndim), so js.eval need run only once
+            # per distinct module instead of once per frame.
+            run = _RUN_CACHE.get(module)
+            if run is None:
+                import js
 
-            run = js.eval(f"(function() {{ {module} }})()")
+                run = js.eval(f"(function() {{ {module} }})()")
+                _RUN_CACHE[module] = run
 
         n = int(output.size)
         # Pass real JS Arrays, not Python lists: a Python list arrives in JS as a PyProxy,
@@ -355,7 +459,19 @@ def bridge(inputs, output, offset=None):
                 ops.push(float(x))
                 isarr.push(False)
         out_js = Float64Array.new(n)
-        run(ops, isarr, out_js, n)
+        if uses_index:
+            off = offset if offset is not None else (0,) * output.ndim
+            run(
+                ops,
+                isarr,
+                out_js,
+                n,
+                _to_jsint(off, Array),
+                _to_jsint(gshape, Array),
+                _to_jsint(output.shape, Array),
+            )
+        else:
+            run(ops, isarr, out_js, n)
         # ponytail: per-block to_js()/to_bytes() copies; swap to a zero-copy HEAPF64 view
         # onto WASM linear memory only if marshaling shows up as the bottleneck.
         res = np.frombuffer(bytes(out_js.to_bytes()), dtype=np.float64)
@@ -370,3 +486,11 @@ def _to_jsf64(xf, Float64Array, Uint8Array):
     u8 = Uint8Array.new(xf.nbytes)
     u8.assign(xf.tobytes())  # Pyodide TypedArray.assign(buffer) copies bytes in
     return Float64Array.new(u8.buffer)
+
+
+def _to_jsint(seq, Array):
+    """Small geometry vector (offset/shape) -> a real JS Array of ints (avoids PyProxy)."""
+    arr = Array.new()
+    for v in seq:
+        arr.push(int(v))
+    return arr
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index be9edb11e..a7998dd99 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1448,31 +1448,48 @@ def _is_dsl_kernel_expression(expression) -> bool:
     return isinstance(expression, DSLKernel) and expression.dsl_source is not None
 
 
-def _as_js_udf(expression):
+def _as_js_udf(expression, shape=None):
     """For jit_backend="js": transpile a DSL kernel to JS and return a plain per-block
-    callable (so the normal UDF path runs it). Browser/Pyodide only."""
+    callable (so the normal UDF path runs it). Browser/Pyodide only.
+
+    *shape* (the whole-array output shape) is forwarded to the transpiler so kernels using
+    index/shape symbols can reconstruct global coordinates per block."""
     if not _is_dsl_kernel_expression(expression):
         raise ValueError('jit_backend="js" requires a blosc2.dsl_kernel-decorated kernel')
     if not blosc2.IS_WASM:
         raise RuntimeError('jit_backend="js" is only available under WebAssembly/Pyodide')
     from .dsl_js import js_kernel
 
-    return js_kernel(expression)
+    return js_kernel(expression, shape=shape)
 
 
 def _js_dtypes_ok(operands, kwargs) -> bool:
-    """True only if the JS bridge (which computes in float64) is safe for these operands:
-    floating-point NDArray inputs and a floating/inferred output dtype. Integer/complex go
-    to miniexpr instead (float64 can't represent int64 exactly)."""
+    """True only if the JS bridge (which computes in float64) is safe for these operands.
+
+    The output dtype must be floating: integer/complex *output* goes to miniexpr (the bridge
+    can't reproduce integer division/overflow/truncation semantics, and float64 can't hold
+    int64 exactly).  Given a floating output, integer *inputs* are fine -- the bridge converts
+    every operand to float64, which is exactly what miniexpr does when promoting integer inputs
+    for a float result (so any values above 2**53 lose precision identically).  Complex inputs
+    are rejected (the bridge is real-only)."""
     dt = kwargs.get("dtype")
-    if dt is not None and not np.issubdtype(np.dtype(dt), np.floating):
+    if dt is None:
+        # Inferred output: only safe when all operands are float (so the output is float too).
+        return all(
+            np.issubdtype(op.dtype, np.floating)
+            for op in operands.values()
+            if isinstance(op, blosc2.NDArray)
+        )
+    if not np.issubdtype(np.dtype(dt), np.floating):
         return False
     return all(
-        np.issubdtype(op.dtype, np.floating) for op in operands.values() if isinstance(op, blosc2.NDArray)
+        np.issubdtype(op.dtype, np.floating) or np.issubdtype(op.dtype, np.integer)
+        for op in operands.values()
+        if isinstance(op, blosc2.NDArray)
     )
 
 
-def _maybe_js_backend(expression, jit, jit_backend, reduce_args, operands, kwargs):
+def _maybe_js_backend(expression, jit, jit_backend, reduce_args, operands, kwargs, shape=None):
     """Resolve the JS backend for a DSL kernel.
 
     - ``jit_backend="js"`` (explicit): transpile to the JS bridge, or raise if it can't.
@@ -1483,26 +1500,31 @@ def _maybe_js_backend(expression, jit, jit_backend, reduce_args, operands, kwarg
       ``jit=False`` (interpreter), ``strict_miniexpr=True``, or an explicit ``jit_backend``
       opts out.
 
+    *shape* is the whole-array output shape, forwarded to the transpiler for kernels that
+    use index/shape symbols (``_i0``/``_n0``/``_flat_idx``); without it such kernels fall
+    back to miniexpr (or raise when ``jit_backend="js"`` was requested explicitly).
+
     Returns ``(expression, jit, jit_backend)`` — expression becomes a plain per-block
     callable when JS is chosen, else everything passes through unchanged.
     """
     if jit_backend == "js":
         if reduce_args:
             raise ValueError('jit_backend="js" does not support reductions')
-        return _as_js_udf(expression), None, None
+        return _as_js_udf(expression, shape), None, None
     prefer_js = (
         jit is not False  # jit=True/None prefer the best JIT (js); only jit=False forces interpreter
         and jit_backend is None
         and not kwargs.get("strict_miniexpr")  # explicit strict_miniexpr=True keeps miniexpr
         and blosc2.IS_WASM
         and _is_dsl_kernel_expression(expression)
+        and operands  # at least one operand: the zero-input DSL path stays on miniexpr
         and not reduce_args
         and _js_dtypes_ok(operands, kwargs)
     )
     if not prefer_js:
         return expression, jit, jit_backend
     try:
-        bridge = _as_js_udf(expression)  # transpiles; raises on any unsupported construct
+        bridge = _as_js_udf(expression, shape)  # transpiles; raises on any unsupported construct
     except Exception:
         return expression, jit, jit_backend  # fall back to miniexpr, no regression
     return bridge, None, None
@@ -3040,7 +3062,7 @@ def chunked_eval(
         # Resolve the JS backend: explicit jit_backend="js", or prefer-js-with-fallback under
         # WebAssembly when the user left jit_backend unset (see _maybe_js_backend).
         expression, jit, jit_backend = _maybe_js_backend(
-            expression, jit, jit_backend, reduce_args, operands, kwargs
+            expression, jit, jit_backend, reduce_args, operands, kwargs, shape=shape
         )
 
         fast_path = _validate_chunked_eval_inputs(operands, out, shape, reduce_args)
diff --git a/tests/ndarray/test_dsl_js.py b/tests/ndarray/test_dsl_js.py
index 965e63ff2..446421c9a 100644
--- a/tests/ndarray/test_dsl_js.py
+++ b/tests/ndarray/test_dsl_js.py
@@ -124,12 +124,88 @@ def test_transpile_structure():
         assert src.count("{") == src.count("}"), "unbalanced braces"
 
 
-def test_index_symbol_rejected():
-    def uses_index(a):
-        return a + _i0  # noqa: F821
+def test_index_symbols_transpile():
+    # Index/shape symbols become trailing kernel params; the driver gains the geometry args.
+    def ramp(a):
+        return float(_i0) * _n1 + _i1  # noqa: F821
 
-    with pytest.raises(Exception, match="index/shape symbol"):
-        dsl_to_js(uses_index)
+    js_src, params = dsl_to_js(ramp)
+    assert params == ["a"]  # only the user input is reported as a param
+    assert "function ramp(a, _i0, _i1, _n1)" in js_src
+
+    mod = build_js_module(ramp, ndim=2)
+    assert "function __run(ops, isarr, out, n, off, gshape, cshape)" in mod
+    assert "const _i0 = off[0] + loc[0];" in mod
+    assert "const _n1 = gshape[1];" in mod
+
+    # `_flat_idx` pulls in the global-flatten loop.
+    def flat(a):
+        return float(_flat_idx)  # noqa: F821
+
+    flat_mod = build_js_module(flat, ndim=2)
+    assert "_flat_idx = _flat_idx * gshape[k]" in flat_mod
+
+
+def test_index_symbols_need_ndim_and_valid_axis():
+    def ramp(a):
+        return float(_i0) + _i1  # noqa: F821
+
+    # ndim is required to know the rank / validate the referenced axes.
+    with pytest.raises(Exception, match="ndim"):
+        build_js_module(ramp)
+    # axis 1 referenced but the output is 1-D -> rejected.
+    with pytest.raises(Exception, match="axis 1"):
+        build_js_module(ramp, ndim=1)
+
+
+def _run_node_index(module, gshape, off, cshape, ncols=1):
+    """Run an index-aware module over one block and return the (flat) output list."""
+    node = shutil.which("node")
+    if not node:
+        pytest.skip("node not found; skipping JS numeric-equivalence check")
+    n = int(np.prod(cshape))
+    ops = ", ".join([f"new Float64Array({n})"] * ncols)
+    isarr = ", ".join(["true"] * ncols)
+    prog = (
+        f"const __run = (function() {{ {module} }})();\n"
+        f"const out = new Float64Array({n});\n"
+        f"__run([{ops}], [{isarr}], out, {n}, "
+        f"{json.dumps(list(off))}, {json.dumps(list(gshape))}, {json.dumps(list(cshape))});\n"
+        "console.log(JSON.stringify(Array.from(out)));\n"
+    )
+    with tempfile.TemporaryDirectory() as d:
+        script = os.path.join(d, "dsl_js_idx.js")
+        with open(script, "w", encoding="utf-8") as fh:
+            fh.write(prog)
+        res = subprocess.run([node, script], capture_output=True, text=True)
+    if res.returncode != 0:
+        raise AssertionError(f"node failed:\n{res.stderr}")
+    return json.loads(res.stdout)
+
+
+def test_index_ramp_matches_numpy():
+    def ramp(a):
+        return float(_i0) * _n1 + _i1  # noqa: F821
+
+    gshape = (16, 9)
+    expected = np.arange(np.prod(gshape), dtype=np.float64).reshape(gshape)
+    mod = build_js_module(ramp, ndim=2)
+    # A non-origin block exercises the offset handling.
+    off, cshape = (8, 0), (8, 9)
+    got = np.array(_run_node_index(mod, gshape, off, cshape)).reshape(cshape)
+    np.testing.assert_array_equal(got, expected[8:16, :])
+
+
+def test_flat_idx_matches_numpy():
+    def flat(a):
+        return float(_flat_idx) * 2.0  # noqa: F821
+
+    gshape = (16, 9)
+    expected = np.arange(np.prod(gshape), dtype=np.float64).reshape(gshape) * 2.0
+    mod = build_js_module(flat, ndim=2)
+    off, cshape = (4, 3), (3, 4)
+    got = np.array(_run_node_index(mod, gshape, off, cshape)).reshape(cshape)
+    np.testing.assert_array_equal(got, expected[4:7, 3:7])
 
 
 @pytest.mark.skipif(blosc2.IS_WASM, reason="emscripten cannot spawn the node subprocess")
@@ -167,7 +243,7 @@ def _add(a, b):
 
 @blosc2.dsl_kernel
 def _idx(a):
-    return a + float(_i0)  # noqa: F821  index symbol -> transpiler rejects
+    return a + float(_i0)  # noqa: F821  index symbol -> needs the output shape to transpile
 
 
 def test_prefer_js_selection(monkeypatch):
@@ -197,17 +273,29 @@ def sel(jit, jit_backend, operands, kwargs, reduce_args=None):
     assert callable(expr)
     assert not lx._is_dsl_kernel_expression(expr)
 
-    # integer dtype, reductions -> fall back to miniexpr
+    # integer *output*, reductions -> fall back to miniexpr
     assert sel(None, None, {"a": ai, "b": ai}, {"dtype": np.int64})[0] is _add
     assert sel(None, None, {"a": af}, {}, reduce_args={"op": "sum"})[0] is _add
 
+    # zero-input DSL stays on miniexpr (the zero-input fast path needs the DSL kernel)
+    assert sel(None, None, {}, {"dtype": np.float64})[0] is _add
+
+    # integer *inputs* with a float output -> JS (the bridge float64-converts operands)
+    expr, *_ = sel(None, None, {"a": ai, "b": ai}, {"dtype": np.float64})
+    assert callable(expr)
+    assert not lx._is_dsl_kernel_expression(expr)
 
-def test_prefer_js_falls_back_on_untranspilable(monkeypatch):
+
+def test_prefer_js_index_needs_shape(monkeypatch):
     monkeypatch.setattr(blosc2, "IS_WASM", True)
     af = blosc2.asarray(np.ones((4, 4), dtype=np.float64))
-    # _idx uses an index symbol the transpiler rejects -> default must fall back, not raise.
+    # Without a shape the transpiler can't size the index symbols -> fall back to miniexpr.
     expr, _, _ = lx._maybe_js_backend(_idx, None, None, {}, {"a": af}, {"dtype": np.float64})
     assert expr is _idx
+    # With the output shape supplied, the index kernel transpiles and JS is chosen.
+    expr, _, _ = lx._maybe_js_backend(_idx, None, None, {}, {"a": af}, {"dtype": np.float64}, shape=(4, 4))
+    assert callable(expr)
+    assert not lx._is_dsl_kernel_expression(expr)
 
 
 @pytest.mark.skipif(blosc2.IS_WASM, reason="this test asserts off-WASM behavior")
diff --git a/tests/ndarray/test_wasm_dsl_jit.py b/tests/ndarray/test_wasm_dsl_jit.py
index 28c988da4..1dee30e5e 100644
--- a/tests/ndarray/test_wasm_dsl_jit.py
+++ b/tests/ndarray/test_wasm_dsl_jit.py
@@ -16,11 +16,21 @@ def _wasm_kernel(x, y):
     return (x + y) * 1.5 - 0.25
 
 
-@blosc2.dsl_kernel  # integer kernel -> the float64 JS bridge is unsafe -> must fall back to miniexpr
+@blosc2.dsl_kernel  # integer *output* -> the float64 JS bridge is unsafe -> must fall back to miniexpr
 def _wasm_int_kernel(x, y):
     return x * 2 + y * 3
 
 
+@blosc2.dsl_kernel  # integer *inputs*, float output -> JS is safe (bridge float64-converts operands)
+def _wasm_int_input_kernel(x, y):
+    return (x + y) * 0.5
+
+
+@blosc2.dsl_kernel  # index/shape symbols -> JS reconstructs global coords per block
+def _wasm_ramp_kernel(x):
+    return float(_i0) * _n1 + _i1  # noqa: F821
+
+
 def _wasm_grids():
     a_np = np.linspace(-1.0, 1.0, 64, dtype=np.float64).reshape(8, 8)
     b_np = np.linspace(0.0, 2.0, 64, dtype=np.float64).reshape(8, 8)
@@ -69,6 +79,31 @@ def test_wasm_dsl_default_falls_back_for_int():
     np.testing.assert_array_equal(out[...], ai_np * 2 + bi_np * 3)
 
 
+@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
+def test_wasm_dsl_int_inputs_float_output_via_js():
+    # Integer inputs with a float output: the default must prefer js and stay correct
+    # (the bridge converts every operand to float64, matching miniexpr's promotion).
+    ai_np = np.arange(64, dtype=np.int64).reshape(8, 8)
+    bi_np = ai_np + 1
+    ai = blosc2.asarray(ai_np, chunks=(4, 4), blocks=(2, 2))
+    bi = blosc2.asarray(bi_np, chunks=(4, 4), blocks=(2, 2))
+    out = blosc2.lazyudf(_wasm_int_input_kernel, (ai, bi), dtype=np.float64).compute()
+    np.testing.assert_allclose(out[...], (ai_np + bi_np) * 0.5, rtol=1e-6, atol=1e-8)
+
+
+@pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
+def test_wasm_dsl_index_symbols_via_js():
+    # An index/shape-symbol ramp (multi-chunk, so per-block offsets are exercised) must
+    # transpile to JS and reproduce the global C-order ramp, both by default and explicitly.
+    shape = (8, 8)
+    x = blosc2.asarray(np.zeros(shape, dtype=np.float64), chunks=(4, 4), blocks=(2, 2))
+    expected = np.arange(np.prod(shape), dtype=np.float64).reshape(shape)
+    out_default = blosc2.lazyudf(_wasm_ramp_kernel, (x,), dtype=np.float64).compute()
+    np.testing.assert_array_equal(out_default[...], expected)
+    out_js = blosc2.lazyudf(_wasm_ramp_kernel, (x,), dtype=np.float64).compute(jit_backend="js")
+    np.testing.assert_array_equal(out_js[...], expected)
+
+
 @pytest.mark.skipif(not blosc2.IS_WASM, reason="WASM-only integration test")
 def test_wasm_string_predicates_strict_miniexpr():
     assert getattr(blosc2, "_WASM_MINIEXPR_ENABLED", False)

From 49f9084636dba600b620a8253832354138c24f75 Mon Sep 17 00:00:00 2001
From: Francesc Alted 
Date: Mon, 29 Jun 2026 12:26:16 +0200
Subject: [PATCH 09/13] Tests spawning node via subprocess now skip on WASM
 because emscripten can't do that

---
 tests/ndarray/test_dsl_js.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/ndarray/test_dsl_js.py b/tests/ndarray/test_dsl_js.py
index 446421c9a..2acdc3bf0 100644
--- a/tests/ndarray/test_dsl_js.py
+++ b/tests/ndarray/test_dsl_js.py
@@ -183,6 +183,7 @@ def _run_node_index(module, gshape, off, cshape, ncols=1):
     return json.loads(res.stdout)
 
 
+@pytest.mark.skipif(blosc2.IS_WASM, reason="emscripten cannot spawn the node subprocess")
 def test_index_ramp_matches_numpy():
     def ramp(a):
         return float(_i0) * _n1 + _i1  # noqa: F821
@@ -196,6 +197,7 @@ def ramp(a):
     np.testing.assert_array_equal(got, expected[8:16, :])
 
 
+@pytest.mark.skipif(blosc2.IS_WASM, reason="emscripten cannot spawn the node subprocess")
 def test_flat_idx_matches_numpy():
     def flat(a):
         return float(_flat_idx) * 2.0  # noqa: F821

From e24d1832f5ce735aafa1e956c067aaa0b7f5a21e Mon Sep 17 00:00:00 2001
From: Francesc Alted 
Date: Mon, 29 Jun 2026 12:54:23 +0200
Subject: [PATCH 10/13] Wait until operation is complete before visualizing
 again

---
 tests/b2view/test_basics.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/tests/b2view/test_basics.py b/tests/b2view/test_basics.py
index 4e8a169e5..f4c12a41a 100644
--- a/tests/b2view/test_basics.py
+++ b/tests/b2view/test_basics.py
@@ -96,6 +96,20 @@ async def wait_for_table(pilot) -> None:
     raise AssertionError("data table never finished loading")
 
 
+async def wait_until(pilot, predicate, *, message="condition not met in time") -> None:
+    """Pump the event loop until *predicate* holds.
+
+    Setting ``Input.value`` posts an ``Input.Changed`` that rebuilds dependent widgets
+    asynchronously; a single ``pilot.pause()`` is not always enough on slower/loaded CI
+    (e.g. Windows), so poll until the resulting state settles.
+    """
+    for _ in range(100):
+        await pilot.pause()
+        if predicate():
+            return
+    raise AssertionError(message)
+
+
 async def focus_data_table(pilot) -> DataTable:
     table = pilot.app.query_one("#data-table", DataTable)
     table.focus()
@@ -655,11 +669,11 @@ async def submit_filter(expr: str) -> None:
 
         # Typing narrows the candidate list (substring, case-insensitive).
         app.screen.query_one("#colfilter-input", Input).value = "v1"
-        await pilot.pause()
+        await wait_until(pilot, lambda: sel.option_count == 10, message="list did not narrow")
         assert sel.option_count == 10  # v10..v19
         # Clear the filter again so the first column ('a') is reachable.
         app.screen.query_one("#colfilter-input", Input).value = ""
-        await pilot.pause()
+        await wait_until(pilot, lambda: sel.option_count == ncols, message="list did not reset")
 
         # ↓ moves focus into the list; Space unchecks the highlighted ('a');
         # Enter applies the remaining set.

From 2308af3ad806573d6fce8e946d78b00292a2ea04 Mon Sep 17 00:00:00 2001
From: Francesc Alted 
Date: Mon, 29 Jun 2026 13:19:10 +0200
Subject: [PATCH 11/13] Add new JS coverage plan

---
 plans/dsl-js-coverage.md | 143 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 plans/dsl-js-coverage.md

diff --git a/plans/dsl-js-coverage.md b/plans/dsl-js-coverage.md
new file mode 100644
index 000000000..e9afba07e
--- /dev/null
+++ b/plans/dsl-js-coverage.md
@@ -0,0 +1,143 @@
+# DSL → JS transpiler: coverage gaps & future work
+
+Status of the `blosc2.dsl_js` transpiler (the `jit_backend="js"` path) versus the
+miniexpr + WASM-JIT backend. Everything listed below as *unsupported* currently rides on
+**miniexpr + jit-wasm** instead of the JS bridge.
+
+## Implemented
+
+- **P1 — Index / shape symbols** (`_i0`/`_n0`/`_flat_idx`, ...). The transpiler emits them
+  as trailing kernel params and the runtime driver reconstructs per-block global coordinates
+  from `(off, gshape, cshape)`; see `_module_with_index` in `src/blosc2/dsl_js.py`. The
+  whole-array shape is threaded `chunked_eval → _maybe_js_backend → _as_js_udf → js_kernel`.
+  Requires ≥1 array operand (zero-input DSL kernels stay on miniexpr) and a known output
+  shape; without a shape such kernels fall back. Covered by `tests/ndarray/test_dsl_js.py`
+  (`test_index_*`) and `tests/ndarray/test_wasm_dsl_jit.py::test_wasm_dsl_index_symbols_via_js`.
+- **P2 (input side) — Integer inputs with a floating output.** The JS bridge already
+  float64-converts every operand, which is exactly miniexpr's promotion of integer inputs for
+  a float result (so values above 2**53 lose precision identically). `_js_dtypes_ok`
+  (`src/blosc2/lazyexpr.py`) now admits integer inputs when the output dtype is floating.
+  Integer/complex *output* still goes to miniexpr — see the remaining P2 work below.
+
+## Performance characteristics (and where the residual cost is)
+
+Measured with `bench/js-transpiler/dsl-js-node.mjs` (Pyodide, ms/frame). JS beats miniexpr's
+TinyCC JIT (`tcc`) on **compute-heavy** kernels and lands at parity / slightly behind on
+**compute-light, vectorizable** ones:
+
+```
+kernel    js/tcc
+newton     2.80x   (heavy: loop + complex arithmetic)
+deepar     2.78x
+idxgrad    2.00x   (P1 index symbols)
+deep       1.30x
+trans      0.99x
+intmix     0.87x   (P2 int inputs; light, vectorizable)
+poly       0.86x   (light, vectorizable)
+```
+
+Two cost components matter, and only the second remains:
+
+- **Per-evaluation transpile + `js.eval` (amortized away).** Each `lazyudf` evaluation used to
+  re-parse the kernel AST and re-`eval` the JS module, while miniexpr caches its compiled
+  program by source. Now memoized: `_TRANSPILE_CACHE` (by kernel source) and `_RUN_CACHE` (the
+  V8-compiled `__run`, by module string) in `src/blosc2/dsl_js.py`. This lifted every ratio
+  (e.g. newton 2.20→2.80x, poly 0.77→0.86x) and is a real win for repeated / animation-loop use.
+
+- **Per-block marshaling (the residual).** The bridge copies each block across the Python↔JS
+  boundary: in via `ascontiguousarray(float64) → tobytes → Float64Array`, out via
+  `to_bytes → np.frombuffer`. miniexpr's prefilter computes **in place** with zero copies. For
+  light kernels (~2 ms compute) these two copies are a meaningful fraction with no compute to
+  hide them behind, so JS sits at parity or just behind `tcc` there. For compute-bound kernels
+  (the reason the JS backend exists) it is negligible.
+
+**Future lever — zero-copy block I/O.** Replace the `tobytes`/`frombuffer` copies with a
+`HEAPF64` view onto WASM linear memory so operands/output alias the block buffers (the
+"ponytail" note in `js_kernel`). This would mostly close the gap on marshaling-bound (light)
+kernels but needs care around WASM-heap lifetime/alignment, and does nothing for compute-bound
+kernels — so build it only if a real marshaling-bound workload appears.
+
+## How routing works today
+
+Under WebAssembly with `jit_backend` unset (and `jit != False`, no `strict_miniexpr`),
+blosc2 *prefers* JS for float DSL kernels and **silently falls back to miniexpr+jit-wasm**
+for anything it can't transpile — see `_maybe_js_backend` (`src/blosc2/lazyexpr.py:1475`):
+
+```python
+try:
+    bridge = _as_js_udf(expression, shape)  # transpiles; raises on any unsupported construct
+except Exception:
+    return expression, jit, jit_backend  # fall back to miniexpr, no regression
+```
+
+With an **explicit** `jit_backend="js"`, the same gaps instead **raise** rather than fall
+back (the user asked for JS specifically, so we don't second-guess them).
+
+The JS backend today covers *float64/float32 element-wise scalar kernels* using arithmetic,
+`where`, comparisons, `if/elif/else`, `range` loops, and whitelisted math functions.
+
+## Remaining P2 — Integer *output*
+
+`_js_dtypes_ok` still sends any non-floating *output* dtype to miniexpr, because the JS
+bridge computes in **float64** and can't reproduce integer semantics for the result:
+
+- **Integer division / modulo / truncation**: `//`, `%`, `int(...)` must match C/miniexpr
+  integer rules, not float `Math.floor`/`pymod`.
+- **Overflow / wraparound**: miniexpr wraps at the integer width; float64 doesn't.
+- **int64 range**: float64 can't represent int64 above 2**53 exactly.
+
+Options, in rough order of effort:
+- **int32 and smaller output**: representable exactly in float64; could be allowed for kernels
+  that provably stay within ±2**53 with integer-valued ops and an explicit safe-range /
+  no-overflow contract. Still needs integer-correct `//`/`%`/`int()` codegen.
+- **int64 output**: requires BigInt or a typed-array split-word scheme — significantly more
+  work and likely slower; probably not worth it until a real workload needs it.
+
+## Other unsupported constructs (lower priority)
+
+All of these raise `_DSLToJSError` in the transpiler → fall back (or raise under explicit
+`jit_backend="js"`).
+
+**Reductions** — any `reduce_args` (`sum`, `prod`, …) → miniexpr. Explicit
+`jit_backend="js"` raises `'jit_backend="js" does not support reductions'`. A JS reduction
+path would need a fundamentally different driver (accumulate, not map).
+
+**Statements** — only `Assign, AugAssign, Return, Expr, If, For(range), While, Break,
+Continue` are emitted (`_stmt`, `src/blosc2/dsl_js.py:151`). Not supported:
+- Tuple / multiple / subscript assignment targets — only a single `Name` target is handled
+  (`node.targets[0].id`). `a, b = ...`, `a = b = ...`, `arr[i] = ...` all fail.
+- `with`, nested `def`, `try`, etc.
+
+**Expressions** — only `Name, Constant, UnaryOp, BinOp, BoolOp, Compare, Call` (`_expr`).
+Not supported:
+- Python ternary `a if cond else b` (`ast.IfExp`) — must be written as `where(cond, a, b)`.
+- Chained comparisons `a < b < c` — only `ops[0]`/`comparators[0]` are read.
+- Subscript / indexing, attribute access (except `np.`/`numpy.`/`math.` call targets),
+  tuples, lists, dicts, comprehensions, slices.
+
+**Calls** — only `where`, `int`, `float`, `bool`, and the `_MATH` whitelist (`sin, cos, exp,
+log, sqrt, pow, floor, abs, min/max, …`, see `src/blosc2/dsl_js.py:27`). Any other call name,
+or a call through a non-`np`/`numpy`/`math` target → fall back.
+
+**For-loops** — only `for v in range(...)`. Iterating over arrays/other iterables is
+unsupported.
+
+## Environment gate (by design)
+
+Browser/Pyodide only. `_as_js_udf` raises `RuntimeError` off-WASM (`js_kernel` imports
+Pyodide's `js` at run time). On native/CI, DSL kernels always go to miniexpr+jit.
+
+## Known semantic ceilings (supported, but lossy)
+
+These transpile but with caveats worth tracking, since miniexpr may differ:
+- 64-bit integer bitwise ops degrade to int32 (JS number semantics).
+- `%` uses a Python-sign helper (`pymod`); large-magnitude float edge cases may differ.
+- `range()` with a non-literal step assumes a positive step (loop-direction guess).
+- float64/float32 are the target; exotic dtypes untested.
+
+## See also
+
+- `plans/dsl-js.md` — original design, perf numbers, and the "Deferred" / "Known ceilings"
+  notes this document expands on.
+- `src/blosc2/dsl_js.py` — the transpiler.
+- `src/blosc2/lazyexpr.py` — `_maybe_js_backend`, `_js_dtypes_ok`, `_as_js_udf` (routing).

From 2a0cf282e871b47b6739024ea5f516b9bd0b1a85 Mon Sep 17 00:00:00 2001
From: Francesc Alted 
Date: Mon, 29 Jun 2026 13:34:18 +0200
Subject: [PATCH 12/13] Reject integer/complex output on explicit
 jit_backend="js"

The explicit JS path bypassed the output-dtype check, so an integer/complex
output kernel would silently compute in float64 instead of failing. Raise a
clear ValueError up front (matching the documented "explicit js raises on
gaps" contract); integer inputs with a floating output still use JS. Keeps
integer output entirely on miniexpr. Add a test and update the coverage plan.
---
 plans/dsl-js-coverage.md     |  4 +++-
 src/blosc2/lazyexpr.py       |  8 ++++++++
 tests/ndarray/test_dsl_js.py | 10 ++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/plans/dsl-js-coverage.md b/plans/dsl-js-coverage.md
index e9afba07e..d048c0f7e 100644
--- a/plans/dsl-js-coverage.md
+++ b/plans/dsl-js-coverage.md
@@ -71,7 +71,9 @@ except Exception:
 ```
 
 With an **explicit** `jit_backend="js"`, the same gaps instead **raise** rather than fall
-back (the user asked for JS specifically, so we don't second-guess them).
+back (the user asked for JS specifically, so we don't second-guess them). This includes a
+non-floating *output* dtype: `_maybe_js_backend` raises a clear `ValueError` up front rather
+than letting the float64 bridge silently compute integer/complex output (see below).
 
 The JS backend today covers *float64/float32 element-wise scalar kernels* using arithmetic,
 `where`, comparisons, `if/elif/else`, `range` loops, and whitelisted math functions.
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index a7998dd99..ae67a366f 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1510,6 +1510,14 @@ def _maybe_js_backend(expression, jit, jit_backend, reduce_args, operands, kwarg
     if jit_backend == "js":
         if reduce_args:
             raise ValueError('jit_backend="js" does not support reductions')
+        out_dtype = kwargs.get("dtype")
+        if out_dtype is not None and not np.issubdtype(np.dtype(out_dtype), np.floating):
+            # The JS bridge computes in float64 and cannot reproduce integer/complex output
+            # semantics (division/overflow/truncation); keep those on miniexpr.
+            raise ValueError(
+                'jit_backend="js" requires a floating-point output dtype '
+                f"(got {np.dtype(out_dtype)}); drop jit_backend to use miniexpr"
+            )
         return _as_js_udf(expression, shape), None, None
     prefer_js = (
         jit is not False  # jit=True/None prefer the best JIT (js); only jit=False forces interpreter
diff --git a/tests/ndarray/test_dsl_js.py b/tests/ndarray/test_dsl_js.py
index 2acdc3bf0..ea2a9fad7 100644
--- a/tests/ndarray/test_dsl_js.py
+++ b/tests/ndarray/test_dsl_js.py
@@ -306,3 +306,13 @@ def test_explicit_js_off_wasm_raises():
     assert not blosc2.IS_WASM  # this test runs on a native build
     with pytest.raises(RuntimeError, match="WebAssembly"):
         lx._maybe_js_backend(_add, None, "js", {}, {}, {})
+
+
+def test_explicit_js_integer_output_raises():
+    # Integer/complex output is left to miniexpr; explicit jit_backend="js" must reject it
+    # (the float64 bridge can't reproduce integer semantics) rather than silently compute.
+    af = blosc2.asarray(np.ones((4, 4), dtype=np.float64))
+    with pytest.raises(ValueError, match="floating-point output"):
+        lx._maybe_js_backend(_add, None, "js", {}, {"a": af, "b": af}, {"dtype": np.int64})
+    with pytest.raises(ValueError, match="floating-point output"):
+        lx._maybe_js_backend(_add, None, "js", {}, {"a": af, "b": af}, {"dtype": np.complex128})

From 006687d120ed0af94ea256370611b55dbfaa570e Mon Sep 17 00:00:00 2001
From: Francesc Alted 
Date: Mon, 29 Jun 2026 13:43:37 +0200
Subject: [PATCH 13/13] =?UTF-8?q?Support=20=5Fndim=20index=20symbol=20in?=
 =?UTF-8?q?=20DSL=E2=86=92JS=20transpiler;=20document=20the=20JS=20backend?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 RELEASE_NOTES.md                  | 12 +++++++++++-
 doc/getting_started/dsl_syntax.md | 21 +++++++++++++++++++++
 plans/dsl-js-coverage.md          |  2 +-
 src/blosc2/dsl_js.py              | 10 ++++++----
 tests/ndarray/test_dsl_js.py      |  8 ++++++++
 5 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 4ac8bff3a..2a2c421b4 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -2,7 +2,17 @@
 
 ## Changes from 4.6.0 to 4.6.1
 
-XXX version-specific blurb XXX
+### DSL → JavaScript backend for WebAssembly (`jit_backend="js"`)
+
+- Under WebAssembly/Pyodide, `@blosc2.dsl_kernel` kernels can now be transpiled
+  to JavaScript and run via the browser's JIT.  It is the **default** there for
+  transpilable floating-point kernels (silently falling back to miniexpr for
+  anything it can't handle), and beats the WASM TinyCC JIT on compute-heavy
+  kernels (e.g. ~2.8x on a Newton-fractal kernel).  Request it explicitly with
+  `compute(jit_backend="js")`; outside WebAssembly that raises.
+- Supports index/shape symbols (`_i0`/`_n0`/`_ndim`/`_flat_idx`) and integer inputs
+  with a floating-point output.  Integer/complex *output*, reductions, and
+  unsupported constructs stay on miniexpr.  Native builds are unaffected.
 
 ## Changes from 4.5.1 to 4.6.0
 
diff --git a/doc/getting_started/dsl_syntax.md b/doc/getting_started/dsl_syntax.md
index 2942e94b4..5aa9a2782 100644
--- a/doc/getting_started/dsl_syntax.md
+++ b/doc/getting_started/dsl_syntax.md
@@ -228,6 +228,27 @@ Runtime error examples:
 - Missing return on executed control path
 - While-loop iteration cap exceeded
 
+## Execution backends
+
+A DSL kernel is compiled and run by one of two backends, selected per evaluation
+via the `jit` / `jit_backend` arguments to `compute()` / `__getitem__`:
+
+- **miniexpr** (default on native builds): a runtime JIT (TinyCC, `jit_backend="tcc"`)
+  with an interpreter fallback (`jit=False`). Supports the full DSL described here,
+  including integer/complex dtypes and reductions.
+- **JavaScript** (`jit_backend="js"`): transpiles the kernel to JavaScript and runs it
+  through the browser's JIT. **WebAssembly/Pyodide only** — requesting it elsewhere raises.
+  Under WebAssembly it is also the *default* for eligible kernels (set `jit=False` or
+  `strict_miniexpr=True` to opt out), and silently falls back to miniexpr for anything it
+  cannot handle.
+
+The JavaScript backend computes in float64 and covers floating-point element-wise kernels:
+arithmetic, comparisons, `where`, `if`/`elif`/`else`, `for ... in range(...)`/`while`
+loops, the index/shape symbols (`_i0`/`_n0`/`_ndim`/`_flat_idx`), and the standard math
+functions. It also accepts **integer inputs** when the output dtype is floating. It does
+**not** support integer/complex *output*, reductions, or constructs outside the transpiled
+subset; those stay on miniexpr (or, with an explicit `jit_backend="js"`, raise).
+
 ## Python syntax that is out of DSL scope
 
 These Python features are not part of this DSL:
diff --git a/plans/dsl-js-coverage.md b/plans/dsl-js-coverage.md
index d048c0f7e..32c6b7b1f 100644
--- a/plans/dsl-js-coverage.md
+++ b/plans/dsl-js-coverage.md
@@ -6,7 +6,7 @@ miniexpr + WASM-JIT backend. Everything listed below as *unsupported* currently
 
 ## Implemented
 
-- **P1 — Index / shape symbols** (`_i0`/`_n0`/`_flat_idx`, ...). The transpiler emits them
+- **P1 — Index / shape symbols** (`_i0`/`_n0`/`_ndim`/`_flat_idx`, ...). The transpiler emits them
   as trailing kernel params and the runtime driver reconstructs per-block global coordinates
   from `(off, gshape, cshape)`; see `_module_with_index` in `src/blosc2/dsl_js.py`. The
   whole-array shape is threaded `chunked_eval → _maybe_js_backend → _as_js_udf → js_kernel`.
diff --git a/src/blosc2/dsl_js.py b/src/blosc2/dsl_js.py
index e270ce528..a5179ab29 100644
--- a/src/blosc2/dsl_js.py
+++ b/src/blosc2/dsl_js.py
@@ -28,7 +28,7 @@
 # plain per-block callable. Browser/Pyodide only (js_kernel imports `js` at call time).
 
 # Canonical signature order for index/shape symbols passed to the transpiled kernel.
-_INDEX_SYMBOL_ORDER = ("_i0", "_i1", "_i2", "_n0", "_n1", "_n2", "_flat_idx")
+_INDEX_SYMBOL_ORDER = ("_i0", "_i1", "_i2", "_n0", "_n1", "_n2", "_ndim", "_flat_idx")
 _INDEX_SYMBOLS = set(_INDEX_SYMBOL_ORDER)
 
 # numpy/math function name -> JS Math.* name (numpy aliases included).
@@ -345,8 +345,8 @@ def dsl_to_js(kernel):
 
 
 def _max_index_dim(used_index) -> int:
-    """Highest axis referenced by `_iK`/`_nK` symbols (-1 if only `_flat_idx`/none)."""
-    dims = [int(s[2:]) for s in used_index if s != "_flat_idx"]
+    """Highest axis referenced by `_iK`/`_nK` symbols (-1 if only `_ndim`/`_flat_idx`/none)."""
+    dims = [int(s[2:]) for s in used_index if s[:2] in ("_i", "_n") and s[2:].isdigit()]
     return max(dims) if dims else -1
 
 
@@ -363,11 +363,13 @@ def _module_with_index(kernel_js, fname, params, used_index) -> str:
     then the referenced symbols are derived and passed as trailing kernel args."""
     decls = []
     for s in used_index:
-        if s == "_flat_idx":
+        if s in ("_flat_idx", "_ndim"):
             continue
         k = int(s[2:])
         rhs = f"off[{k}] + loc[{k}]" if s.startswith("_i") else f"gshape[{k}]"
         decls.append(f"    const {s} = {rhs};")
+    if "_ndim" in used_index:
+        decls.append("    const _ndim = d;")
     if "_flat_idx" in used_index:
         decls.append("    let _flat_idx = 0;")
         decls.append(
diff --git a/tests/ndarray/test_dsl_js.py b/tests/ndarray/test_dsl_js.py
index ea2a9fad7..2fe8e98c2 100644
--- a/tests/ndarray/test_dsl_js.py
+++ b/tests/ndarray/test_dsl_js.py
@@ -145,6 +145,14 @@ def flat(a):
     flat_mod = build_js_module(flat, ndim=2)
     assert "_flat_idx = _flat_idx * gshape[k]" in flat_mod
 
+    # _ndim resolves to the runtime block rank (cshape.length).
+    def ndim_k(a):
+        return float(_ndim)  # noqa: F821
+
+    ndim_mod = build_js_module(ndim_k, ndim=3)
+    assert "function ndim_k(a, _ndim)" in ndim_mod
+    assert "const _ndim = d;" in ndim_mod
+
 
 def test_index_symbols_need_ndim_and_valid_axis():
     def ramp(a):