xqlsystems · alxmrs · Jun 27, 2026 · Jun 27, 2026 · Jun 27, 2026 · Jun 28, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -23,6 +23,7 @@ async-stream = "0.3"
 async-trait = "0.1"
 datafusion = { version = "52.0.0" }
 datafusion-ffi = { version = "52.0.0" }
+sqlparser = { version = "0.59", features = ["visitor"] }
 futures = { version = "0.3" }
 pyo3 = { version = "0.26.0", features = ["extension-module"] }
 tokio = { version = "1.46.1", features = ["rt"] }

diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -0,0 +1,63 @@
+# Benchmarks & demos
+
+Standalone scripts that exercise xarray-sql against real data. Each declares its
+own dependencies inline (PEP 723) and points `xarray_sql` at this checkout, so
+they run with no setup:
+
+```bash
+uv run benchmarks/grad_era5.py
+```
+
+## `grad_era5.py` — differentiable SQL over ARCO-ERA5
+
+Demonstrates the autograd feature on a real climate archive
+([ARCO-ERA5](https://github.com/google-research/arco-era5), read anonymously
+from GCS — needs `gcsfs` and network access).
+
+The key idea: a physical quantity is written as an **analytic SQL formula** over
+ERA5 variables, and `grad(...)` differentiates that formula **symbolically**,
+evaluated at every grid cell. Because each row is an independent point, this is
+the relational equivalent of `jax.vmap(jax.grad(f))`. It is *not* a finite-
+difference spatial gradient — `grad(f(u, v), u)` is the exact partial derivative
+of `f`.
+
+Two worked cases, each checked against an analytic reference:
+
+| Quantity | SQL | Derivative | Check |
+| --- | --- | --- | --- |
+| Wind speed | `sqrt(power(u,2) + power(v,2))` | `grad(speed, u) = u/speed` | exact |
+| Saturation vapour pressure | `A*exp(B*tc/(tc+C))` | `grad(e_s, T)` | closed-form Clausius-Clapeyron slope |
+
+Each query round-trips back to an `xarray.Dataset` via `.to_dataset(...)`.
+
+## `grad_descent.py` — gradient descent as one declarative SQL query
+
+Fits a line `y ~= a*x + b` by minimising the mean squared error, with the
+**entire training loop expressed as a single recursive CTE** — no Python
+iteration and no precompiled update rule. `grad(...)` lives *inside* the
+recursion: `params(step, a, b)` starts at one row and each recursion appends the
+next generation, descending along the gradient that `grad` computes from the
+loss formula directly (`AVG(grad(loss, a))` is the relational `d/da (Σ loss) /
+N` — differentiation through the aggregate is just linearity):
+
+```sql
+WITH RECURSIVE params(step, a, b) AS (
+  SELECT 0, 0.0, 0.0
+  UNION ALL
+  SELECT params.step + 1,
+         params.a - lr*AVG(grad(loss, a)),
+         params.b - lr*AVG(grad(loss, b))
+  FROM params CROSS JOIN d WHERE params.step < STEPS
+  GROUP BY params.step, params.a, params.b)
+SELECT * FROM params ORDER BY step
+```
+
+So gradient, update, and iteration are all declarative SQL; the trajectory is
+the rows of one query. The fit matches numpy's least-squares solution.
+Self-contained (no network).
+
+`grad` works inside the recursive CTE because it is differentiated as a SQL
+source-to-source rewrite *before* the query is planned — no Substrait round-trip,
+so no plan-shape restrictions. (If you instead want the derivative as a string to
+embed yourself, `xql.differentiate_sql(loss, "a", cols)` compiles a single
+expression to SQL text.)
diff --git a/benchmarks/grad_descent.py b/benchmarks/grad_descent.py
@@ -0,0 +1,113 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#   "xarray_sql",
+#   "xarray",
+#   "numpy",
+# ]
+#
+# [tool.uv.sources]
+# xarray_sql = { path = "..", editable = true }
+# ///
+"""Gradient descent as a single declarative SQL query.
+
+Fits a line ``y ~= a*x + b`` by minimising the mean squared error — with the
+**entire training loop expressed as one recursive CTE**, no Python iteration and
+no precompiled update rule. ``grad(...)`` lives *inside* the recursion:
+
+    WITH RECURSIVE params(step, a, b) AS (
+      SELECT 0, 0.0, 0.0
+      UNION ALL
+      SELECT params.step + 1,
+             params.a - lr * AVG(grad(loss, a)),
+             params.b - lr * AVG(grad(loss, b))
+      FROM params CROSS JOIN d
+      WHERE params.step < STEPS
+      GROUP BY params.step, params.a, params.b)
+    SELECT step, a, b FROM params ORDER BY step
+
+Each recursion appends the next generation, descending along the gradient that
+``grad`` computes from the loss formula directly. ``AVG(grad(loss, a))`` is the
+relational ``d/da (Σ loss) / N`` — differentiation through the aggregate is just
+linearity. So gradient, update, and iteration are all one declarative query; the
+optimisation trajectory is the rows of that query.
+
+``grad`` is differentiated as a SQL source-to-source rewrite *before* the query
+is planned, so the marker works inside the recursive CTE (and any other query
+shape) with no Substrait round-trip. The loss is written once, as ordinary SQL,
+and the engine differentiates it symbolically — the relational equivalent of
+``jax.vmap(jax.grad(f))``, since each row is an independent evaluation point.
+
+Run standalone:
+
+    uv run benchmarks/grad_descent.py
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import xarray as xr
+
+import xarray_sql as xql
+
+# Per-row loss r^2 with residual r = y - (a*x + b). The columns a, b come from
+# the recursive `params` relation; x, y come from the data table `d`.
+RESIDUAL = "(y - (a * x + b))"
+LOSS = f"{RESIDUAL} * {RESIDUAL}"
+LR = 0.4
+STEPS = 200
+
+
+def main() -> None:
+    rng = np.random.default_rng(0)
+    n = 500
+    x = rng.uniform(0.0, 1.0, n)
+    a_true, b_true = 2.0, -1.0
+    y = a_true * x + b_true + rng.normal(0.0, 0.01, n)
+
+    ctx = xql.XarrayContext()
+    ctx.from_dataset(
+        "d",
+        xr.Dataset(
+            {"x": (("i",), x), "y": (("i",), y)}, coords={"i": np.arange(n)}
+        ),
+        chunks={"i": n},
+    )
+
+    # The entire training loop is one declarative recursive query: each step
+    # appends the next generation, descending along the gradient that grad()
+    # computes from the loss — differentiated inside the recursion itself.
+    trajectory = ctx.sql(
+        f"""
+        WITH RECURSIVE params(step, a, b) AS (
+          SELECT 0 AS step, CAST(0.0 AS DOUBLE) AS a, CAST(0.0 AS DOUBLE) AS b
+          UNION ALL
+          SELECT params.step + 1                        AS step,
+                 params.a - {LR} * AVG(grad({LOSS}, a))  AS a,
+                 params.b - {LR} * AVG(grad({LOSS}, b))  AS b
+          FROM params CROSS JOIN d
+          WHERE params.step < {STEPS}
+          GROUP BY params.step, params.a, params.b
+        )
+        SELECT step, a, b FROM params ORDER BY step
+        """
+    ).to_pandas()
+
+    print("trajectory (every 40th generation):")
+    print(trajectory.iloc[::40].to_string(index=False))
+
+    a, b = float(trajectory["a"].iloc[-1]), float(trajectory["b"].iloc[-1])
+    a_ols, b_ols = np.polyfit(x, y, 1)
+    print(
+        f"\nSQL gradient descent:  a={a:.4f}  b={b:.4f}  ({len(trajectory)} generations)"
+    )
+    print(f"least-squares (numpy): a={a_ols:.4f}  b={b_ols:.4f}")
+    assert abs(a - a_ols) < 1e-2 and abs(b - b_ols) < 1e-2
+    print(
+        "\nOK: a single recursive-CTE query with grad() inside fit the line "
+        "to the OLS solution."
+    )
+
+
+if __name__ == "__main__":
+    main()