From f0f70ff82c744490c65d0a12e8e3496efd7d45df Mon Sep 17 00:00:00 2001 From: ghostiee-11 Date: Wed, 24 Jun 2026 23:27:16 +0530 Subject: [PATCH] chores: add lazy_roundtrip Add benchmarks/geospatial/09_lazy_roundtrip.py showing the SQL to xarray round-trip (to_dataset) is lazy: a chunked to_dataset plus .sel(time=t0) pushes a single WHERE into SQL (1,325 vs 3,869,000 rows; ~0.7 vs ~161 MB) and asserts equal to the xarray reference. Also harden _harness.py: assert_grid_close fails on a partial grid, and measured() stops tracemalloc in a finally. Document case 09 in the suite README. --- benchmarks/geospatial/09_lazy_roundtrip.py | 143 +++++++++++++++++++++ benchmarks/geospatial/README.md | 10 +- benchmarks/geospatial/_harness.py | 20 ++- 3 files changed, 168 insertions(+), 5 deletions(-) create mode 100644 benchmarks/geospatial/09_lazy_roundtrip.py diff --git a/benchmarks/geospatial/09_lazy_roundtrip.py b/benchmarks/geospatial/09_lazy_roundtrip.py new file mode 100644 index 0000000..1b3e21c --- /dev/null +++ b/benchmarks/geospatial/09_lazy_roundtrip.py @@ -0,0 +1,143 @@ +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "xarray-sql", +# "xarray", +# "numpy", +# "pandas", +# "dask", +# "pooch", +# "netCDF4", +# ] +# +# [tool.uv.sources] +# xarray-sql = { path = "../../", editable = true } +# /// +"""Lazy round-trip: the SQL answer comes back as an array without materializing. + +The other cases prove the SQL computes the *same numbers* as xarray. This one +proves the other half of the claim the suite leans on: the round-trip back to +xarray is **lazy**. ``ctx.sql(...).to_dataset()`` hands you a Dataset whose data +is still a query; slicing it (``.sel(time=t0)``) pushes a ``WHERE`` back down +into SQL, so reading one slab reads one slab, not the whole table. + +That is the property the Large Scale Geospatial Benchmarks discussion +(coiled/benchmarks #1545) actually asks about: not "can you express it" but +"does the stack stay light when you point it at a big archive and pull a slice". +Here we make it a number. Three ways to get one timestep out of SQL: + + eager ctx.sql(...).to_pandas() # whole long table + eager to_dataset(chunks=None)[v].sel(time=t0) # whole grid, then slice + lazy to_dataset(chunks={"time": 1})[v].sel(time=t0) # one WHERE, one slab + +All three return the identical slab (asserted against the xarray reference), but +the lazy path materializes one timestep's worth of rows instead of the whole +``time x lat x lon`` product, and its peak memory tracks that. + +Dataset: ``air_temperature`` from ``xarray.tutorial`` (NCEP reanalysis, +2920 x 25 x 53), the dataset the ``to_dataset`` round-trip (#58 / PR #167) was +benchmarked on. Downloads once via pooch; skips cleanly offline. +""" + +from __future__ import annotations + +import xarray as xr + +import xarray_sql as xql + +from _harness import ( + CaseSkipped, + assert_grid_close, + measured, + run_case, + show_result, + show_sql, + timed, +) + +_VAR = "air" + + +def main() -> None: + try: + ds = xr.tutorial.open_dataset("air_temperature") + except Exception as exc: # noqa: BLE001: no network / no pooch cache, skip + raise CaseSkipped( + f"air_temperature tutorial dataset unavailable ({exc})" + ) from exc + + nt, nlat, nlon = ds.sizes["time"], ds.sizes["lat"], ds.sizes["lon"] + full_rows, slab_rows = nt * nlat * nlon, nlat * nlon + print( + f" air_temperature: {nt}x{nlat}x{nlon} " + f"({full_rows:,} cells; one timestep = {slab_rows:,} cells)" + ) + + # Register the grid lazily, one timestep per chunk, so the WHERE the + # round-trip pushes down on .sel(time=t0) prunes to a single slab. + ctx = xql.XarrayContext() + with timed("register air (one timestep per chunk)"): + ctx.from_dataset(_VAR, ds.chunk({"time": 1}), chunks={"time": 1}) + + sql = f'SELECT * FROM "{_VAR}"' + show_sql(sql) + + # The xarray reference: one timestep, the plain-array way. We compare by + # *label* (.sel(time=t0)) rather than position: `SELECT *` has no inherent + # row order, so to_dataset rebuilds the time axis in result order and a + # positional .isel(time=0) could land on a different slab. + t0 = ds["time"].values[0] + dims = ["time", "lat", "lon"] + ref = ds[_VAR].sel(time=t0) + + # (1) Eager via the DataFusion API: materialize the entire long table, then + # pull the one timestep out of the dataframe. + for _ in measured("eager to_pandas() (whole table into RAM)"): + frame = ctx.sql(sql).to_pandas() + eager_df = ( + frame[frame["time"] == t0] + .set_index(["lat", "lon"])[_VAR] + .to_xarray() + ) + + # (2) Eager round-trip: build the whole gridded Dataset, then slice it. + for _ in measured("eager to_dataset(chunks=None) then sel(time=t0)"): + eager_ds = ( + ctx.sql(sql).to_dataset(dims=dims, chunks=None)[_VAR].sel(time=t0) + ) + + # (3) Lazy round-trip: slice first, so only one WHERE'd slab is read. + lazy = ctx.sql(sql).to_dataset(dims=dims, chunks={"time": 1}) + print(f" lazy to_dataset: {_VAR}.chunks = {lazy[_VAR].chunks}") + got = ref # placeholder; the loop below binds it + for _ in measured("lazy sel(time=t0) (single WHERE pushed into SQL)"): + got = lazy[_VAR].sel(time=t0).load() + + # Correctness: every path returns the same slab as the xarray reference. + assert_grid_close("eager to_pandas slab", eager_df, ref) + assert_grid_close("eager to_dataset slab", eager_ds, ref) + assert_grid_close("lazy to_dataset slab", got, ref) + + # Headline: how many rows each path pulled into memory to answer the slice. + # (Peak memory per path is in the ⏱/📊 lines above.) + print("\n Rows materialized to get one timestep, three ways:\n") + print(f" {'path':<36}{'rows in RAM':>14}") + print(f" {'-' * 50}") + print(f" {'eager to_pandas()':<36}{full_rows:>14,}") + print(f" {'eager to_dataset(chunks=None)':<36}{full_rows:>14,}") + print(f" {'lazy to_dataset(chunks=time:1)':<36}{slab_rows:>14,}") + print( + f"\n Lazy path reads {full_rows // slab_rows}x fewer rows " + f"({slab_rows:,} vs {full_rows:,}): the slice became a SQL WHERE." + ) + + show_result(got) + + +if __name__ == "__main__": + raise SystemExit( + run_case( + main, + "Lazy round-trip: SQL slice -> WHERE pushdown (air_temperature)", + ) + ) diff --git a/benchmarks/geospatial/README.md b/benchmarks/geospatial/README.md index abd526f..01e57e0 100644 --- a/benchmarks/geospatial/README.md +++ b/benchmarks/geospatial/README.md @@ -23,11 +23,15 @@ plain-English definition of the operation, and computes the same numbers. | 06 | `06_zonal_vector.py` | rasterize + mask per region | range `JOIN` raster↔regions | | 07 | `07_reproject_udf.py` | per-pixel CRS transform | scalar **UDF** (`reproject()`), à la PostGIS `ST_Transform` | | 08 | `08_regrid_weights.py` | interpolation to a new grid | sparse-weight table `JOIN` + weighted `GROUP BY` | +| 09 | `09_lazy_roundtrip.py` | read one slab from a big array | lazy round-trip: `.sel()` pushes a `WHERE` into SQL | Cases 01–06 show operations that are *natively* relational. Cases 07–08 are the "hardest" array operations — reprojection and regridding — and show where a UDF fits (a per-row coordinate transform) versus where the operation is really a -sparse matrix multiply expressed as a `JOIN`. See +sparse matrix multiply expressed as a `JOIN`. Case 09 steps back from *which* +operation and measures the round-trip itself: that `to_dataset()` is lazy, so +slicing the result reads only the slab asked for, the property that lets these +queries point at an archive far larger than memory. See [`docs/geospatial.md`](../../docs/geospatial.md) for the full narrative, including *where the array paradigm still earns its keep* (generating the interpolation weights — the geometry — which SQL applies but does not compute). @@ -53,6 +57,10 @@ interpolation weights — the geometry — which SQL applies but does not comput reference, not PROJ-vs-PROJ. 08 regrids real **SRTM elevation** (Sierra Nevada) and validates against xarray's bilinear `.interp()`. Both run against Earth Engine using your existing `gcloud` login, and skip cleanly without it. +- **09 lazy round-trip**: `air_temperature` from `xarray.tutorial` (NCEP + reanalysis, 2920×25×53), downloaded once via `pooch`. Small on purpose: it has + to fit in memory the *eager* way so the lazy path has something to beat. Skips + cleanly offline. ## Running diff --git a/benchmarks/geospatial/_harness.py b/benchmarks/geospatial/_harness.py index e901de5..07cc674 100644 --- a/benchmarks/geospatial/_harness.py +++ b/benchmarks/geospatial/_harness.py @@ -185,10 +185,12 @@ def measured(label: str) -> Iterator[None]: tracemalloc.start() tracemalloc.reset_peak() t0 = time.perf_counter() - yield - elapsed = time.perf_counter() - t0 - _, peak = tracemalloc.get_traced_memory() - tracemalloc.stop() + try: + yield + finally: + elapsed = time.perf_counter() - t0 + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() if i >= warmup: times.append(elapsed) peak_max = max(peak_max, peak) @@ -219,6 +221,16 @@ def assert_grid_close( Helper coordinates xarray attaches along the way (e.g. the ``hour`` label a ``groupby("time.hour")`` leaves behind) are dropped before comparing. """ + short = { + d: (got.sizes[d], ref.sizes[d]) + for d in ref.dims + if d in got.sizes and got.sizes[d] != ref.sizes[d] + } + if short: + raise AssertionError( + f"{name}: SQL result does not cover the reference grid " + f"(dim: got vs ref = {short}); the comparison would be partial" + ) aligned = ref.reindex_like(got).transpose(*got.dims) extra = [c for c in aligned.coords if c not in got.coords] aligned = aligned.drop_vars(extra)