Skip to content

Commit 6116d9c

Browse files
shoyer and Xarray-Beam authors
authored and committed
Allow Dataset.rechunk to change split_vars.
This is convenient because the optimal ordering of splitting and rechunking is not obvious. Also make consolidate_variables() and split_variables() no-ops when appropriate. PiperOrigin-RevId: 816805573
1 parent 04cbe92 commit 6116d9c

3 files changed

Lines changed: 66 additions & 28 deletions

File tree

xarray_beam/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,4 @@
5555
DatasetToZarr as DatasetToZarr,
5656
)
5757

58-
__version__ = '0.10.4' # automatically synchronized to pyproject.toml
58+
__version__ = '0.10.5' # automatically synchronized to pyproject.toml

xarray_beam/_src/dataset.py

Lines changed: 41 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -792,6 +792,7 @@ def map_blocks(
792792
def rechunk(
793793
self,
794794
chunks: UnnormalizedChunks,
795+
split_vars: bool | None = None,
795796
min_mem: int | None = None,
796797
max_mem: int = 2**30,
797798
) -> Dataset:
@@ -801,18 +802,24 @@ def rechunk(
801802
chunks: new chunk sizes, either a dict mapping from dimension name to
802803
chunk size, or any value that can be passed to
803804
:py:func:`xarray_beam.normalize_chunks`.
805+
split_vars: whether variables should be split across chunks in the
806+
ptransform, or all stored in the same chunks. By default, the current
807+
value of ``split_vars`` is preserved.
804808
min_mem: optional minimum memory usage for an intermediate chunk in
805809
rechunking. Defaults to ``max_mem/100``.
806-
max_mem: optional maximum memory usage ffor an intermediate chunk in
810+
max_mem: optional maximum memory usage for an intermediate chunk in
807811
rechunking. Defaults to 1GB.
808812
809813
Returns:
810814
New Dataset with updated chunks.
811815
"""
816+
if split_vars is None:
817+
split_vars = self.split_vars
818+
812819
chunks = normalize_chunks(
813820
chunks,
814821
self.template,
815-
split_vars=self.split_vars,
822+
split_vars=split_vars,
816823
previous_chunks=self.chunks,
817824
)
818825
label = _get_label('rechunk')
@@ -823,31 +830,43 @@ def rechunk(
823830
# Rechunking can be performed by re-reading the source dataset with new
824831
# chunks, rather than using a separate rechunking transform.
825832
ptransform = core.DatasetToChunks(
826-
self.ptransform.dataset, chunks, self.split_vars
833+
self.ptransform.dataset, chunks, split_vars
827834
)
828835
ptransform.label = _concat_labels(self.ptransform.label, label)
829-
else:
830-
# Need to do a full rechunking.
831-
rechunk_transform = rechunk.Rechunk(
832-
self.sizes,
833-
self.chunks,
834-
chunks,
835-
itemsize=self.itemsize,
836-
min_mem=min_mem,
837-
max_mem=max_mem,
838-
)
839-
ptransform = self.ptransform | label >> rechunk_transform
840-
return type(self)(self.template, chunks, self.split_vars, ptransform)
836+
return type(self)(self.template, chunks, split_vars, ptransform)
837+
838+
# Need to do a full rechunking.
839+
# If also splitting variables, do that first because smaller itemsize allows
840+
much more flexibility for rechunking. If consolidating, do that afterwards.
841+
prechunked = self.split_variables() if split_vars else self
842+
rechunk_transform = rechunk.Rechunk(
843+
prechunked.sizes,
844+
prechunked.chunks,
845+
chunks,
846+
itemsize=prechunked.itemsize,
847+
min_mem=min_mem,
848+
max_mem=max_mem,
849+
)
850+
ptransform = prechunked.ptransform | label >> rechunk_transform
851+
rechunked = type(self)(
852+
self.template, chunks, prechunked.split_vars, ptransform
853+
)
854+
result = rechunked if split_vars else rechunked.consolidate_variables()
855+
return result
841856

842857
def split_variables(self) -> Dataset:
843858
"""Split variables in this Dataset into separate chunks."""
859+
if self.split_vars:
860+
return self
844861
split_vars = True
845862
label = _get_label('split_vars')
846863
ptransform = self.ptransform | label >> rechunk.SplitVariables()
847864
return type(self)(self.template, self.chunks, split_vars, ptransform)
848865

849866
def consolidate_variables(self) -> Dataset:
850867
"""Consolidate variables in this Dataset into a single chunk."""
868+
if not self.split_vars:
869+
return self
851870
split_vars = False
852871
label = _get_label('consolidate_vars')
853872
ptransform = self.ptransform | label >> rechunk.ConsolidateVariables()
@@ -884,17 +903,13 @@ def mean(
884903
)
885904
new_chunks = {k: v for k, v in self.chunks.items() if k not in dims}
886905
label = _get_label(f"mean_{'_'.join(dims)}")
887-
ptransform = (
888-
self.ptransform
889-
| label
890-
>> combiners.MultiStageMean(
891-
dims=dims,
892-
skipna=skipna,
893-
dtype=dtype,
894-
chunks=self.chunks,
895-
sizes=self.sizes,
896-
itemsize=self.itemsize,
897-
)
906+
ptransform = self.ptransform | label >> combiners.MultiStageMean(
907+
dims=dims,
908+
skipna=skipna,
909+
dtype=dtype,
910+
chunks=self.chunks,
911+
sizes=self.sizes,
912+
itemsize=self.itemsize,
898913
)
899914
return type(self)(template, new_chunks, self.split_vars, ptransform)
900915

xarray_beam/_src/dataset_test.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1017,7 +1017,7 @@ def test_rechunk_from_zarr_without_ptransform(self):
10171017
actual = rechunked_ds.collect_with_direct_runner()
10181018
xarray.testing.assert_identical(actual, source)
10191019

1020-
def test_rechunk_split_vars(self):
1020+
def test_rechunk_with_existing_split_vars(self):
10211021
source = xarray.Dataset({
10221022
'foo': (('x', 'y'), np.arange(20).reshape(10, 2)),
10231023
'bar': ('x', np.arange(10)),
@@ -1030,6 +1030,29 @@ def test_rechunk_split_vars(self):
10301030
actual = rechunked_ds.collect_with_direct_runner()
10311031
xarray.testing.assert_identical(actual, source)
10321032

1033+
@parameterized.product(
1034+
load_split=[False, True],
1035+
target_split=[False, True],
1036+
insert_intermediate=[False, True],
1037+
)
1038+
def test_rechunk_and_split(
1039+
self, load_split, target_split, insert_intermediate
1040+
):
1041+
source = xarray.Dataset({
1042+
'foo': (('x', 'y'), np.arange(20).reshape(4, 5)),
1043+
'bar': (('x', 'y'), -np.arange(20).reshape(4, 5)),
1044+
})
1045+
beam_ds = xbeam.Dataset.from_xarray(
1046+
source, {'x': 5, 'y': 2}, split_vars=load_split
1047+
)
1048+
if insert_intermediate:
1049+
beam_ds = beam_ds.map_blocks(lambda ds: ds)
1050+
rechunked_ds = beam_ds.rechunk({'x': 2, 'y': 1}, split_vars=target_split)
1051+
self.assertEqual(rechunked_ds.chunks, {'x': 2, 'y': 1})
1052+
self.assertEqual(rechunked_ds.split_vars, target_split)
1053+
actual = rechunked_ds.collect_with_direct_runner()
1054+
xarray.testing.assert_identical(actual, source)
1055+
10331056

10341057
class EndToEndTest(test_util.TestCase):
10351058

0 commit comments

Comments
 (0)