Skip to content

Commit 0d2f958

Browse files
shoyer and Xarray-Beam authors
authored and committed
zarr_chunks_per_shard
PiperOrigin-RevId: 813445392
1 parent 9257375 commit 0d2f958

3 files changed

Lines changed: 129 additions & 3 deletions

File tree

docs/high-level.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@
166166
},
167167
"cell_type": "markdown",
168168
"source": [
169-
"Weather/climate datasets are typically generated and stored in pancake chunks, but pencil chunks are more useful for most analytics queries, which requires large histories of weather at a single location. Intermediate \"compromise\" chunks can sometimes be a good idea, although if performance and flexibility are critical it may be worth storing multiple copies of your data in different formats.\n",
169+
"Weather/climate datasets are typically generated and stored in pancake chunks, but pencil chunks are more useful for most analytics queries, which requires large histories of weather at a single location. Intermediate \"compromise\" chunks can sometimes be a good idea, although if performance and flexibility are critical it may be worth storing multiple copies of your data in different formats. Using Zarr v3's sharding feature to group smaller chunks into shards can also help mitigate the challenges of picking an optimal chunk size.\n",
170170
"\n",
171171
"Using the right chunks is *absolutely essential* for efficient operations with Xarray-Beam and Zarr. For example, reading data from a single location across all times (a \"pencil\" query) is extremely inefficient for a dataset stored in \"pancake\" chunks -- it would require loading the entire dataset from disk!\n",
172172
"\n",

xarray_beam/_src/dataset.py

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -257,12 +257,70 @@ def _check_shards_or_chunks(
257257
def to_zarr(
258258
self,
259259
path: str,
260+
*,
261+
zarr_chunks_per_shard: Mapping[str, int] | None = None,
260262
zarr_chunks: Mapping[str, int] | None = None,
261263
zarr_shards: Mapping[str, int] | None = None,
262264
zarr_format: int | None = None,
263265
) -> beam.PTransform:
264-
"""Write to a Zarr file."""
265-
if zarr_chunks is None:
266+
"""Write this dataset to a Zarr file.
267+
268+
The extensive options for controlling chunking and sharding are intended for
269+
power users:
270+
271+
* If you are happy with the existing chunk sizes of your dataset
272+
and just want to write it to disk, you can omit all of them.
273+
* Consider specifying only ``zarr_chunks_per_shard`` to allow for more
274+
flexible efficient reading of data from disk. This allows for dividing
275+
dataset chunks into much smaller Zarr chunks on disk, with each chunk
276+
stored in a single Zarr shard.
277+
278+
Args:
279+
path: path to write to.
280+
zarr_chunks_per_shard: If provided, write this dataset into Zarr shards,
281+
each with at most this many Zarr chunks per shard (requires Zarr v3).
282+
zarr_chunks: Explicit chunk sizes to use for storing data in Zarr, as an
283+
alternative to specifying ``zarr_chunks_per_shard``. Zarr chunk sizes
284+
must evenly divide the existing chunk sizes of this dataset.
285+
zarr_shards: Explicit shards to use for storing data in Zarr, which must
286+
evenly divide the existing chunk sizes of this dataset, and be even
287+
multiples of chunk sizes. Requires Zarr v3. By default, Zarr sharding is
288+
not used unless ``zarr_chunks_per_shard`` is provided, in which case
289+
Zarr shards default to the chunk sizes of this dataset.
290+
zarr_format: optional integer specifying the explicit Zarr format to use.
291+
Defaults to Zarr v3 if using shards, or the default format for your
292+
installed version of Zarr.
293+
294+
Returns:
295+
Beam PTransform that writes the dataset to a Zarr file.
296+
"""
297+
if zarr_chunks_per_shard is not None:
298+
if zarr_chunks is not None:
299+
raise ValueError(
300+
'cannot supply both zarr_chunks_per_shard and zarr_chunks'
301+
)
302+
if zarr_shards is None:
303+
zarr_shards = {}
304+
zarr_shards = {**self.chunks, **zarr_shards}
305+
zarr_chunks = {}
306+
for dim, existing_chunk_size in zarr_shards.items():
307+
multiple = zarr_chunks_per_shard.get(dim)
308+
if multiple is None:
309+
raise ValueError(
310+
f'cannot write a dataset with chunks {self.chunks} to Zarr with '
311+
f'{zarr_chunks_per_shard=}, which does not contain a value for '
312+
f'dimension {dim!r}'
313+
)
314+
zarr_chunks[dim], remainder = divmod(
315+
existing_chunk_size, multiple
316+
)
317+
if remainder != 0:
318+
raise ValueError(
319+
f'cannot write a dataset with chunks {self.chunks} to Zarr with '
320+
f'{zarr_chunks_per_shard=}, which do not evenly divide into '
321+
'chunks'
322+
)
323+
elif zarr_chunks is None:
266324
if zarr_shards is not None:
267325
raise ValueError('cannot supply zarr_shards without zarr_chunks')
268326
zarr_chunks = {}
@@ -274,6 +332,9 @@ def to_zarr(
274332
else:
275333
self._check_shards_or_chunks(zarr_chunks, 'chunks')
276334

335+
if zarr_shards is not None and zarr_format is None:
336+
zarr_format = 3 # required for shards
337+
277338
return self.ptransform | _get_label('to_zarr') >> zarr.ChunksToZarr(
278339
path,
279340
self.template,

xarray_beam/_src/dataset_test.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,71 @@ def test_to_zarr_shards(self):
177177
temp_dir, zarr_chunks={'x': 3}, zarr_shards={'x': 9}, zarr_format=3
178178
)
179179

180+
def test_to_zarr_chunks_per_shard(self):
181+
temp_dir = self.create_tempdir().full_path
182+
ds = xarray.Dataset({'foo': ('x', np.arange(12))})
183+
beam_ds = xbeam.Dataset.from_xarray(ds, {'x': 6})
184+
185+
with self.subTest('simple'):
186+
with beam.Pipeline() as p:
187+
p |= beam_ds.to_zarr(
188+
temp_dir, zarr_chunks_per_shard={'x': 2}
189+
)
190+
opened, chunks = xbeam.open_zarr(temp_dir)
191+
xarray.testing.assert_identical(ds, opened)
192+
self.assertEqual(chunks, {'x': 3})
193+
self.assertEqual(opened['foo'].encoding['chunks'], (3,))
194+
self.assertEqual(opened['foo'].encoding['shards'], (6,))
195+
196+
with self.subTest('explicit_shards'):
197+
temp_dir = self.create_tempdir().full_path
198+
ds = xarray.Dataset({'foo': ('x', np.arange(24))})
199+
beam_ds = xbeam.Dataset.from_xarray(ds, {'x': 12})
200+
with beam.Pipeline() as p:
201+
p |= beam_ds.to_zarr(
202+
temp_dir,
203+
zarr_chunks_per_shard={'x': 2},
204+
zarr_shards={'x': 6},
205+
)
206+
opened, chunks = xbeam.open_zarr(temp_dir)
207+
xarray.testing.assert_identical(ds, opened)
208+
self.assertEqual(chunks, {'x': 3})
209+
self.assertEqual(opened['foo'].encoding['chunks'], (3,))
210+
self.assertEqual(opened['foo'].encoding['shards'], (6,))
211+
212+
with self.subTest('chunks_and_chunks_per_shard_error'):
213+
ds = xarray.Dataset({'foo': ('x', np.arange(12))})
214+
beam_ds = xbeam.Dataset.from_xarray(ds, {'x': 6})
215+
with self.assertRaisesWithLiteralMatch(
216+
ValueError,
217+
'cannot supply both zarr_chunks_per_shard and zarr_chunks',
218+
):
219+
beam_ds.to_zarr(
220+
temp_dir, zarr_chunks_per_shard={'x': 2}, zarr_chunks={'x': 3}
221+
)
222+
223+
with self.subTest('missing_dim_error'):
224+
ds = xarray.Dataset({'foo': ('x', np.arange(12))})
225+
beam_ds = xbeam.Dataset.from_xarray(ds, {'x': 6})
226+
with self.assertRaisesWithLiteralMatch(
227+
ValueError,
228+
"cannot write a dataset with chunks {'x': 6} to Zarr with "
229+
"zarr_chunks_per_shard={'y': 2}, which does not contain a value for "
230+
"dimension 'x'",
231+
):
232+
beam_ds.to_zarr(temp_dir, zarr_chunks_per_shard={'y': 2})
233+
234+
with self.subTest('uneven_division_error'):
235+
ds = xarray.Dataset({'foo': ('x', np.arange(12))})
236+
beam_ds = xbeam.Dataset.from_xarray(ds, {'x': 6})
237+
with self.assertRaisesWithLiteralMatch(
238+
ValueError,
239+
"cannot write a dataset with chunks {'x': 6} to Zarr with "
240+
"zarr_chunks_per_shard={'x': 5}, which do not evenly divide into "
241+
'chunks',
242+
):
243+
beam_ds.to_zarr(temp_dir, zarr_chunks_per_shard={'x': 5})
244+
180245
def test_to_zarr_default_chunks(self):
181246
temp_dir = self.create_tempdir().full_path
182247
ds = xarray.Dataset({'foo': (('x', 'y'), np.arange(20).reshape(10, 2))})

0 commit comments

Comments
 (0)