Skip to content

Commit 1d04c5f

Browse files
shoyer and Xarray-Beam authors
authored and committed
Refactor MeanCombineFn to pre-aggregate sum and count.
This should massively reduce the amount of data written to disk via the GroupByKey() inside beam.CombineGlobally and beam.CombinePerKey if dimensions of size larger than 1 are being summed. PiperOrigin-RevId: 815795593
1 parent aa0f087 commit 1d04c5f

2 files changed

Lines changed: 68 additions & 23 deletions

File tree

xarray_beam/_src/combiners.py

Lines changed: 62 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,13 @@
1313
# limitations under the License.
1414
"""Combiners for xarray-beam."""
1515
from __future__ import annotations
16+
1617
from collections.abc import Sequence
1718
import dataclasses
1819

1920
import apache_beam as beam
2021
import numpy.typing as npt
2122
import xarray
22-
2323
from xarray_beam._src import core
2424

2525

@@ -30,38 +30,52 @@
3030

3131

3232
@dataclasses.dataclass
33-
class MeanCombineFn(beam.transforms.CombineFn):
34-
"""CombineFn for computing an arithmetic mean of xarray.Dataset objects."""
33+
class _SumAndCount:
34+
"""Calculate the sum and count of an xarray.Dataset."""
3535

3636
dim: DimLike = None
3737
skipna: bool = True
3838
dtype: npt.DTypeLike | None = None
3939

40-
def create_accumulator(self):
41-
return (0, 0)
42-
43-
def add_input(self, sum_count, element):
44-
(sum_, count) = sum_count
45-
40+
def __call__(
41+
self, chunk: xarray.Dataset
42+
) -> tuple[xarray.Dataset, xarray.Dataset]:
4643
if self.dtype is not None:
47-
element = element.astype(self.dtype)
44+
chunk = chunk.astype(self.dtype)
4845

4946
if self.skipna:
50-
sum_increment = element.fillna(0)
51-
count_increment = element.notnull()
47+
sum_increment = chunk.fillna(0)
48+
count_increment = chunk.notnull()
5249
else:
53-
sum_increment = element
54-
count_increment = xarray.ones_like(element)
50+
sum_increment = chunk
51+
count_increment = xarray.ones_like(chunk)
5552

5653
if self.dim is not None:
5754
# unconditionally set skipna=False because we already explicitly fill in
5855
# missing values above
5956
sum_increment = sum_increment.sum(self.dim, skipna=False)
6057
count_increment = count_increment.sum(self.dim)
6158

59+
return sum_increment, count_increment
60+
61+
62+
@dataclasses.dataclass
63+
class MeanCombineFn(beam.transforms.CombineFn):
64+
"""CombineFn for computing an arithmetic mean of xarray.Dataset objects."""
65+
66+
sum_and_count: _SumAndCount | None = None
67+
68+
def create_accumulator(self):
69+
return (0, 0)
70+
71+
def add_input(self, sum_count, element):
72+
(sum_, count) = sum_count
73+
if self.sum_and_count is not None:
74+
sum_increment, count_increment = self.sum_and_count(element)
75+
else:
76+
sum_increment, count_increment = element
6277
new_sum = sum_ + sum_increment
6378
new_count = count + count_increment
64-
6579
return new_sum, new_count
6680

6781
def merge_accumulators(self, accumulators):
@@ -72,18 +86,30 @@ def extract_output(self, sum_count):
7286
(sum_, count) = sum_count
7387
return sum_ / count
7488

75-
def for_input_type(self, input_type):
76-
return self
77-
7889

7990
@dataclasses.dataclass
8091
class Mean(beam.PTransform):
81-
"""Calculate the mean over one or more distributed dataset dimensions."""
92+
"""Calculate the mean over one or more distributed dataset dimensions.
93+
94+
This PTransform expects a PCollection of `(key, chunk)` pairs, and outputs a
95+
PCollection where chunks with the same key (excluding dimensions in `dim`)
96+
have been averaged together.
97+
98+
Args:
99+
dim: Dimension(s) to average over.
100+
skipna: If True, skip missing values (NaN) when calculating the mean.
101+
dtype: Data type to use for sum and count accumulators.
102+
fanout: If provided, use `CombinePerKey.with_hot_key_fanout` to handle hot
103+
keys by injecting intermediate merging nodes.
104+
pre_aggregate: If True, calculate sum and count for each chunk before
105+
combining. This is usually more efficient.
106+
"""
82107

83108
dim: str | Sequence[str]
84109
skipna: bool = True
85110
dtype: npt.DTypeLike | None = None
86111
fanout: int | None = None
112+
pre_aggregate: bool = True
87113

88114
def _update_key(
89115
self, key: core.Key, chunk: xarray.Dataset
@@ -96,7 +122,9 @@ def expand(self, pcoll):
96122
return (
97123
pcoll
98124
| beam.MapTuple(self._update_key)
99-
| Mean.PerKey(self.dim, self.skipna, self.dtype, self.fanout)
125+
| Mean.PerKey(
126+
self.dim, self.skipna, self.dtype, self.fanout, self.pre_aggregate
127+
)
100128
)
101129

102130
@dataclasses.dataclass
@@ -107,9 +135,15 @@ class Globally(beam.PTransform):
107135
skipna: bool = True
108136
dtype: npt.DTypeLike | None = None
109137
fanout: int | None = None
138+
pre_aggregate: bool = True
110139

111140
def expand(self, pcoll):
112-
combine_fn = MeanCombineFn(self.dim, self.skipna, self.dtype)
141+
sum_and_count = _SumAndCount(self.dim, self.skipna, self.dtype)
142+
if self.pre_aggregate:
143+
pcoll = pcoll | beam.Map(sum_and_count)
144+
combine_fn = MeanCombineFn(sum_and_count=None)
145+
else:
146+
combine_fn = MeanCombineFn(sum_and_count)
113147
return pcoll | beam.CombineGlobally(combine_fn).with_fanout(self.fanout)
114148

115149
@dataclasses.dataclass
@@ -120,9 +154,15 @@ class PerKey(beam.PTransform):
120154
skipna: bool = True
121155
dtype: npt.DTypeLike | None = None
122156
fanout: int | None = None
157+
pre_aggregate: bool = True
123158

124159
def expand(self, pcoll):
125-
combine_fn = MeanCombineFn(self.dim, self.skipna, self.dtype)
160+
sum_and_count = _SumAndCount(self.dim, self.skipna, self.dtype)
161+
if self.pre_aggregate:
162+
pcoll = pcoll | beam.MapTuple(lambda k, v: (k, sum_and_count(v)))
163+
combine_fn = MeanCombineFn(sum_and_count=None)
164+
else:
165+
combine_fn = MeanCombineFn(sum_and_count)
126166
return pcoll | beam.CombinePerKey(combine_fn).with_hot_key_fanout(
127167
self.fanout
128168
)

xarray_beam/_src/dataset.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -880,8 +880,13 @@ def mean(
880880
)
881881
chunks = {k: v for k, v in self.chunks.items() if k not in dims}
882882
label = _get_label(f"mean_{'_'.join(dims)}")
883+
pre_aggregate = math.prod(self.chunks[d] for d in dims) > 1
883884
ptransform = self.ptransform | label >> combiners.Mean(
884-
dim=dims, skipna=skipna, dtype=dtype, fanout=fanout
885+
dim=dims,
886+
skipna=skipna,
887+
dtype=dtype,
888+
fanout=fanout,
889+
pre_aggregate=pre_aggregate,
885890
)
886891
return type(self)(template, chunks, self.split_vars, ptransform)
887892

0 commit comments

Comments
 (0)