3030 'chunksize of -1 indicates not to chunk a dimension.'
3131 ),
3232)
# Optional sharding layout for the output store. It is parsed with the same
# syntax as --target_chunks; leaving it unset (None) disables sharding.
TARGET_SHARDS = flags.DEFINE_string(
    'target_shards',
    None,
    help=(
        'Desired shards for each dimension in the output Zarr dataset, in the '
        'same format as --target_chunks. If omitted, sharding is not used. '
        'Shards should be multiples of corresponding chunk sizes. Only valid '
        'with Zarr v3.'
    ),
)
# Zarr formats 2 and 3 are the only valid values, so anything else is rejected
# while flags are parsed instead of surfacing later inside the pipeline.
# The default of None is not range-checked and is forwarded unchanged.
ZARR_FORMAT = flags.DEFINE_integer(
    'zarr_format',
    None,
    help='Zarr format to use for the output.',
    lower_bound=2,
    upper_bound=3,
)
# Enforce the "Only valid with Zarr v3" constraint documented on
# --target_shards: explicitly asking for format 2 together with shards is a
# usage error. An unset --zarr_format is allowed through because the format
# that will actually be used is not known at this point.
flags.register_multi_flags_validator(
    ['target_shards', 'zarr_format'],
    lambda values: (
        values['target_shards'] is None or values['zarr_format'] != 2
    ),
    message='--target_shards is only valid with Zarr v3 (--zarr_format=3).',
)
# Runner passed to beam.Pipeline(runner=...); None leaves the choice to Beam.
RUNNER = flags.DEFINE_string('runner', None, help='beam.runners.Runner')
3449
3550
@@ -48,7 +63,14 @@ def _parse_chunks_str(chunks_str: str) -> dict[str, int]:
4863def main (argv ):
4964 source_dataset , source_chunks = xbeam .open_zarr (INPUT_PATH .value )
5065 template = xbeam .make_template (source_dataset )
51- target_chunks = dict (source_chunks , ** _parse_chunks_str (TARGET_CHUNKS .value ))
66+
67+ target_chunks = source_chunks | _parse_chunks_str (TARGET_CHUNKS .value )
68+
69+ if TARGET_SHARDS .value is not None :
70+ target_shards = source_chunks | _parse_chunks_str (TARGET_SHARDS .value )
71+ else :
72+ target_shards = None
73+
5274 itemsize = max (variable .dtype .itemsize for variable in template .values ())
5375
5476 with beam .Pipeline (runner = RUNNER .value , argv = argv ) as root :
@@ -58,10 +80,16 @@ def main(argv):
5880 | xbeam .Rechunk ( # pytype: disable=wrong-arg-types
5981 source_dataset .sizes ,
6082 source_chunks ,
61- target_chunks ,
83+ target_chunks if target_shards is None else target_shards ,
6284 itemsize = itemsize ,
6385 )
64- | xbeam .ChunksToZarr (OUTPUT_PATH .value , template , target_chunks )
86+ | xbeam .ChunksToZarr (
87+ OUTPUT_PATH .value ,
88+ template ,
89+ zarr_chunks = target_chunks ,
90+ zarr_shards = target_shards ,
91+ zarr_format = ZARR_FORMAT .value ,
92+ )
6593 )
6694
6795
0 commit comments