Skip to content

Commit 3176aa6

Browse files
author
umi
committed
rename
1 parent ebcbd19 commit 3176aa6

1 file changed

Lines changed: 11 additions & 10 deletions

File tree

crates/paimon/src/arrow/format/parquet.rs

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -792,9 +792,10 @@ const RANGE_FETCH_CONCURRENCY: usize = 10;
792792
/// metadata prefetch hint: 512 KiB.
793793
const METADATA_SIZE_HINT: usize = 512 * 1024;
794794
/// Minimum range size for splitting: 4 MiB.
795-
/// Ranges smaller than this will not be split further to avoid
796-
/// excessive small IO requests whose per-request overhead dominates.
797-
const MIN_SPLIT_SIZE: u64 = 4 * 1024 * 1024;
795+
/// The block size used for split alignment and as the minimum split
796+
/// granularity. Ranges smaller than this will not be split further to
797+
/// avoid excessive small IO requests whose per-request overhead dominates.
798+
const IO_BLOCK_SIZE: u64 = 4 * 1024 * 1024;
798799

799800
impl ArrowFileReader {
800801
fn new(file_size: u64, r: Box<dyn FileRead>) -> Self {
@@ -995,7 +996,7 @@ fn merge_byte_ranges(ranges: &[Range<u64>], coalesce: u64) -> Vec<Range<u64>> {
995996
/// Split merged ranges into fixed-size batches to utilize concurrency.
996997
/// Each merged range is divided into chunks of `expected_size`,
997998
/// with the last chunk taking whatever remains.
998-
/// Ranges smaller than `2 * MIN_SPLIT_SIZE` are kept as-is to
999+
/// Ranges smaller than `2 * IO_BLOCK_SIZE` are kept as-is to
9991000
/// avoid excessive small IO requests.
10001001
fn split_ranges_for_concurrency(merged: Vec<Range<u64>>, concurrency: usize) -> Vec<Range<u64>> {
10011002
if merged.is_empty() || concurrency <= 1 {
@@ -1006,21 +1007,21 @@ fn split_ranges_for_concurrency(merged: Vec<Range<u64>>, concurrency: usize) ->
10061007

10071008
for range in &merged {
10081009
let length = range.end - range.start;
1009-
let raw_size = MIN_SPLIT_SIZE.max(length / concurrency as u64 + 1);
1010-
// Round up to the nearest multiple of MIN_SPLIT_SIZE (4 MB) so that
1010+
let raw_size = IO_BLOCK_SIZE.max(length.div_ceil(concurrency as u64));
1011+
// Round up to the nearest multiple of IO_BLOCK_SIZE (4 MiB) so that
10111012
// every split boundary is 4 MiB-aligned relative to the range start.
1012-
let expected_size = raw_size.div_ceil(MIN_SPLIT_SIZE) * MIN_SPLIT_SIZE;
1013-
let min_tail_size = expected_size.max(MIN_SPLIT_SIZE * 2);
1013+
let expected_size = raw_size.div_ceil(IO_BLOCK_SIZE) * IO_BLOCK_SIZE;
1014+
let min_tail_size = expected_size.max(IO_BLOCK_SIZE * 2);
10141015

10151016
let mut offset = range.start;
10161017
let end = range.end;
10171018

10181019
// Align the first split boundary: if `offset` is not 4 MiB-aligned,
10191020
// emit a short head chunk so that all subsequent chunks start on a
10201021
// 4 MiB boundary.
1021-
let misalign = offset % MIN_SPLIT_SIZE;
1022+
let misalign = offset % IO_BLOCK_SIZE;
10221023
if misalign != 0 {
1023-
let first_end = (offset - misalign + MIN_SPLIT_SIZE).min(end);
1024+
let first_end = (offset - misalign + IO_BLOCK_SIZE).min(end);
10241025
result.push(offset..first_end);
10251026
offset = first_end;
10261027
}

0 commit comments

Comments
 (0)