From 7df3a0db958050080d7e4a0f0ec83b1b42b73a15 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 4 Jul 2026 11:43:03 +0200 Subject: [PATCH 01/12] feat(core): typed LayoutId; reader accepts vortex.zoned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LayoutId mirrors the sealed EncodingId shape — WellKnown constants (FLAT, CHUNKED, STRUCT, ZONED, STATS, DICT) plus Custom — because layouts are runtime-pluggable in the Rust reference (two separate footer spec namespaces sharing the string wire form; vortex.flat is layout-only). Layout's misnamed String encodingId component becomes LayoutId layoutId; unknown layouts still fail loudly (Rust default, no allowUnknown for layouts), now with a typed id in the error. Compat fix uncovered by the reference check: Rust renamed the zone-map layout id to vortex.zoned, keeping vortex.stats as legacy alias — the reader now routes BOTH through the zoned path, so files from current Rust writers scan and prune correctly. The writer keeps emitting vortex.stats, which old and new Rust readers accept; integration oracle confirms byte-identical output. 
Co-Authored-By: Claude Fable 5 --- .../vortex/cli/tui/VortexInspectorTui.java | 8 +- .../dfa1/vortex/core/model/LayoutId.java | 127 ++++++++++++++++++ .../dfa1/vortex/core/model/LayoutIdTest.java | 92 +++++++++++++ .../dfa1/vortex/inspect/VortexInspector.java | 2 +- .../vortex/inspect/InspectorTreeTest.java | 19 +-- .../vortex/inspect/VortexInspectorTest.java | 25 ++-- .../performance/TaxiColumnByteDiff.java | 2 +- .../performance/TaxiColumnTreeDiff.java | 4 +- .../io/github/dfa1/vortex/reader/Layout.java | 50 +++---- .../dfa1/vortex/reader/PostscriptParser.java | 11 +- .../dfa1/vortex/reader/ScanIterator.java | 5 +- .../dfa1/vortex/reader/LayoutKindTest.java | 53 +++++--- .../PostscriptParserBlobBoundsTest.java | 2 +- .../reader/PostscriptParserLayoutIdTest.java | 100 ++++++++++++++ .../dfa1/vortex/writer/VortexWriter.java | 15 ++- 15 files changed, 426 insertions(+), 89 deletions(-) create mode 100644 core/src/main/java/io/github/dfa1/vortex/core/model/LayoutId.java create mode 100644 core/src/test/java/io/github/dfa1/vortex/core/model/LayoutIdTest.java create mode 100644 reader/src/test/java/io/github/dfa1/vortex/reader/PostscriptParserLayoutIdTest.java diff --git a/cli/src/main/java/io/github/dfa1/vortex/cli/tui/VortexInspectorTui.java b/cli/src/main/java/io/github/dfa1/vortex/cli/tui/VortexInspectorTui.java index cc832134..2d7d887b 100644 --- a/cli/src/main/java/io/github/dfa1/vortex/cli/tui/VortexInspectorTui.java +++ b/cli/src/main/java/io/github/dfa1/vortex/cli/tui/VortexInspectorTui.java @@ -442,7 +442,7 @@ private String renderItem(Item item) { } String label = item.depth() == 0 && node.layout().isStruct() ? "struct" - : node.fieldName().map(n -> n + ": ").orElse("") + node.layout().encodingId(); + : node.fieldName().map(n -> n + ": ").orElse("") + node.layout().layoutId(); String tag = statsChildren.contains(node) ? 
", stats" : ""; return " ".repeat(item.depth() * 2) + marker + label + " (" + node.layout().rowCount() + " rows" + tag + ")"; @@ -467,7 +467,7 @@ private List detailLines(InspectorTree.Node node) { List lines = new ArrayList<>(); Layout layout = node.layout(); InspectorTree.Peek p = peek(node); - lines.add("Encoding: " + (p.encoding() != null ? p.encoding() : layout.encodingId())); + lines.add("Encoding: " + (p.encoding() != null ? p.encoding() : layout.layoutId().id())); node.fieldName().ifPresent(name -> lines.add("Field: " + name)); String col = columnOf.get(node); if (col != null && !node.fieldName().isPresent()) { @@ -705,14 +705,14 @@ private List decodeStatsLayout( Layout child = statsLayout.children().get(i); if (!child.isFlat()) { throw new IllegalStateException( - "non-flat stats chunk: " + child.encodingId()); + "non-flat stats chunk: " + child.layoutId()); } all.addAll(decodeStatsFlat(child, statsDtype, arena)); } return all; } throw new IllegalStateException( - "unsupported stats layout: " + statsLayout.encodingId()); + "unsupported stats layout: " + statsLayout.layoutId()); } private List decodeStatsFlat( diff --git a/core/src/main/java/io/github/dfa1/vortex/core/model/LayoutId.java b/core/src/main/java/io/github/dfa1/vortex/core/model/LayoutId.java new file mode 100644 index 00000000..b3beb1b8 --- /dev/null +++ b/core/src/main/java/io/github/dfa1/vortex/core/model/LayoutId.java @@ -0,0 +1,127 @@ +package io.github.dfa1.vortex.core.model; + +import java.util.Map; +import java.util.Objects; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/// Identity of a layout node — either a spec [WellKnown] constant or a third-party [Custom] id. 
+/// +/// Layout ids and array-encoding ids ([EncodingId]) are separate namespaces — the Vortex footer +/// carries two dictionary tables (`layout_specs` and `array_specs`) — even though some strings +/// (`vortex.chunked`, `vortex.struct`, `vortex.dict`) appear in both. Layouts are runtime-pluggable +/// upstream, so this type is open: [#parse(String)] maps any wire string to a typed value, and +/// [#id()] recovers the wire string from a typed value. +public sealed interface LayoutId permits LayoutId.WellKnown, LayoutId.Custom { + + /// Returns the wire string of this layout id (e.g. `"vortex.flat"`). + /// + /// @return the wire string of this layout id + String id(); + + /// Parses a wire string into its typed representation: the matching [WellKnown] constant, + /// else a [Custom] wrapping the raw string. Total over every non-blank string; blank input + /// is not a valid layout id and is rejected by the [Custom] constructor — callers parsing + /// untrusted input must guard blank ids and raise their own domain error. + /// + /// @param raw the raw layout id string (e.g. `"vortex.flat"`) + /// @return the matching [WellKnown] constant, or a [Custom] wrapping `raw` if none matches + /// @throws NullPointerException if `raw` is `null` + /// @throws IllegalArgumentException if `raw` is blank + static LayoutId parse(String raw) { + WellKnown known = WellKnown.byId(raw); + return known != null ? known : new Custom(raw); + } + + /// Layout ids defined by the Vortex specification and understood by this build. + enum WellKnown implements LayoutId { + /// Flat (leaf) layout — a single encoded segment (`vortex.flat`). + FLAT("vortex.flat"), + /// Chunked layout — a sequence of flat layouts (`vortex.chunked`). + CHUNKED("vortex.chunked"), + /// Struct layout — one child per column (`vortex.struct`). + STRUCT("vortex.struct"), + /// Zone-map layout — wraps a child with per-chunk stats for pruning; the current canonical + /// wire id upstream (`vortex.zoned`). 
+ ZONED("vortex.zoned"), + /// Legacy wire alias of the zoned layout (`vortex.stats`). Rust keeps a legacy vtable for + /// it, and vortex-java currently writes it; readers must route it to the zoned handling. + STATS("vortex.stats"), + /// Dictionary layout for low-cardinality columns (`vortex.dict`). + DICT("vortex.dict"), + ; + + // O(1) access to a WellKnown constant by its string representation + private static final Map LOOKUP = Stream.of(values()) + .collect(Collectors.toUnmodifiableMap(WellKnown::id, Function.identity())); + private final String id; + + WellKnown(String id) { + this.id = id; + } + + /// Returns the well-known constant whose wire string is `id`, or `null` if none matches. + /// + /// @param id the wire string to look up (may be `null`) + /// @return the matching constant, or `null` if unrecognized + static WellKnown byId(String id) { + return LOOKUP.get(id); + } + + @Override + public String id() { + return id; + } + + @Override + public String toString() { + return id; + } + } + + /// A third-party layout id whose wire string is not part of the [WellKnown] set. + /// + /// @param id the wire string of this layout id; must be non-blank and must not collide + /// with a [WellKnown] wire string + record Custom(String id) implements LayoutId { + + /// Validates that `id` is a usable custom layout id. + /// + /// @param id the wire string of this layout id + /// @throws NullPointerException if `id` is `null` + /// @throws IllegalArgumentException if `id` is blank or matches a [WellKnown] wire string + public Custom { + Objects.requireNonNull(id, "id"); + if (id.isBlank()) { + throw new IllegalArgumentException("layout id must not be blank"); + } + WellKnown wellKnown = WellKnown.byId(id); + if (wellKnown != null) { + throw new IllegalArgumentException( + "\"" + id + "\" is a well-known layout id; use LayoutId." 
+ wellKnown.name() + " instead"); + } + } + + @Override + public String toString() { + return id; + } + } + + // Re-export every WellKnown constant, typed as WellKnown, so `LayoutId.FLAT` call sites stay + // usable wherever a WellKnown is required. + + /// Well-known `vortex.flat` id. + WellKnown FLAT = WellKnown.FLAT; + /// Well-known `vortex.chunked` id. + WellKnown CHUNKED = WellKnown.CHUNKED; + /// Well-known `vortex.struct` id. + WellKnown STRUCT = WellKnown.STRUCT; + /// Well-known `vortex.zoned` id. + WellKnown ZONED = WellKnown.ZONED; + /// Well-known `vortex.stats` id. + WellKnown STATS = WellKnown.STATS; + /// Well-known `vortex.dict` id. + WellKnown DICT = WellKnown.DICT; +} diff --git a/core/src/test/java/io/github/dfa1/vortex/core/model/LayoutIdTest.java b/core/src/test/java/io/github/dfa1/vortex/core/model/LayoutIdTest.java new file mode 100644 index 00000000..d1ada902 --- /dev/null +++ b/core/src/test/java/io/github/dfa1/vortex/core/model/LayoutIdTest.java @@ -0,0 +1,92 @@ +package io.github.dfa1.vortex.core.model; + +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.ValueSource; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +class LayoutIdTest { + + @Nested + class Parse { + + @ParameterizedTest + @EnumSource(LayoutId.WellKnown.class) + void parse_knownId_returnsMatchingConstant(LayoutId.WellKnown id) { + // Given the wire string of a well-known constant + // When + LayoutId result = LayoutId.parse(id.id()); + // Then the same constant comes back + assertThat(result).isSameAs(id); + } + + @Test + void parse_unknownId_returnsCustomWrappingRawId() { + // Given a wire string no build knows about + String raw = "supermario"; + // When — parse is total, so a miss is a typed Custom rather than an empty 
Optional + LayoutId result = LayoutId.parse(raw); + // Then + assertThat(result).isEqualTo(new LayoutId.Custom(raw)); + assertThat(result.id()).isEqualTo(raw); + } + + @ParameterizedTest + @ValueSource(strings = {"", " ", " "}) + void parse_blankId_throwsIllegalArgumentException(String blank) { + // Given / When / Then — blank is not a valid id; parse must not silently wrap it, + // so untrusted-input callers are forced to guard it into their own domain error + assertThatThrownBy(() -> LayoutId.parse(blank)) + .isInstanceOf(IllegalArgumentException.class); + } + } + + @Nested + class CustomInvariants { + + @Test + void construct_nullId_throwsNullPointerException() { + // Given / When / Then — a Custom must always carry a wire string + assertThatThrownBy(() -> new LayoutId.Custom(null)) + .isInstanceOf(NullPointerException.class); + } + + @Test + void construct_blankId_throwsIllegalArgumentException() { + // Given / When / Then — blank ids have no wire representation + assertThatThrownBy(() -> new LayoutId.Custom(" ")) + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + void construct_wellKnownId_throwsIllegalArgumentException() { + // Given a wire string that already names a well-known constant + // When / Then — Custom refuses to shadow it and points at the constant to use instead + assertThatThrownBy(() -> new LayoutId.Custom("vortex.flat")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("FLAT"); + } + } + + @Nested + class Properties { + + @ParameterizedTest + @EnumSource(LayoutId.WellKnown.class) + void id_isNonBlankString(LayoutId.WellKnown id) { + // Given / When / Then + assertThat(id.id()).isNotBlank(); + } + + @ParameterizedTest + @EnumSource(LayoutId.WellKnown.class) + void toString_equalsId(LayoutId.WellKnown id) { + // Given / When / Then + assertThat(id).hasToString(id.id()); + } + } +} diff --git a/inspector/src/main/java/io/github/dfa1/vortex/inspect/VortexInspector.java 
b/inspector/src/main/java/io/github/dfa1/vortex/inspect/VortexInspector.java index 92764c25..c4247184 100644 --- a/inspector/src/main/java/io/github/dfa1/vortex/inspect/VortexInspector.java +++ b/inspector/src/main/java/io/github/dfa1/vortex/inspect/VortexInspector.java @@ -145,7 +145,7 @@ private static String format(Object v) { } private static void appendLayoutInline(StringBuilder sb, Layout layout) { - sb.append(layout.encodingId()).append('(').append(layout.rowCount()).append(" rows)"); + sb.append(layout.layoutId()).append('(').append(layout.rowCount()).append(" rows)"); if (layout.children().isEmpty()) { return; } diff --git a/inspector/src/test/java/io/github/dfa1/vortex/inspect/InspectorTreeTest.java b/inspector/src/test/java/io/github/dfa1/vortex/inspect/InspectorTreeTest.java index 476a68b5..2f360d56 100644 --- a/inspector/src/test/java/io/github/dfa1/vortex/inspect/InspectorTreeTest.java +++ b/inspector/src/test/java/io/github/dfa1/vortex/inspect/InspectorTreeTest.java @@ -2,6 +2,7 @@ import io.github.dfa1.vortex.reader.CompressionScheme; import io.github.dfa1.vortex.core.model.DType; +import io.github.dfa1.vortex.core.model.LayoutId; import io.github.dfa1.vortex.reader.Footer; import io.github.dfa1.vortex.reader.Layout; import io.github.dfa1.vortex.reader.SegmentSpec; @@ -136,9 +137,9 @@ void build_reportsProgressOncePerPeekedSegment() { // Given — struct of two compressed (skipped) + two uncompressed Flat columns. // Only uncompressed leaves trigger peekFlatRoot, so progress should fire twice // with total=2. 
- Layout c1 = new Layout("vortex.flat", 0, null, List.of(), List.of(0)); - Layout c2 = new Layout("vortex.flat", 0, null, List.of(), List.of(1)); - Layout c3 = new Layout("vortex.flat", 0, null, List.of(), List.of(2)); + Layout c1 = new Layout(LayoutId.parse("vortex.flat"), 0, null, List.of(), List.of(0)); + Layout c2 = new Layout(LayoutId.parse("vortex.flat"), 0, null, List.of(), List.of(1)); + Layout c3 = new Layout(LayoutId.parse("vortex.flat"), 0, null, List.of(), List.of(2)); Layout root = struct(0, List.of(c1, c2, c3)); DType dtype = new DType.Struct(List.of("a", "b", "c"), List.of(DType.I32, @@ -178,8 +179,8 @@ void buildShallow_skipsAllSlicesAndStillNamesColumns() { // Given — shallow build is the path the TUI uses; it must touch zero segment // bytes (so opening a remote file is instant) yet still populate fieldName on // top-level struct children. - Layout col0 = new Layout("vortex.flat", 10, null, List.of(), List.of(0)); - Layout col1 = new Layout("vortex.flat", 10, null, List.of(), List.of(1)); + Layout col0 = new Layout(LayoutId.parse("vortex.flat"), 10, null, List.of(), List.of(0)); + Layout col1 = new Layout(LayoutId.parse("vortex.flat"), 10, null, List.of(), List.of(1)); Layout root = struct(10, List.of(col0, col1)); DType dtype = new DType.Struct(List.of("id", "value"), List.of(DType.I64, @@ -226,7 +227,7 @@ void peek_nonFlatNode_returnsEmptyWithoutSlicing() { void peek_compressedFlatSegment_returnsEmptyWithoutSlicing() { // Given — compressed segments would need the encoding to decompress before // their FlatBuffer can be parsed; peek skips them rather than slicing garbage. 
- Layout flat = new Layout("vortex.flat", 10, null, List.of(), List.of(0)); + Layout flat = new Layout(LayoutId.parse("vortex.flat"), 10, null, List.of(), List.of(0)); InspectorTree.Node node = new InspectorTree.Node(flat, java.util.Optional.empty(), Set.of(), io.github.dfa1.vortex.reader.ArrayStats.empty(), List.of()); given(handle.footer()).willReturn(new io.github.dfa1.vortex.reader.Footer( @@ -248,7 +249,7 @@ void build_flatChildWithCompressedSegment_skipsRootEncodingPeek() { // Given — peekRootEncoding() reads the segment as a FlatBuffer; compressed segments // are intentionally skipped so a malformed or compressed payload can't crash the // inspector. With code != NONE we should still build a tree, with no encodings used. - Layout root = new Layout("vortex.flat", 0, null, List.of(), List.of(0)); + Layout root = new Layout(LayoutId.parse("vortex.flat"), 0, null, List.of(), List.of(0)); DType dtype = DType.I32; SegmentSpec compressed = new SegmentSpec(0, 1024, (byte) 0, CompressionScheme.ZSTD); givenHandle(dtype, root, List.of("vortex.flat"), List.of(compressed)); @@ -268,10 +269,10 @@ private void givenHandle(DType dtype, Layout layout, List arraySpecs, Li } private static Layout struct(long rows, List children) { - return new Layout("vortex.struct", rows, null, children, List.of()); + return new Layout(LayoutId.parse("vortex.struct"), rows, null, children, List.of()); } private static Layout leaf(String encodingId, long rows) { - return new Layout(encodingId, rows, null, List.of(), List.of()); + return new Layout(LayoutId.parse(encodingId), rows, null, List.of(), List.of()); } } diff --git a/inspector/src/test/java/io/github/dfa1/vortex/inspect/VortexInspectorTest.java b/inspector/src/test/java/io/github/dfa1/vortex/inspect/VortexInspectorTest.java index a96b78d0..bfdb0e07 100644 --- a/inspector/src/test/java/io/github/dfa1/vortex/inspect/VortexInspectorTest.java +++ b/inspector/src/test/java/io/github/dfa1/vortex/inspect/VortexInspectorTest.java @@ -3,6 +3,7 
@@ import io.github.dfa1.vortex.reader.ArrayStats; import io.github.dfa1.vortex.reader.CompressionScheme; import io.github.dfa1.vortex.core.model.DType; +import io.github.dfa1.vortex.core.model.LayoutId; import io.github.dfa1.vortex.reader.Layout; import io.github.dfa1.vortex.reader.SegmentSpec; import org.junit.jupiter.api.Test; @@ -67,7 +68,7 @@ void render_segmentTable_listsEverySegment() { @Test void render_nonStruct_inlinesSingleColumnLayout() { // Given - Layout leaf = new Layout("vortex.flat", 100, null, List.of(), List.of()); + Layout leaf = new Layout(LayoutId.parse("vortex.flat"), 100, null, List.of(), List.of()); InspectorTree.Node root = new InspectorTree.Node(leaf, Optional.empty(), Set.of(), ArrayStats.empty(), List.of()); InspectorTree sut = new InspectorTree( 1, 256L, @@ -101,10 +102,10 @@ void render_formatsBytesAcrossUnits() { @Test void render_chainsChildrenWithArrow() { // Given — nested zoned → chunked → flat chain - Layout flat = new Layout("vortex.flat", 1000, null, List.of(), List.of()); - Layout chunked = new Layout("vortex.chunked", 1000, null, List.of(flat), List.of()); - Layout zoned = new Layout("vortex.stats", 1000, null, List.of(chunked), List.of()); - Layout structLayout = new Layout("vortex.struct", 1000, null, List.of(zoned), List.of()); + Layout flat = new Layout(LayoutId.parse("vortex.flat"), 1000, null, List.of(), List.of()); + Layout chunked = new Layout(LayoutId.parse("vortex.chunked"), 1000, null, List.of(flat), List.of()); + Layout zoned = new Layout(LayoutId.parse("vortex.stats"), 1000, null, List.of(chunked), List.of()); + Layout structLayout = new Layout(LayoutId.parse("vortex.struct"), 1000, null, List.of(zoned), List.of()); InspectorTree.Node flatN = new InspectorTree.Node(flat, Optional.empty(), Set.of(), ArrayStats.empty(), List.of()); InspectorTree.Node chunkedN = new InspectorTree.Node(chunked, Optional.empty(), Set.of(), ArrayStats.empty(), List.of(flatN)); @@ -127,10 +128,10 @@ void 
render_chainsChildrenWithArrow() { @Test void render_aggregatesMinMaxAcrossChunks() { // Given — column with two chunked Flat leaves; aggregate should fold each leaf's stats - Layout chunk1 = new Layout("vortex.flat", 500, null, List.of(), List.of()); - Layout chunk2 = new Layout("vortex.flat", 500, null, List.of(), List.of()); - Layout chunked = new Layout("vortex.chunked", 1000, null, List.of(chunk1, chunk2), List.of()); - Layout structLayout = new Layout("vortex.struct", 1000, null, List.of(chunked), List.of()); + Layout chunk1 = new Layout(LayoutId.parse("vortex.flat"), 500, null, List.of(), List.of()); + Layout chunk2 = new Layout(LayoutId.parse("vortex.flat"), 500, null, List.of(), List.of()); + Layout chunked = new Layout(LayoutId.parse("vortex.chunked"), 1000, null, List.of(chunk1, chunk2), List.of()); + Layout structLayout = new Layout(LayoutId.parse("vortex.struct"), 1000, null, List.of(chunked), List.of()); InspectorTree.Node c1 = new InspectorTree.Node(chunk1, Optional.empty(), Set.of(), new ArrayStats(10L, 50L, null, null, null, null, null), List.of()); @@ -178,9 +179,9 @@ void render_emptyUsedEncodings_omitsBracketSuffix() { } private static InspectorTree struct2col(int version, long fileSize, List specs, Set usedById) { - Layout idLeaf = new Layout("fastlanes.bitpacked", 1000, null, List.of(), List.of()); - Layout valLeaf = new Layout("vortex.constant", 1000, null, List.of(), List.of()); - Layout root = new Layout("vortex.struct", 1000, null, List.of(idLeaf, valLeaf), List.of()); + Layout idLeaf = new Layout(LayoutId.parse("fastlanes.bitpacked"), 1000, null, List.of(), List.of()); + Layout valLeaf = new Layout(LayoutId.parse("vortex.constant"), 1000, null, List.of(), List.of()); + Layout root = new Layout(LayoutId.parse("vortex.struct"), 1000, null, List.of(idLeaf, valLeaf), List.of()); InspectorTree.Node idNode = new InspectorTree.Node(idLeaf, Optional.of("id"), Set.of("fastlanes.bitpacked"), ArrayStats.empty(), List.of()); diff --git 
a/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnByteDiff.java b/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnByteDiff.java index 5e335d4e..33d43017 100644 --- a/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnByteDiff.java +++ b/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnByteDiff.java @@ -112,7 +112,7 @@ private static Map perColumnBytes(Path file) throws IOException { private static Layout unwrapToStruct(Layout layout) { while (!layout.isStruct()) { if (layout.children().isEmpty()) { - throw new IllegalStateException("hit leaf before struct: " + layout.encodingId()); + throw new IllegalStateException("hit leaf before struct: " + layout.layoutId()); } layout = layout.children().get(0); } diff --git a/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnTreeDiff.java b/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnTreeDiff.java index 5294f001..7ceb8446 100644 --- a/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnTreeDiff.java +++ b/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnTreeDiff.java @@ -102,7 +102,7 @@ private static void dump(Path file, String column) throws IOException { } private static void walkLayout(VortexReader reader, Layout layout, Footer footer, String indent) { - String header = indent + layout.encodingId() + " rows=" + layout.rowCount(); + String header = indent + layout.layoutId() + " rows=" + layout.rowCount(); if (layout.isFlat() && !layout.segments().isEmpty()) { int segIdx = layout.segments().getFirst(); SegmentSpec spec = footer.segmentSpecs().get(segIdx); @@ -203,7 +203,7 @@ private static String describePatches(ProtoPatchesMetadata p) { private static Layout unwrapToStruct(Layout layout) { while (!layout.isStruct()) { if (layout.children().isEmpty()) { - throw new IllegalStateException("hit leaf before struct: " + layout.encodingId()); + throw new 
IllegalStateException("hit leaf before struct: " + layout.layoutId()); } layout = layout.children().get(0); } diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/Layout.java b/reader/src/main/java/io/github/dfa1/vortex/reader/Layout.java index 08d80b9b..55510dfd 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/Layout.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/Layout.java @@ -1,5 +1,7 @@ package io.github.dfa1.vortex.reader; +import io.github.dfa1.vortex.core.model.LayoutId; + import java.lang.foreign.MemorySegment; import java.util.List; @@ -10,61 +12,53 @@ /// Struct → Zoned(Stats) → Chunked → [Flat, Flat, ...] /// ``` /// -/// @param encodingId encoding id string (e.g. `"vortex.flat"`) -/// @param rowCount number of logical rows covered by this node -/// @param metadata optional encoding-specific metadata bytes, or `null` -/// @param children child layout nodes (empty for leaf nodes) -/// @param segments indices into the file's segment table for buffers owned by this node +/// @param layoutId typed layout id (e.g. [LayoutId#FLAT]) +/// @param rowCount number of logical rows covered by this node +/// @param metadata optional encoding-specific metadata bytes, or `null` +/// @param children child layout nodes (empty for leaf nodes) +/// @param segments indices into the file's segment table for buffers owned by this node public record Layout( - String encodingId, + LayoutId layoutId, long rowCount, MemorySegment metadata, List children, List segments ) { - /// Encoding id for flat (leaf) layouts (`"vortex.flat"`). - public static final String FLAT = "vortex.flat"; - /// Encoding id for chunked layouts (`"vortex.chunked"`). - public static final String CHUNKED = "vortex.chunked"; - /// Encoding id for struct layouts (`"vortex.struct"`). - public static final String STRUCT = "vortex.struct"; - /// Encoding id for zone-map layouts (`"vortex.stats"`). 
- public static final String ZONED = "vortex.stats"; - /// Encoding id for dictionary layouts (`"vortex.dict"`). - public static final String DICT = "vortex.dict"; - /// Returns `true` if this layout is a flat (leaf) layout. /// - /// @return `true` when `encodingId` equals [#FLAT] + /// @return `true` when `layoutId` is [LayoutId#FLAT] public boolean isFlat() { - return FLAT.equals(encodingId); + return layoutId == LayoutId.FLAT; } /// Returns `true` if this layout is a chunked layout. /// - /// @return `true` when `encodingId` equals [#CHUNKED] + /// @return `true` when `layoutId` is [LayoutId#CHUNKED] public boolean isChunked() { - return CHUNKED.equals(encodingId); + return layoutId == LayoutId.CHUNKED; } /// Returns `true` if this layout is a struct layout. /// - /// @return `true` when `encodingId` equals [#STRUCT] + /// @return `true` when `layoutId` is [LayoutId#STRUCT] public boolean isStruct() { - return STRUCT.equals(encodingId); + return layoutId == LayoutId.STRUCT; } - /// Returns `true` if this layout is a zone-map (stats) layout. + /// Returns `true` if this layout is a zone-map layout. + /// + /// Both the canonical [LayoutId#ZONED] (`vortex.zoned`) and its legacy alias + /// [LayoutId#STATS] (`vortex.stats`, which vortex-java currently writes) count as zoned. /// - /// @return `true` when `encodingId` equals [#ZONED] + /// @return `true` when `layoutId` is [LayoutId#ZONED] or [LayoutId#STATS] public boolean isZoned() { - return ZONED.equals(encodingId); + return layoutId == LayoutId.ZONED || layoutId == LayoutId.STATS; } /// Returns `true` if this layout is a dictionary layout. 
/// - /// @return `true` when `encodingId` equals [#DICT] + /// @return `true` when `layoutId` is [LayoutId#DICT] public boolean isDict() { - return DICT.equals(encodingId); + return layoutId == LayoutId.DICT; } } diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java b/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java index aaba93e2..e4f576af 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java @@ -2,6 +2,7 @@ import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.core.io.IoBounds; +import io.github.dfa1.vortex.core.model.LayoutId; import io.github.dfa1.vortex.core.model.PType; import io.github.dfa1.vortex.core.error.VortexException; import io.github.dfa1.vortex.core.fbs.FbsBinary; @@ -174,7 +175,13 @@ private static Layout convertLayout(io.github.dfa1.vortex.core.fbs.FbsLayout l, "layout encoding index " + encIdx + " out of bounds (layoutSpecs.size=" + layoutSpecs.size() + ")"); } - String encodingId = layoutSpecs.get(encIdx); + String rawLayoutId = layoutSpecs.get(encIdx); + if (rawLayoutId.isBlank()) { + // LayoutId.parse rejects blank ids with IllegalArgumentException; the file is + // untrusted input, so a blank spec entry must surface as VortexException instead. 
+ throw new VortexException("blank layout id at layout spec index " + encIdx); + } + LayoutId layoutId = LayoutId.parse(rawLayoutId); MemorySegment metadata = l.metadataAsSegment(); if (metadata != null && metadata.byteSize() > MAX_LAYOUT_METADATA_BYTES) { @@ -193,7 +200,7 @@ private static Layout convertLayout(io.github.dfa1.vortex.core.fbs.FbsLayout l, segments.add((int) l.segments(i)); } - return new Layout(encodingId, l.rowCount(), metadata, List.copyOf(children), List.copyOf(segments)); + return new Layout(layoutId, l.rowCount(), metadata, List.copyOf(children), List.copyOf(segments)); } private static DType convertDType(io.github.dfa1.vortex.core.fbs.FbsDType fbs, int depth) { diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java b/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java index e7332543..39c21b49 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java @@ -769,6 +769,7 @@ private Array decodeLayout(Layout layout, DType dtype, SegmentAllocator arena) { return decodeDictLayout(layout, dtype, arena); } if (layout.isZoned() && !layout.children().isEmpty()) { + // Both vortex.zoned and its legacy vortex.stats alias wrap the data layout as child[0]. return decodeLayout(layout.children().getFirst(), dtype, arena); } if (layout.isChunked()) { @@ -776,7 +777,9 @@ private Array decodeLayout(Layout layout, DType dtype, SegmentAllocator arena) { collectFlats(layout, flats); return decodeChunkedLayout(flats, dtype, layout.rowCount(), arena); } - throw new VortexException("cannot decode layout " + layout.encodingId()); + // Custom (unknown) or any unhandled well-known layout id fails loudly — Rust has no + // allowUnknown for layouts. 
+ throw new VortexException("cannot decode layout " + layout.layoutId()); } private Array decodeChunkedLayout(List flats, DType dtype, long totalRows, SegmentAllocator arena) { diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/LayoutKindTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/LayoutKindTest.java index a144d68f..fd37aa35 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/LayoutKindTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/LayoutKindTest.java @@ -1,5 +1,6 @@ package io.github.dfa1.vortex.reader; +import io.github.dfa1.vortex.core.model.LayoutId; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; @@ -9,52 +10,60 @@ import static org.assertj.core.api.Assertions.assertThat; -/// Pins the [Layout] encoding-kind predicates (`isFlat`, `isChunked`, `isStruct`, `isZoned`, +/// Pins the [Layout] layout-kind predicates (`isFlat`, `isChunked`, `isStruct`, `isZoned`, /// `isDict`). [ScanIterator] dispatches layout-tree traversal on these, so a predicate that /// silently returned a constant would route a whole layout family down the wrong decode path. /// For a layout of each kind, exactly its own predicate must be `true` and every other `false`, -/// which fixes each method's return to its `encodingId` rather than a constant. +/// which fixes each method's return to its `layoutId` rather than a constant. class LayoutKindTest { - /// Each layout kind paired with its encoding id and the predicate that should recognize it. + /// Each layout kind paired with its typed id and the predicate that should recognize it. The + /// legacy [LayoutId#STATS] alias is expected to satisfy `isZoned` alongside [LayoutId#ZONED]. 
private enum Kind { - FLAT(Layout.FLAT, Layout::isFlat), - CHUNKED(Layout.CHUNKED, Layout::isChunked), - STRUCT(Layout.STRUCT, Layout::isStruct), - ZONED(Layout.ZONED, Layout::isZoned), - DICT(Layout.DICT, Layout::isDict); + FLAT(LayoutId.FLAT, Layout::isFlat), + CHUNKED(LayoutId.CHUNKED, Layout::isChunked), + STRUCT(LayoutId.STRUCT, Layout::isStruct), + ZONED(LayoutId.ZONED, Layout::isZoned), + STATS(LayoutId.STATS, Layout::isZoned), + DICT(LayoutId.DICT, Layout::isDict); - private final String encodingId; + private final LayoutId layoutId; private final Predicate predicate; - Kind(String encodingId, Predicate predicate) { - this.encodingId = encodingId; + Kind(LayoutId layoutId, Predicate predicate) { + this.layoutId = layoutId; this.predicate = predicate; } } - private static Layout layout(String encodingId) { - return new Layout(encodingId, 0L, null, List.of(), List.of()); + private static Layout layout(LayoutId layoutId) { + return new Layout(layoutId, 0L, null, List.of(), List.of()); + } + + private static boolean isZonedFamily(Kind kind) { + return kind == Kind.ZONED || kind == Kind.STATS; } @ParameterizedTest @EnumSource(Kind.class) - void predicate_recognizesOnlyItsOwnEncodingId(Kind kind) { - // Given — a layout carrying this kind's encoding id - Layout sut = layout(kind.encodingId); + void predicate_recognizesOnlyItsOwnLayoutId(Kind kind) { + // Given — a layout carrying this kind's layout id + Layout sut = layout(kind.layoutId); - // When / Then — only this kind's predicate is true; every other kind's is false + // When / Then — this kind's predicate is true; unrelated predicates are false. ZONED and + // STATS both satisfy isZoned (legacy alias), so each is recognized by the other's predicate. 
for (Kind other : Kind.values()) { + boolean expected = other == kind || (isZonedFamily(other) && isZonedFamily(kind)); assertThat(other.predicate.test(sut)) - .as("%s on %s", other, kind.encodingId) - .isEqualTo(other == kind); + .as("%s on %s", other, kind.layoutId) + .isEqualTo(expected); } } @Test - void predicates_allFalse_forUnknownEncodingId() { - // Given — an id matching no known layout kind - Layout sut = layout("vortex.bogus"); + void predicates_allFalse_forUnknownLayoutId() { + // Given — a custom id matching no known layout kind + Layout sut = layout(new LayoutId.Custom("vortex.bogus")); // When / Then — no kind claims it for (Kind kind : Kind.values()) { diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/PostscriptParserBlobBoundsTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/PostscriptParserBlobBoundsTest.java index 88463d0b..9ac67f77 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/PostscriptParserBlobBoundsTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/PostscriptParserBlobBoundsTest.java @@ -155,7 +155,7 @@ private static MemorySegment buildFooter() { int asv = FbsFooter.createArraySpecsVector(fbb, new int[]{ FbsArraySpec.createFbsArraySpec(fbb, fbb.createString("vortex.primitive"))}); int lsv = FbsFooter.createLayoutSpecsVector(fbb, new int[]{ - FbsLayoutSpec.createFbsLayoutSpec(fbb, fbb.createString(io.github.dfa1.vortex.reader.Layout.FLAT))}); + FbsLayoutSpec.createFbsLayoutSpec(fbb, fbb.createString(io.github.dfa1.vortex.core.model.LayoutId.FLAT.id()))}); // No segment_specs: validateSegmentSpecs has its own dedicated test; keep this fixture // focused on the blob-pointer bounds. 
FbsFooter.startSegmentSpecsVector(fbb, 0); diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/PostscriptParserLayoutIdTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/PostscriptParserLayoutIdTest.java new file mode 100644 index 00000000..6a92e91f --- /dev/null +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/PostscriptParserLayoutIdTest.java @@ -0,0 +1,100 @@ +package io.github.dfa1.vortex.reader; + +import io.github.dfa1.vortex.core.error.VortexException; +import io.github.dfa1.vortex.core.fbs.FbsArraySpec; +import io.github.dfa1.vortex.core.fbs.FbsBuilder; +import io.github.dfa1.vortex.core.fbs.FbsFooter; +import io.github.dfa1.vortex.core.fbs.FbsLayout; +import io.github.dfa1.vortex.core.fbs.FbsLayoutSpec; +import io.github.dfa1.vortex.core.model.LayoutId; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.lang.foreign.MemorySegment; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/// Drives `PostscriptParser.convertLayout` string → [LayoutId] resolution through the +/// package-private [PostscriptParser#parseBlobs] with crafted footer / layout FlatBuffers — no +/// file needed. Two concerns: +/// - the zoned wire alias: Rust renamed the zone-map layout id to `vortex.zoned` while keeping +/// `vortex.stats` as a legacy alias (which vortex-java writes). A file using either spec string +/// must parse to a zoned layout that traverses identically, so an interop file from a newer Rust +/// writer does not fail to open. +/// - a blank layout spec entry (untrusted input) must surface as [VortexException], not the +/// [IllegalArgumentException] that [LayoutId#parse] raises for a blank id. 
+class PostscriptParserLayoutIdTest { + + @ParameterizedTest + @ValueSource(strings = {"vortex.stats", "vortex.zoned"}) + void convertLayout_zonedAlias_parsesToZonedLayoutWithPassthroughChild(String zonedSpec) { + // Given — a zoned node (spec index 1) wrapping one flat child (spec index 0). The zoned + // spec is exercised as both the legacy "vortex.stats" and the canonical "vortex.zoned". + MemorySegment footer = footerWithLayoutSpecs("vortex.flat", zonedSpec); + MemorySegment layout = zonedOverFlatLayout(); + + // When + Layout result = PostscriptParser.parseBlobs(footer, layout, null).layout(); + + // Then — both aliases resolve to a zoned layout with the same shape: isZoned() is true and + // the single flat child is preserved for pass-through decoding. + assertThat(result.isZoned()).isTrue(); + assertThat(result.layoutId()).isEqualTo(LayoutId.parse(zonedSpec)); + assertThat(result.children()).hasSize(1); + assertThat(result.children().getFirst().isFlat()).isTrue(); + } + + @Test + void convertLayout_blankLayoutId_throwsVortexException() { + // Given — a zero-length FlatBuffer string in the layout spec table decodes to "", which + // LayoutId.parse rejects with IllegalArgumentException; untrusted input must surface as + // VortexException instead. + MemorySegment footer = footerWithLayoutSpecs(""); + MemorySegment layout = flatLayout(); + + // When / Then + assertThatThrownBy(() -> PostscriptParser.parseBlobs(footer, layout, null)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("blank layout id"); + } + + // ── FlatBuffer builders ───────────────────────────────────────────────────── + + private static MemorySegment footerWithLayoutSpecs(String... 
layoutSpecs) { + var fbb = new FbsBuilder(256); + int asv = FbsFooter.createArraySpecsVector(fbb, new int[]{ + FbsArraySpec.createFbsArraySpec(fbb, fbb.createString("vortex.primitive"))}); + int[] ls = new int[layoutSpecs.length]; + for (int i = 0; i < layoutSpecs.length; i++) { + ls[i] = FbsLayoutSpec.createFbsLayoutSpec(fbb, fbb.createString(layoutSpecs[i])); + } + int lsv = FbsFooter.createLayoutSpecsVector(fbb, ls); + FbsFooter.startSegmentSpecsVector(fbb, 0); + int ssv = fbb.endVector(); + int footOff = FbsFooter.createFbsFooter(fbb, asv, lsv, ssv, 0, 0); + fbb.finish(footOff); + return fbb.dataSegment(); + } + + private static MemorySegment flatLayout() { + var fbb = new FbsBuilder(128); + int segV = FbsLayout.createSegmentsVector(fbb, new long[]{0}); + int off = FbsLayout.createFbsLayout(fbb, 0, 1L, 0, 0, segV); + FbsLayout.finishFbsLayoutBuffer(fbb, off); + return fbb.dataSegment(); + } + + private static MemorySegment zonedOverFlatLayout() { + var fbb = new FbsBuilder(192); + // Flat child first — FlatBuffer requires children be finished before parents. + int childSegV = FbsLayout.createSegmentsVector(fbb, new long[]{0}); + int flatChild = FbsLayout.createFbsLayout(fbb, 0, 1L, 0, 0, childSegV); + int childV = FbsLayout.createChildrenVector(fbb, new int[]{flatChild}); + // Zoned parent references layout spec index 1. 
+ int zoned = FbsLayout.createFbsLayout(fbb, 1, 1L, 0, childV, 0); + FbsLayout.finishFbsLayoutBuffer(fbb, zoned); + return fbb.dataSegment(); + } +} diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java index 3e05ff68..7bff6dbf 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java @@ -9,6 +9,7 @@ import io.github.dfa1.vortex.core.model.PType; import io.github.dfa1.vortex.core.io.VortexFormat; import io.github.dfa1.vortex.core.model.EncodingId; +import io.github.dfa1.vortex.core.model.LayoutId; import io.github.dfa1.vortex.writer.encode.EncodeContext; import io.github.dfa1.vortex.writer.encode.EncodeNode; import io.github.dfa1.vortex.core.proto.ProtoScalarValue; @@ -1021,12 +1022,14 @@ private ByteBuffer buildFooter() { } int asv = FbsFooter.createArraySpecsVector(fbb, asOffsets); - // layout_specs: ["vortex.flat", "vortex.chunked", "vortex.struct", "vortex.dict"] - int ls0 = FbsLayoutSpec.createFbsLayoutSpec(fbb, fbb.createString("vortex.flat")); - int ls1 = FbsLayoutSpec.createFbsLayoutSpec(fbb, fbb.createString("vortex.chunked")); - int ls2 = FbsLayoutSpec.createFbsLayoutSpec(fbb, fbb.createString("vortex.struct")); - int ls3 = FbsLayoutSpec.createFbsLayoutSpec(fbb, fbb.createString("vortex.dict")); - int ls4 = FbsLayoutSpec.createFbsLayoutSpec(fbb, fbb.createString("vortex.stats")); + // layout_specs, in LAYOUT_* index order: FLAT, CHUNKED, STRUCT, DICT, then the zoned + // layout emitted as the legacy "vortex.stats" alias (old and new Rust readers accept it; + // "vortex.zoned" would break older readers). 
+ int ls0 = FbsLayoutSpec.createFbsLayoutSpec(fbb, fbb.createString(LayoutId.FLAT.id())); + int ls1 = FbsLayoutSpec.createFbsLayoutSpec(fbb, fbb.createString(LayoutId.CHUNKED.id())); + int ls2 = FbsLayoutSpec.createFbsLayoutSpec(fbb, fbb.createString(LayoutId.STRUCT.id())); + int ls3 = FbsLayoutSpec.createFbsLayoutSpec(fbb, fbb.createString(LayoutId.DICT.id())); + int ls4 = FbsLayoutSpec.createFbsLayoutSpec(fbb, fbb.createString(LayoutId.STATS.id())); int lsv = FbsFooter.createLayoutSpecsVector(fbb, new int[]{ls0, ls1, ls2, ls3, ls4}); // segment_specs (inline struct vector — write in reverse order) From b08ace7945f9aec96ca48b35c5e4abef781e9b5c Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 4 Jul 2026 11:47:19 +0200 Subject: [PATCH 02/12] refactor(reader): Layout classes move to reader.layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Layout and ZonedStatsSchema get their own package, mirroring reader.decode on the encoding side — and giving the future LayoutDecoder SPI a landing zone. FlatSegmentDecoder stays in the reader root: its only callers live there and moving it would force it back to public. Pitest targetClasses updated for the new Layout FQN. 
Co-Authored-By: Claude Fable 5 --- .../io/github/dfa1/vortex/cli/tui/VortexInspectorTui.java | 4 ++-- .../java/io/github/dfa1/vortex/inspect/InspectorTree.java | 2 +- .../java/io/github/dfa1/vortex/inspect/VortexInspector.java | 2 +- .../java/io/github/dfa1/vortex/inspect/InspectorTreeTest.java | 2 +- .../io/github/dfa1/vortex/inspect/VortexInspectorTest.java | 2 +- .../integration/PcoFixtureInspectionIntegrationTest.java | 2 +- .../io/github/dfa1/vortex/performance/TaxiColumnByteDiff.java | 2 +- .../io/github/dfa1/vortex/performance/TaxiColumnTreeDiff.java | 2 +- reader/pom.xml | 2 +- .../java/io/github/dfa1/vortex/reader/PostscriptParser.java | 2 ++ .../main/java/io/github/dfa1/vortex/reader/ScanIterator.java | 3 +++ .../main/java/io/github/dfa1/vortex/reader/VortexHandle.java | 2 ++ .../java/io/github/dfa1/vortex/reader/VortexHttpReader.java | 2 ++ .../main/java/io/github/dfa1/vortex/reader/VortexReader.java | 2 ++ .../io/github/dfa1/vortex/reader/{ => layout}/Layout.java | 2 +- .../dfa1/vortex/reader/{ => layout}/ZonedStatsSchema.java | 2 +- .../dfa1/vortex/reader/PostscriptParserLayoutIdTest.java | 2 ++ .../dfa1/vortex/reader/{ => layout}/LayoutKindTest.java | 2 +- .../dfa1/vortex/reader/{ => layout}/ZonedStatsSchemaTest.java | 2 +- .../java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java | 2 +- 20 files changed, 28 insertions(+), 15 deletions(-) rename reader/src/main/java/io/github/dfa1/vortex/reader/{ => layout}/Layout.java (97%) rename reader/src/main/java/io/github/dfa1/vortex/reader/{ => layout}/ZonedStatsSchema.java (99%) rename reader/src/test/java/io/github/dfa1/vortex/reader/{ => layout}/LayoutKindTest.java (98%) rename reader/src/test/java/io/github/dfa1/vortex/reader/{ => layout}/ZonedStatsSchemaTest.java (99%) diff --git a/cli/src/main/java/io/github/dfa1/vortex/cli/tui/VortexInspectorTui.java b/cli/src/main/java/io/github/dfa1/vortex/cli/tui/VortexInspectorTui.java index 2d7d887b..57983efe 100644 --- 
a/cli/src/main/java/io/github/dfa1/vortex/cli/tui/VortexInspectorTui.java +++ b/cli/src/main/java/io/github/dfa1/vortex/cli/tui/VortexInspectorTui.java @@ -1,6 +1,6 @@ package io.github.dfa1.vortex.cli.tui; -import io.github.dfa1.vortex.reader.Layout; +import io.github.dfa1.vortex.reader.layout.Layout; import io.github.dfa1.vortex.reader.SegmentSpec; import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.reader.array.Array; @@ -9,7 +9,7 @@ import io.github.dfa1.vortex.cli.tui.term.Terminal; import io.github.dfa1.vortex.inspect.ByteSize; import io.github.dfa1.vortex.inspect.InspectorTree; -import io.github.dfa1.vortex.reader.ZonedStatsSchema; +import io.github.dfa1.vortex.reader.layout.ZonedStatsSchema; import io.github.dfa1.vortex.reader.VortexHandle; import io.github.dfa1.vortex.reader.Chunk; import io.github.dfa1.vortex.reader.ScanIterator; diff --git a/inspector/src/main/java/io/github/dfa1/vortex/inspect/InspectorTree.java b/inspector/src/main/java/io/github/dfa1/vortex/inspect/InspectorTree.java index ed28436d..2e561cc9 100644 --- a/inspector/src/main/java/io/github/dfa1/vortex/inspect/InspectorTree.java +++ b/inspector/src/main/java/io/github/dfa1/vortex/inspect/InspectorTree.java @@ -5,7 +5,7 @@ import io.github.dfa1.vortex.reader.ArrayStats; import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.reader.Footer; -import io.github.dfa1.vortex.reader.Layout; +import io.github.dfa1.vortex.reader.layout.Layout; import io.github.dfa1.vortex.reader.SegmentSpec; import io.github.dfa1.vortex.core.fbs.FbsArray; import io.github.dfa1.vortex.core.fbs.FbsArrayNode; diff --git a/inspector/src/main/java/io/github/dfa1/vortex/inspect/VortexInspector.java b/inspector/src/main/java/io/github/dfa1/vortex/inspect/VortexInspector.java index c4247184..2b1eadc1 100644 --- a/inspector/src/main/java/io/github/dfa1/vortex/inspect/VortexInspector.java +++ b/inspector/src/main/java/io/github/dfa1/vortex/inspect/VortexInspector.java @@ -2,7 +2,7 
@@ import io.github.dfa1.vortex.reader.ArrayStats; import io.github.dfa1.vortex.core.model.DType; -import io.github.dfa1.vortex.reader.Layout; +import io.github.dfa1.vortex.reader.layout.Layout; import io.github.dfa1.vortex.reader.SegmentSpec; import io.github.dfa1.vortex.reader.VortexHandle; diff --git a/inspector/src/test/java/io/github/dfa1/vortex/inspect/InspectorTreeTest.java b/inspector/src/test/java/io/github/dfa1/vortex/inspect/InspectorTreeTest.java index 2f360d56..7f1b07dd 100644 --- a/inspector/src/test/java/io/github/dfa1/vortex/inspect/InspectorTreeTest.java +++ b/inspector/src/test/java/io/github/dfa1/vortex/inspect/InspectorTreeTest.java @@ -4,7 +4,7 @@ import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.core.model.LayoutId; import io.github.dfa1.vortex.reader.Footer; -import io.github.dfa1.vortex.reader.Layout; +import io.github.dfa1.vortex.reader.layout.Layout; import io.github.dfa1.vortex.reader.SegmentSpec; import io.github.dfa1.vortex.reader.VortexHandle; import org.junit.jupiter.api.Test; diff --git a/inspector/src/test/java/io/github/dfa1/vortex/inspect/VortexInspectorTest.java b/inspector/src/test/java/io/github/dfa1/vortex/inspect/VortexInspectorTest.java index bfdb0e07..420499e0 100644 --- a/inspector/src/test/java/io/github/dfa1/vortex/inspect/VortexInspectorTest.java +++ b/inspector/src/test/java/io/github/dfa1/vortex/inspect/VortexInspectorTest.java @@ -4,7 +4,7 @@ import io.github.dfa1.vortex.reader.CompressionScheme; import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.core.model.LayoutId; -import io.github.dfa1.vortex.reader.Layout; +import io.github.dfa1.vortex.reader.layout.Layout; import io.github.dfa1.vortex.reader.SegmentSpec; import org.junit.jupiter.api.Test; diff --git a/integration/src/test/java/io/github/dfa1/vortex/integration/PcoFixtureInspectionIntegrationTest.java b/integration/src/test/java/io/github/dfa1/vortex/integration/PcoFixtureInspectionIntegrationTest.java index 
7eee4e21..4fbb7b9c 100644 --- a/integration/src/test/java/io/github/dfa1/vortex/integration/PcoFixtureInspectionIntegrationTest.java +++ b/integration/src/test/java/io/github/dfa1/vortex/integration/PcoFixtureInspectionIntegrationTest.java @@ -1,7 +1,7 @@ package io.github.dfa1.vortex.integration; import io.github.dfa1.vortex.core.model.DType; -import io.github.dfa1.vortex.reader.Layout; +import io.github.dfa1.vortex.reader.layout.Layout; import io.github.dfa1.vortex.reader.SegmentSpec; import io.github.dfa1.vortex.reader.ReadRegistry; import io.github.dfa1.vortex.core.fbs.FbsArray; diff --git a/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnByteDiff.java b/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnByteDiff.java index 33d43017..0f6b36d7 100644 --- a/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnByteDiff.java +++ b/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnByteDiff.java @@ -3,7 +3,7 @@ import dev.vortex.api.Session; import dev.vortex.jni.NativeLoader; import io.github.dfa1.vortex.core.model.DType; -import io.github.dfa1.vortex.reader.Layout; +import io.github.dfa1.vortex.reader.layout.Layout; import io.github.dfa1.vortex.reader.ReadRegistry; import io.github.dfa1.vortex.reader.SegmentSpec; import io.github.dfa1.vortex.reader.VortexReader; diff --git a/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnTreeDiff.java b/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnTreeDiff.java index 7ceb8446..149a6496 100644 --- a/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnTreeDiff.java +++ b/performance/src/main/java/io/github/dfa1/vortex/performance/TaxiColumnTreeDiff.java @@ -14,7 +14,7 @@ import io.github.dfa1.vortex.core.proto.ProtoBitPackedMetadata; import io.github.dfa1.vortex.core.proto.ProtoPatchesMetadata; import io.github.dfa1.vortex.reader.Footer; -import io.github.dfa1.vortex.reader.Layout; +import 
io.github.dfa1.vortex.reader.layout.Layout; import io.github.dfa1.vortex.reader.ReadRegistry; import io.github.dfa1.vortex.reader.SegmentSpec; import io.github.dfa1.vortex.reader.VortexReader; diff --git a/reader/pom.xml b/reader/pom.xml index 54b2ca34..5753227e 100644 --- a/reader/pom.xml +++ b/reader/pom.xml @@ -89,7 +89,7 @@ io.github.dfa1.vortex.reader.Trailer io.github.dfa1.vortex.reader.PostscriptParser io.github.dfa1.vortex.reader.SegmentSpec - io.github.dfa1.vortex.reader.Layout + io.github.dfa1.vortex.reader.layout.Layout io.github.dfa1.vortex.reader.FlatSegmentDecoder diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java b/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java index e4f576af..c9fece5e 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java @@ -1,5 +1,7 @@ package io.github.dfa1.vortex.reader; +import io.github.dfa1.vortex.reader.layout.Layout; + import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.core.io.IoBounds; import io.github.dfa1.vortex.core.model.LayoutId; diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java b/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java index 39c21b49..49dfcf93 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java @@ -1,5 +1,8 @@ package io.github.dfa1.vortex.reader; +import io.github.dfa1.vortex.reader.layout.Layout; +import io.github.dfa1.vortex.reader.layout.ZonedStatsSchema; + import static io.github.dfa1.vortex.core.io.PTypeIO.LE_SHORT; import static io.github.dfa1.vortex.core.io.PTypeIO.LE_INT; import static io.github.dfa1.vortex.core.io.PTypeIO.LE_LONG; diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java 
b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java index db25511b..17686519 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java @@ -1,5 +1,7 @@ package io.github.dfa1.vortex.reader; +import io.github.dfa1.vortex.reader.layout.Layout; + import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.reader.array.Array; diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java index 15054899..06ca6835 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java @@ -1,5 +1,7 @@ package io.github.dfa1.vortex.reader; +import io.github.dfa1.vortex.reader.layout.Layout; + import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.core.io.IoBounds; import io.github.dfa1.vortex.core.error.VortexException; diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java index 89e335d6..0dc3c2c1 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java @@ -1,5 +1,7 @@ package io.github.dfa1.vortex.reader; +import io.github.dfa1.vortex.reader.layout.Layout; + import static io.github.dfa1.vortex.core.io.PTypeIO.LE_INT; import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.core.io.IoBounds; diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/Layout.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/Layout.java similarity index 97% rename from reader/src/main/java/io/github/dfa1/vortex/reader/Layout.java rename to reader/src/main/java/io/github/dfa1/vortex/reader/layout/Layout.java index 55510dfd..0b677bb3 
100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/Layout.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/Layout.java @@ -1,4 +1,4 @@ -package io.github.dfa1.vortex.reader; +package io.github.dfa1.vortex.reader.layout; import io.github.dfa1.vortex.core.model.LayoutId; diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/ZonedStatsSchema.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/ZonedStatsSchema.java similarity index 99% rename from reader/src/main/java/io/github/dfa1/vortex/reader/ZonedStatsSchema.java rename to reader/src/main/java/io/github/dfa1/vortex/reader/layout/ZonedStatsSchema.java index f0847d72..e592e31d 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/ZonedStatsSchema.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/ZonedStatsSchema.java @@ -1,4 +1,4 @@ -package io.github.dfa1.vortex.reader; +package io.github.dfa1.vortex.reader.layout; import io.github.dfa1.vortex.core.model.DType; diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/PostscriptParserLayoutIdTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/PostscriptParserLayoutIdTest.java index 6a92e91f..3cb4f862 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/PostscriptParserLayoutIdTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/PostscriptParserLayoutIdTest.java @@ -1,5 +1,7 @@ package io.github.dfa1.vortex.reader; +import io.github.dfa1.vortex.reader.layout.Layout; + import io.github.dfa1.vortex.core.error.VortexException; import io.github.dfa1.vortex.core.fbs.FbsArraySpec; import io.github.dfa1.vortex.core.fbs.FbsBuilder; diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/LayoutKindTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/layout/LayoutKindTest.java similarity index 98% rename from reader/src/test/java/io/github/dfa1/vortex/reader/LayoutKindTest.java rename to 
reader/src/test/java/io/github/dfa1/vortex/reader/layout/LayoutKindTest.java index fd37aa35..e963d352 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/LayoutKindTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/layout/LayoutKindTest.java @@ -1,4 +1,4 @@ -package io.github.dfa1.vortex.reader; +package io.github.dfa1.vortex.reader.layout; import io.github.dfa1.vortex.core.model.LayoutId; import org.junit.jupiter.api.Test; diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/ZonedStatsSchemaTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/layout/ZonedStatsSchemaTest.java similarity index 99% rename from reader/src/test/java/io/github/dfa1/vortex/reader/ZonedStatsSchemaTest.java rename to reader/src/test/java/io/github/dfa1/vortex/reader/layout/ZonedStatsSchemaTest.java index 4ac75667..e0665b32 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/ZonedStatsSchemaTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/layout/ZonedStatsSchemaTest.java @@ -1,4 +1,4 @@ -package io.github.dfa1.vortex.reader; +package io.github.dfa1.vortex.reader.layout; import java.lang.foreign.MemorySegment; diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java index 95d71957..9e9cb2ac 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java @@ -8,7 +8,7 @@ import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.core.model.PType; -import io.github.dfa1.vortex.reader.Layout; +import io.github.dfa1.vortex.reader.layout.Layout; import io.github.dfa1.vortex.reader.SegmentSpec; import io.github.dfa1.vortex.reader.VortexReader; import io.github.dfa1.vortex.reader.array.DoubleArray; From 7588aa3140c7115c57179a9fb53811a2361b92ce Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 4 Jul 
2026 11:54:07 +0200 Subject: [PATCH 03/12] refactor(reader): UnknownArray carries the typed EncodingId MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit String encodingId predated the sealed EncodingId — a closed enum could not represent an unknown id, so the raw string was the only option. Now the component is typed: a Custom, or a WellKnown whose decoder is not registered. Co-Authored-By: Claude Fable 5 --- .../java/io/github/dfa1/vortex/reader/ReadRegistry.java | 3 +-- .../io/github/dfa1/vortex/reader/array/UnknownArray.java | 5 +++-- .../java/io/github/dfa1/vortex/reader/ReadRegistryTest.java | 6 +++--- .../java/io/github/dfa1/vortex/reader/VortexReaderTest.java | 4 ++-- .../github/dfa1/vortex/reader/array/ArrayLimitedTest.java | 3 ++- .../dfa1/vortex/reader/array/ArrayMaterializeTest.java | 3 ++- 6 files changed, 13 insertions(+), 11 deletions(-) diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/ReadRegistry.java b/reader/src/main/java/io/github/dfa1/vortex/reader/ReadRegistry.java index 0274172d..4f541ffa 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/ReadRegistry.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/ReadRegistry.java @@ -99,7 +99,6 @@ public MemorySegment decodeAsSegment(DecodeContext ctx) { } private static UnknownArray decodeUnknown(DecodeContext ctx, ArrayNode node) { - String rawId = node.encodingId().id(); MemorySegment[] bufs = new MemorySegment[node.bufferIndices().length]; for (int i = 0; i < bufs.length; i++) { bufs[i] = ctx.buffer(i); @@ -113,7 +112,7 @@ private static UnknownArray decodeUnknown(DecodeContext ctx, ArrayNode node) { children[i] = decodeUnknown(childCtx, childNode); } return new UnknownArray( - rawId, ctx.dtype(), ctx.rowCount(), + node.encodingId(), ctx.dtype(), ctx.rowCount(), node.metadata(), bufs, children); } diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/UnknownArray.java 
b/reader/src/main/java/io/github/dfa1/vortex/reader/array/UnknownArray.java index dad18a17..4ae11bdb 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/array/UnknownArray.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/UnknownArray.java @@ -1,6 +1,7 @@ package io.github.dfa1.vortex.reader.array; import io.github.dfa1.vortex.core.model.DType; +import io.github.dfa1.vortex.core.model.EncodingId; import io.github.dfa1.vortex.core.error.VortexException; import java.lang.foreign.MemorySegment; @@ -15,7 +16,7 @@ /// Constructed by `Registry` when `allowUnknown()` is set and an encoding id is not /// in the registry. Data access beyond `buffer(i)` and `child(i)` is not supported. /// -/// @param encodingId the unrecognized encoding id string +/// @param encodingId the unrecognized encoding id — a [EncodingId.Custom], or a [EncodingId.WellKnown] with no registered decoder /// @param dtype logical type of the array /// @param length number of logical rows /// @param metadata raw encoding metadata bytes, or `null` @@ -23,7 +24,7 @@ /// @param children decoded child arrays (also wrapped as unknown) @SuppressWarnings("java:S6218") // internal data carrier; record components are arrays of immutable primitives or refs that flow through pipelines without ever being compared. 
public record UnknownArray( - String encodingId, + EncodingId encodingId, DType dtype, long length, MemorySegment metadata, diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/ReadRegistryTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/ReadRegistryTest.java index 1a295f4c..dc4ad7a0 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/ReadRegistryTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/ReadRegistryTest.java @@ -64,7 +64,7 @@ void decodeKnownEncodingWithoutDecoderReturnsUnknownArrayWhenAllowed() { // Then assertThat(result).isInstanceOf(UnknownArray.class); - assertThat(((UnknownArray) result).encodingId()).isEqualTo("vortex.primitive"); + assertThat(((UnknownArray) result).encodingId()).isEqualTo(EncodingId.VORTEX_PRIMITIVE); } @Test @@ -85,7 +85,7 @@ void decodeUnknownEncodingReturnsUnknownArrayWhenAllowed() { // Then assertThat(result).isInstanceOf(UnknownArray.class); UnknownArray unknown = (UnknownArray) result; - assertThat(unknown.encodingId()).isEqualTo("some.unknown"); + assertThat(unknown.encodingId()).isEqualTo(new EncodingId.Custom("some.unknown")); assertThat(unknown.dtype()).isEqualTo(DTypes.I32); assertThat(unknown.length()).isEqualTo(5L); assertThat(unknown.metadata()).isEqualTo(metadata); @@ -114,7 +114,7 @@ void decodeUnknownEncodingWrapsChildrenAsUnknown() { UnknownArray unknown = (UnknownArray) result; assertThat(unknown.children()).hasSize(1); assertThat(unknown.children()[0]).isInstanceOf(UnknownArray.class); - assertThat(((UnknownArray) unknown.children()[0]).encodingId()).isEqualTo("vortex.primitive"); + assertThat(((UnknownArray) unknown.children()[0]).encodingId()).isEqualTo(EncodingId.VORTEX_PRIMITIVE); assertThat(sut.isAllowUnknown()).isTrue(); } diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/VortexReaderTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/VortexReaderTest.java index 427497e0..7e25c922 100644 --- 
a/reader/src/test/java/io/github/dfa1/vortex/reader/VortexReaderTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/VortexReaderTest.java @@ -40,7 +40,7 @@ public EncodingId encodingId() { public Array decode(DecodeContext ctx) { // Generic zero-length stand-in: scan chunk row count comes from the // layout, so the leaf array's contents are irrelevant to this test. - return new UnknownArray("stub", ctx.dtype(), 0, null, + return new UnknownArray(EncodingId.parse("stub"), ctx.dtype(), 0, null, new MemorySegment[0], new Array[0]); } }); @@ -198,7 +198,7 @@ void scan_withNoDecoders_allowUnknown_returnsUnknownArray(String name) throws UR for (Array column : chunk.columns().values()) { assertThat(column).isInstanceOf(UnknownArray.class); UnknownArray foreign = (UnknownArray) column; - assertThat(foreign.encodingId()).describedAs(name).startsWith("vortex."); + assertThat(foreign.encodingId().id()).describedAs(name).startsWith("vortex."); } } } diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/array/ArrayLimitedTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/array/ArrayLimitedTest.java index 89f020c5..401c6e02 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/array/ArrayLimitedTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/array/ArrayLimitedTest.java @@ -1,6 +1,7 @@ package io.github.dfa1.vortex.reader.array; import io.github.dfa1.vortex.core.model.DType; +import io.github.dfa1.vortex.core.model.EncodingId; import io.github.dfa1.vortex.core.error.VortexException; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -321,7 +322,7 @@ class Unsupported { @Test void unknownArrayThrows() { // Given - UnknownArray sut = new UnknownArray("vortex.mystery", I64, 3, null, + UnknownArray sut = new UnknownArray(EncodingId.parse("vortex.mystery"), I64, 3, null, new MemorySegment[0], new Array[0]); // When / Then diff --git 
a/reader/src/test/java/io/github/dfa1/vortex/reader/array/ArrayMaterializeTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/array/ArrayMaterializeTest.java index 2dd3be3d..6d00d848 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/array/ArrayMaterializeTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/array/ArrayMaterializeTest.java @@ -1,6 +1,7 @@ package io.github.dfa1.vortex.reader.array; import io.github.dfa1.vortex.core.model.DType; +import io.github.dfa1.vortex.core.model.EncodingId; import io.github.dfa1.vortex.core.error.VortexException; import io.github.dfa1.vortex.core.io.PTypeIO; import org.junit.jupiter.api.Nested; @@ -303,7 +304,7 @@ void bytePartsDecimalThrows() { @Test void unknownArrayThrows() { // Given an undecoded foreign encoding - UnknownArray sut = new UnknownArray("vortex.mystery", I64, 3, null, + UnknownArray sut = new UnknownArray(EncodingId.parse("vortex.mystery"), I64, 3, null, new MemorySegment[0], new Array[0]); // When / Then From 1c7084b14ac57384fcd3716360b570c312530563 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 4 Jul 2026 11:56:13 +0200 Subject: [PATCH 04/12] style(reader): fold layout imports into the import block Co-Authored-By: Claude Fable 5 --- .../java/io/github/dfa1/vortex/reader/PostscriptParser.java | 3 +-- .../main/java/io/github/dfa1/vortex/reader/ScanIterator.java | 5 ++--- .../main/java/io/github/dfa1/vortex/reader/VortexHandle.java | 3 +-- .../java/io/github/dfa1/vortex/reader/VortexHttpReader.java | 3 +-- .../main/java/io/github/dfa1/vortex/reader/VortexReader.java | 3 +-- 5 files changed, 6 insertions(+), 11 deletions(-) diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java b/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java index c9fece5e..11550cfe 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/PostscriptParser.java 
@@ -1,7 +1,5 @@ package io.github.dfa1.vortex.reader; -import io.github.dfa1.vortex.reader.layout.Layout; - import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.core.io.IoBounds; import io.github.dfa1.vortex.core.model.LayoutId; @@ -18,6 +16,7 @@ import io.github.dfa1.vortex.core.fbs.FbsType; import io.github.dfa1.vortex.core.fbs.FbsUtf8; import io.github.dfa1.vortex.core.fbs.FbsVariant; +import io.github.dfa1.vortex.reader.layout.Layout; import java.lang.foreign.MemorySegment; import java.util.ArrayList; diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java b/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java index 49dfcf93..7be4183b 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java @@ -1,8 +1,5 @@ package io.github.dfa1.vortex.reader; -import io.github.dfa1.vortex.reader.layout.Layout; -import io.github.dfa1.vortex.reader.layout.ZonedStatsSchema; - import static io.github.dfa1.vortex.core.io.PTypeIO.LE_SHORT; import static io.github.dfa1.vortex.core.io.PTypeIO.LE_INT; import static io.github.dfa1.vortex.core.io.PTypeIO.LE_LONG; @@ -42,6 +39,8 @@ import io.github.dfa1.vortex.reader.array.ShortArray; import io.github.dfa1.vortex.reader.array.StructArray; import io.github.dfa1.vortex.reader.array.VarBinArray; +import io.github.dfa1.vortex.reader.layout.Layout; +import io.github.dfa1.vortex.reader.layout.ZonedStatsSchema; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java index 17686519..9e1af2ca 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java @@ -1,9 +1,8 @@ package io.github.dfa1.vortex.reader; -import 
io.github.dfa1.vortex.reader.layout.Layout; - import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.reader.array.Array; +import io.github.dfa1.vortex.reader.layout.Layout; import java.io.Closeable; import java.lang.foreign.MemorySegment; diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java index 06ca6835..df98349f 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java @@ -1,12 +1,11 @@ package io.github.dfa1.vortex.reader; -import io.github.dfa1.vortex.reader.layout.Layout; - import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.core.io.IoBounds; import io.github.dfa1.vortex.core.error.VortexException; import io.github.dfa1.vortex.core.io.VortexFormat; import io.github.dfa1.vortex.core.fbs.FbsPostscript; +import io.github.dfa1.vortex.reader.layout.Layout; import java.io.IOException; import java.lang.foreign.Arena; diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java index 0dc3c2c1..f07e47ad 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java @@ -1,12 +1,11 @@ package io.github.dfa1.vortex.reader; -import io.github.dfa1.vortex.reader.layout.Layout; - import static io.github.dfa1.vortex.core.io.PTypeIO.LE_INT; import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.core.io.IoBounds; import io.github.dfa1.vortex.core.error.VortexException; import io.github.dfa1.vortex.core.io.VortexFormat; +import io.github.dfa1.vortex.reader.layout.Layout; import java.io.IOException; import java.lang.foreign.Arena; From b0f06813bbeb0a9c25e6cda97f59dd044333064f Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: 
Sat, 4 Jul 2026 11:57:25 +0200 Subject: [PATCH 05/12] docs(changelog): LayoutId, reader.layout package, typed UnknownArray Co-Authored-By: Claude Fable 5 --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b26b9cfb..68a0fe40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `Compute.filteredSum(filterColumn, predicate, aggColumn)` fuses a filter and a sum into a single scan — a row folds into the total only when the predicate selects it (a null filter row is excluded) and the aggregate value is non-null — with no intermediate selection bitmap. It matches a hand-written fused loop and is ~1.5× faster than the two-pass `filter` + `sum`. ([57d2225b](https://github.com/dfa1/vortex-java/commit/57d2225b)) - `Compute.filteredAggregate(chunk, filter, aggColumn)` fuses a whole multi-column `RowFilter` (an n-ary `AND` of column-bound predicate leaves) and folds the selected rows' `SUM`/`MIN`/`MAX`/non-null count over an aggregate column in a single pass — the multi-column counterpart of `filteredSum`, and the row-level kernel behind the Calcite boundary-chunk aggregate push-down. A `null` aggregate column counts selected rows only (`COUNT(*)`). ([2ba54888](https://github.com/dfa1/vortex-java/commit/2ba54888)) +- `core.model.LayoutId` — typed layout identity with the same sealed shape as `EncodingId` (`WellKnown` constants plus `Custom`; layouts are runtime-pluggable in the reference implementation). The reader now recognizes `vortex.zoned`, the current canonical zone-map layout id in the Rust reference, alongside the legacy `vortex.stats` alias it keeps writing — files from current Rust writers scan and prune correctly. 
([7df3a0db](https://github.com/dfa1/vortex-java/commit/7df3a0db)) ### Changed @@ -19,6 +20,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - A multi-column `AND` filter no longer forfeits the dictionary lane: the dict-encoded leaf drives the code scan and the remaining predicates are evaluated only on its matches — `SUM(…) WHERE category = 7 AND price > 500` over 100M rows drops from ~2.3 s to ~200 ms (~11×). ([12e13501](https://github.com/dfa1/vortex-java/commit/12e13501)) - `core.model.EncodingId` is now a sealed interface: the spec constants live in the nested `WellKnown` enum (re-exported, so `EncodingId.VORTEX_FOO` call sites compile unchanged) and `Custom` wraps any other wire string, which for the first time lets third-party `EncodingDecoder`/`EncodingEncoder` implementations declare ids outside the spec set. `parse` is total over non-blank ids — an unknown id yields a typed `Custom` instead of an empty `Optional`. ([ea88a91b](https://github.com/dfa1/vortex-java/commit/ea88a91b)) - `reader.decode.ArrayNode` is a single record carrying the typed `EncodingId`; the `KnownArrayNode`/`UnknownArrayNode` split and the `ArrayNode.of` factory are gone. Decode dispatch, the `allowUnknown` passthrough, and error messages are unchanged. A crafted file with a blank encoding id now fails as `VortexException` instead of escaping as `IllegalArgumentException`. ([21810d7e](https://github.com/dfa1/vortex-java/commit/21810d7e)) +- `Layout` and `ZonedStatsSchema` moved to the new `reader.layout` package, and `Layout`'s misnamed `String encodingId` component is now `LayoutId layoutId`. Unknown layouts still fail loudly, now with a typed id in the error. ([7df3a0db](https://github.com/dfa1/vortex-java/commit/7df3a0db), [b08ace79](https://github.com/dfa1/vortex-java/commit/b08ace79)) +- `UnknownArray.encodingId` is a typed `EncodingId` instead of a raw string — a `Custom`, or a `WellKnown` whose decoder is not registered. 
([7588aa31](https://github.com/dfa1/vortex-java/commit/7588aa31)) ## [0.11.0] — 2026-06-28 From fc488d04db0b4bd7307d2ecdb2265ca85a15b0b6 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 4 Jul 2026 13:28:05 +0200 Subject: [PATCH 06/12] =?UTF-8?q?feat(reader):=20LayoutDecoder=20SPI=20?= =?UTF-8?q?=E2=80=94=20layout=20decode=20is=20pluggable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LayoutDecoder + LayoutRegistry in reader.layout mirror the ReadRegistry idiom (builder final-freeze, string-keyed dispatch, duplicate registration throws, no service file — programmatic registration like ExtensionDecoder). The four built-ins move out of ScanIterator verbatim: Flat, Chunked, Zoned (claims both the canonical vortex.zoned and legacy vortex.stats ids via the layoutIds() set), Dict. ScanIterator.decodeLayout is now one registry call; zone-map pruning and chunk planning keep inspecting built-ins only — the SPI covers full-column subtree decode. Wired end-to-end per the no-decorative-flags rule: VortexHandle gains layoutRegistry(), both readers take open(..., LayoutRegistry) overloads, and a scan through a custom registry is proven by test. Unknown layouts still fail loudly (Rust default). Reverses the "Layout is a fixed set, no SPI" design decision — the reference implementation treats layouts as runtime-pluggable. 
Co-Authored-By: Claude Fable 5 --- CLAUDE.md | 17 +- .../dfa1/vortex/reader/ScanIterator.java | 267 ++---------------- .../dfa1/vortex/reader/VortexHandle.java | 8 + .../dfa1/vortex/reader/VortexHttpReader.java | 26 +- .../dfa1/vortex/reader/VortexReader.java | 29 +- .../reader/layout/ChunkedLayoutDecoder.java | 98 +++++++ .../reader/layout/DictLayoutDecoder.java | 203 +++++++++++++ .../reader/layout/FlatLayoutDecoder.java | 39 +++ .../reader/layout/LayoutDecodeContext.java | 48 ++++ .../vortex/reader/layout/LayoutDecoder.java | 42 +++ .../vortex/reader/layout/LayoutRegistry.java | 114 ++++++++ .../reader/layout/ZonedLayoutDecoder.java | 35 +++ .../reader/layout/LayoutRegistryTest.java | 176 ++++++++++++ 13 files changed, 849 insertions(+), 253 deletions(-) create mode 100644 reader/src/main/java/io/github/dfa1/vortex/reader/layout/ChunkedLayoutDecoder.java create mode 100644 reader/src/main/java/io/github/dfa1/vortex/reader/layout/DictLayoutDecoder.java create mode 100644 reader/src/main/java/io/github/dfa1/vortex/reader/layout/FlatLayoutDecoder.java create mode 100644 reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutDecodeContext.java create mode 100644 reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutDecoder.java create mode 100644 reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutRegistry.java create mode 100644 reader/src/main/java/io/github/dfa1/vortex/reader/layout/ZonedLayoutDecoder.java create mode 100644 reader/src/test/java/io/github/dfa1/vortex/reader/layout/LayoutRegistryTest.java diff --git a/CLAUDE.md b/CLAUDE.md index 4bf1e857..c9e61023 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -23,17 +23,18 @@ Benchmark classes follow this: `JavaVsJni{Read,Write,Filter}Benchmark`, ``` core — everything lives under `io.github.dfa1.vortex.core.*`: - core.model DType, PType, TimeUnit, EncodingId, ExtensionId, TimeDtype, TimestampDtype + core.model DType, PType, TimeUnit, EncodingId, LayoutId, ExtensionId, TimeDtype, 
TimestampDtype core.io IoBounds, PTypeIO, VortexFormat core.error VortexException core.compute FastLanes, PrimitiveArrays core.fbs / core.proto — generated wire codecs + their runtimes reader — VortexReader, VortexHttpReader, VortexHandle, ReadRegistry, Chunk, ArrayStats, - ScanOptions, RowFilter; file internals (Footer, Layout, Trailer, - PostscriptParser, …) + ScanOptions, RowFilter; file internals (Footer, Trailer, PostscriptParser, …) reader.array — Array + all subtypes (decode outputs) reader.decode — EncodingDecoder, DecodeContext, ArrayNode + *EncodingDecoder impls reader.extension — ExtensionDecoder + Date/Time/Timestamp/Uuid impls + reader.layout — Layout, LayoutDecoder, LayoutDecodeContext, LayoutRegistry + + built-in *LayoutDecoder impls, ZonedStatsSchema writer — VortexWriter, WriteRegistry, WriteOptions, ExtensionEncoder writer.encode — EncodingEncoder, EncodeContext, NullableData + *EncodingEncoder impls, extension encoders @@ -194,9 +195,13 @@ in the Rust source for the exact schema, then implement from spec. not add variants. Use `new DType.Extension("ip.address", new DType.Primitive(PType.I32, false), null, false)` and register decoders/encoders on the registries (or `ServiceLoader`). Mirrors Rust (`vortex.date`, `vortex.uuid`, …). No SPI for DType variants planned. -- **Layout is a fixed set, no SPI.** `ScanIterator.decodeLayout()` dispatches the known IDs - (flat/chunked/zoned/struct/dict) and throws otherwise. Keep the fixed set; revisit only for a - concrete downstream case unaddressable by a different flat-segment encoding. +- **Layout decode is pluggable via `LayoutDecoder` + `LayoutRegistry`** (`reader.layout`) — the + Rust reference registers layouts at runtime, so ours are open too. Builder-registered only + (`LayoutRegistry.builder().registerDefaults().register(custom).build()`, pass to + `VortexReader.open(path, readRegistry, layoutRegistry)`) — **no service file**. 
Unknown layouts + fail loudly (`VortexException`, Rust default; no allowUnknown for layouts). Scope: the SPI covers + full-column subtree decode; zone-map pruning, filtered scans, and chunk planning recognize the + built-in layouts only. - **Small public APIs.** Don't expose internals — when in doubt, leave it out or make it private. - **POM deps** grouped with comments: `` then ``, each with project-internal (`io.github.dfa1.vortex:*`) deps first, then external. Omit empty sections. diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java b/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java index 7be4183b..8b8df4bf 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java @@ -1,29 +1,14 @@ package io.github.dfa1.vortex.reader; -import static io.github.dfa1.vortex.core.io.PTypeIO.LE_SHORT; import static io.github.dfa1.vortex.core.io.PTypeIO.LE_INT; -import static io.github.dfa1.vortex.core.io.PTypeIO.LE_LONG; import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.core.io.IoBounds; -import io.github.dfa1.vortex.core.model.PType; import io.github.dfa1.vortex.core.error.VortexException; -import io.github.dfa1.vortex.core.model.EncodingId; import io.github.dfa1.vortex.reader.array.Array; import io.github.dfa1.vortex.reader.compute.Compare; import io.github.dfa1.vortex.reader.compute.Predicate; import io.github.dfa1.vortex.reader.array.BoolArray; import io.github.dfa1.vortex.reader.array.ByteArray; -import io.github.dfa1.vortex.reader.array.ChunkedBoolArray; -import io.github.dfa1.vortex.reader.array.ChunkedByteArray; -import io.github.dfa1.vortex.reader.array.ChunkedDoubleArray; -import io.github.dfa1.vortex.reader.array.ChunkedFloatArray; -import io.github.dfa1.vortex.reader.array.ChunkedIntArray; -import io.github.dfa1.vortex.reader.array.ChunkedLongArray; -import 
io.github.dfa1.vortex.reader.array.ChunkedShortArray; -import io.github.dfa1.vortex.reader.array.DictDoubleArray; -import io.github.dfa1.vortex.reader.array.DictFloatArray; -import io.github.dfa1.vortex.reader.array.DictIntArray; -import io.github.dfa1.vortex.reader.array.DictLongArray; import io.github.dfa1.vortex.reader.array.DoubleArray; import io.github.dfa1.vortex.reader.array.FloatArray; import io.github.dfa1.vortex.reader.array.IntArray; @@ -40,6 +25,7 @@ import io.github.dfa1.vortex.reader.array.StructArray; import io.github.dfa1.vortex.reader.array.VarBinArray; import io.github.dfa1.vortex.reader.layout.Layout; +import io.github.dfa1.vortex.reader.layout.LayoutDecodeContext; import io.github.dfa1.vortex.reader.layout.ZonedStatsSchema; import java.lang.foreign.Arena; @@ -53,7 +39,6 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; -import java.util.Optional; import java.util.function.Consumer; /// Iterates over decoded chunks from a [io.github.dfa1.vortex.reader.VortexReader]. @@ -214,59 +199,6 @@ private static Map expandStruct(StructArray sa) { return Map.copyOf(map); } - // ── Column map builder ──────────────────────────────────────────────────── - - private static Array expandDictStrings( - VarBinArray.OffsetMode values, MemorySegment codesSegs, - PType codesPType, DType dtype, - long n, SegmentAllocator arena - ) { - MemorySegment valBytes = values.bytesSegment(); - MemorySegment valOffsets = values.offsetsSegment(); - PType valOffPType = values.offsetsPtype(); - - // First pass: total output byte length - long totalBytes = 0L; - for (long i = 0; i < n; i++) { - long code = readUnsigned(codesSegs, i, codesPType); - long start = readUnsigned(valOffsets, code, valOffPType); - long end = readUnsigned(valOffsets, code + 1, valOffPType); - totalBytes += end - start; - } - - MemorySegment outBytes = arena.allocate(totalBytes > 0 ? 
totalBytes : 1); - MemorySegment outOffsets = arena.allocate((n + 1) * 4L, 4); - outOffsets.setAtIndex(LE_INT, 0, 0); - - long bytePos = 0L; - for (long i = 0; i < n; i++) { - long code = readUnsigned(codesSegs, i, codesPType); - long start = readUnsigned(valOffsets, code, valOffPType); - long end = readUnsigned(valOffsets, code + 1, valOffPType); - long strLen = end - start; - if (strLen > 0) { - MemorySegment.copy(valBytes, start, outBytes, bytePos, strLen); - bytePos += strLen; - } - outOffsets.setAtIndex(LE_INT, i + 1, (int) bytePos); - } - - return new VarBinArray.OffsetMode(dtype, n, outBytes.asReadOnly(), outOffsets.asReadOnly(), PType.I32); - } - - // ── Flat segment decoding ───────────────────────────────────────────────── - - private static long readUnsigned(MemorySegment seg, long idx, PType ptype) { - return switch (ptype) { - case U8 -> Byte.toUnsignedLong(seg.get(ValueLayout.JAVA_BYTE, idx)); - case U16 -> Short.toUnsignedLong(seg.get(LE_SHORT, idx * 2)); - case U32 -> Integer.toUnsignedLong(seg.getAtIndex(LE_INT, idx)); - case I32 -> seg.getAtIndex(LE_INT, idx); - case I64, U64 -> seg.getAtIndex(LE_LONG, idx); - default -> throw new VortexException(EncodingId.VORTEX_DICT, "layout: unsupported ptype " + ptype); - }; - } - // ── Zone-map pruning ────────────────────────────────────────────────────── private static Map limitedColumns(Map columns, long rows) { @@ -764,182 +696,11 @@ private static Array sliceArray(Array full, long offset, long length, DType dtyp } private Array decodeLayout(Layout layout, DType dtype, SegmentAllocator arena) { - if (layout.isFlat()) { - return decodeFlat(layout, dtype, arena); - } - if (layout.isDict()) { - return decodeDictLayout(layout, dtype, arena); - } - if (layout.isZoned() && !layout.children().isEmpty()) { - // Both vortex.zoned and its legacy vortex.stats alias wrap the data layout as child[0]. 
- return decodeLayout(layout.children().getFirst(), dtype, arena); - } - if (layout.isChunked()) { - var flats = new ArrayList(); - collectFlats(layout, flats); - return decodeChunkedLayout(flats, dtype, layout.rowCount(), arena); - } - // Custom (unknown) or any unhandled well-known layout id fails loudly — Rust has no - // allowUnknown for layouts. - throw new VortexException("cannot decode layout " + layout.layoutId()); - } - - private Array decodeChunkedLayout(List flats, DType dtype, long totalRows, SegmentAllocator arena) { - if (flats.isEmpty()) { - throw new VortexException(EncodingId.VORTEX_CHUNKED, "no flat children"); - } - if (flats.size() == 1) { - return decodeFlat(flats.getFirst(), dtype, arena); - } - // ADR 0012: every primitive ptype gets the zero-copy ChunkedXxxArray shape. - // The concat path is gone. - var chunkArrays = new ArrayList(flats.size()); - for (Layout flat : flats) { - chunkArrays.add(decodeFlat(flat, dtype, arena)); - } - if (dtype instanceof DType.Bool) { - return ChunkedBoolArray.of(dtype, totalRows, chunkArrays); - } - if (dtype instanceof DType.Utf8 || dtype instanceof DType.Binary) { - return VarBinArray.ChunkedMode.of(dtype, totalRows, chunkArrays); - } - PType ptype = ((DType.Primitive) dtype).ptype(); - return switch (ptype) { - case I64, U64 -> ChunkedLongArray.of(dtype, totalRows, chunkArrays); - case I32, U32 -> ChunkedIntArray.of(dtype, totalRows, chunkArrays); - case F64 -> ChunkedDoubleArray.of(dtype, totalRows, chunkArrays); - case F32 -> ChunkedFloatArray.of(dtype, totalRows, chunkArrays); - case I16, U16 -> ChunkedShortArray.of(dtype, totalRows, chunkArrays); - case I8, U8 -> ChunkedByteArray.of(dtype, totalRows, chunkArrays); - default -> throw new VortexException("unsupported ptype for chunked layout: " + ptype); - }; + return file.layoutRegistry().decode(new ScanLayoutContext(file, arena), layout, dtype); } // ── Limit truncation ───────────────────────────────────────────────────── - private Array 
decodeFlat(Layout flat, DType dtype, SegmentAllocator arena) { - if (flat.segments().isEmpty()) { - throw new VortexException("no segments"); - } - int segIdx = flat.segments().getFirst(); - SegmentSpec spec = file.footer().segmentSpecs().get(segIdx); - return file.decodeFlatSegment(spec, dtype, flat.rowCount(), arena); - } - - private Array decodeDictLayout(Layout dictLayout, DType dtype, SegmentAllocator arena) { - MemorySegment rawMeta = dictLayout.metadata(); - // DictLayoutMetadata proto (Rust format): field 1 = codes_ptype (PType varint). - // Read the varint directly to avoid field-number mismatch with the array-level DictMetadata proto. - PType codesPType = readDictLayoutCodesPType(rawMeta); - - // child[0] = values layout; child[1] = codes layout - Layout valuesLayout = dictLayout.children().get(0); - Layout codesLayout = dictLayout.children().get(1); - long n = codesLayout.rowCount(); - - Array values = decodeLayout(valuesLayout, dtype, arena); - Array codes = decodeLayout(codesLayout, new DType.Primitive(codesPType, false), arena); - - // VarBin (string) dict: VarBinArray is a sealed interface; ofDict returns the - // lazy DictMode record (no eager expansion into per-row offsets/bytes). - if (values instanceof VarBinArray.OffsetMode vb) { - // Zip-bomb guard: read the codes as a segment so we can validate the buffer - // before allocating the expansion output. For direct-mapped encodings (e.g. - // vortex.primitive), the codes buffer is mmap-bounded and can be much smaller - // than the claimed rowCount. Full-decode encodings (e.g. bitpacked) already - // wrote n * elemBytes to the arena during decodeLayout above, so their buffer - // matches n. 
- MemorySegment codesSeg = codes.materialize(arena); - long bufferCodes = codesSeg.byteSize() / codesPType.byteSize(); - if (bufferCodes < n) { - throw new VortexException(EncodingId.VORTEX_DICT, - "dict codes: layout row_count=" + n + " exceeds buffer capacity=" + bufferCodes); - } - MemorySegment valOffsets = vb.offsetsSegment(); - PType valOffPType = vb.offsetsPtype(); - return VarBinArray.ofDict(dtype, n, vb.bytesSegment(), valOffsets, valOffPType, - codesSeg, codesPType); - } - if (dtype instanceof DType.Primitive pDtype) { - // Zip-bomb guard (lazy path): the codes Array has already been decoded above; - // its length() reflects the claimed rowCount but its backing buffer may be - // mmap-bounded. Validate by inspecting the underlying segment without forcing - // materialization of non-segment-backed codes (lazy variants). - validateDictCodesCapacity(codes, codesPType, n); - return buildLazyDictPrimitive(pDtype, n, values, codes); - } - // Non-Utf8, non-Primitive dict — e.g. extension types backed by VarBin. Fall through - // to the existing string expansion for compatibility. - MemorySegment codesSegFallback = codes.materialize(arena); - long bufferCodesFallback = codesSegFallback.byteSize() / codesPType.byteSize(); - if (bufferCodesFallback < n) { - throw new VortexException(EncodingId.VORTEX_DICT, - "dict codes: layout row_count=" + n + " exceeds buffer capacity=" + bufferCodesFallback); - } - return expandDictStrings(VarBinArray.toOffsetMode((VarBinArray) values, arena), - codesSegFallback, codesPType, dtype, n, arena); - } - - /// Lazy-path zip-bomb guard. Inspects `codes`'s primary segment when available - /// (segment-backed encodings can be mmap-bounded and undersized); skips validation - /// for non-segment variants whose own decoder has already enforced length. 
- /// - /// @param codes the decoded codes array - /// @param codesPType code ptype reported by the dict layout metadata - /// @param n claimed dict row count - private static void validateDictCodesCapacity(Array codes, PType codesPType, long n) { - Optional maybeSeg = codes.segmentIfPresent(); - if (maybeSeg.isEmpty()) { - return; - } - long bufferCodes = maybeSeg.get().byteSize() / codesPType.byteSize(); - if (bufferCodes < n) { - throw new VortexException(EncodingId.VORTEX_DICT, - "dict codes: layout row_count=" + n + " exceeds buffer capacity=" + bufferCodes); - } - } - - /// Builds the matching `DictXxxArray` for a primitive dictionary, unwrapping - /// any [MaskedArray] layer on either side — dictionary lookups are keyed by code - /// so value-side validity is meaningless at this layer. - /// - /// @param dtype primitive logical type of dict values - /// @param n total logical row count - /// @param values dictionary values - /// @param codes per-row codes into `values` - /// @return a lazy `DictXxxArray` matching the value ptype - private static Array buildLazyDictPrimitive(DType.Primitive dtype, long n, Array values, Array codes) { - Array valuesData = values instanceof MaskedArray mv ? mv.inner() : values; - Array codesData = codes instanceof MaskedArray mc ? mc.inner() : codes; - PType ptype = dtype.ptype(); - return switch (ptype) { - case I64, U64 -> DictLongArray.of(dtype, n, (LongArray) valuesData, codesData); - case I32, U32 -> DictIntArray.of(dtype, n, (IntArray) valuesData, codesData); - case F64 -> DictDoubleArray.of(dtype, n, (DoubleArray) valuesData, codesData); - case F32 -> DictFloatArray.of(dtype, n, (FloatArray) valuesData, codesData); - default -> throw new VortexException(EncodingId.VORTEX_DICT, - "layout: unsupported ptype for lazy dict: " + ptype); - }; - } - - private static PType readDictLayoutCodesPType(MemorySegment rawMeta) { - // DictLayoutMetadata (Rust): field 1 = codes_ptype, wire type 0 (varint). 
- // Tag byte = (field_number << 3) | wire_type = (1 << 3) | 0 = 0x08. - // Proto3 omits field 1 when it holds the default value (0 = U8), so empty metadata means U8. - if (rawMeta == null || rawMeta.byteSize() == 0) { - return PType.U8; - } - byte tag = rawMeta.get(ValueLayout.JAVA_BYTE, 0); - if (tag == 0x08 && rawMeta.byteSize() > 1) { - int ordinal = rawMeta.get(ValueLayout.JAVA_BYTE, 1) & 0xFF; - PType[] values = PType.values(); - if (ordinal < values.length) { - return values[ordinal]; - } - } - return PType.U8; - } - private boolean canPruneChunk(ChunkSpec chunk, RowFilter filter) { return switch (filter) { case RowFilter.And(var filters) -> { @@ -1043,6 +804,30 @@ private ArrayStats readFlatStats(Layout flat) { return ArrayStats.fromFbs(root.stats()); } + // ── Layout decode context ───────────────────────────────────────────────── + + /// Binds a [LayoutDecodeContext] to one decode epoch (one arena). Recursion into children + /// routes back through the file's [LayoutRegistry] with the same arena, so nested layouts + /// land in the chunk the scan is currently filling. + private record ScanLayoutContext(VortexHandle file, SegmentAllocator arena) + implements LayoutDecodeContext { + + @Override + public Array decodeChild(Layout child, DType dtype) { + return file.layoutRegistry().decode(this, child, dtype); + } + + @Override + public Array decodeFlatSegment(SegmentSpec spec, DType dtype, long rowCount) { + return file.decodeFlatSegment(spec, dtype, rowCount, arena); + } + + @Override + public SegmentSpec segmentSpec(int index) { + return file.footer().segmentSpecs().get(index); + } + } + // ── Internal record ─────────────────────────────────────────────────────── @SuppressWarnings("java:S6218") // internal data carrier; record components are arrays of immutable primitives or refs that flow through pipelines without ever being compared. 
diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java index 9e1af2ca..4a11d66d 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java @@ -3,6 +3,7 @@ import io.github.dfa1.vortex.core.model.DType; import io.github.dfa1.vortex.reader.array.Array; import io.github.dfa1.vortex.reader.layout.Layout; +import io.github.dfa1.vortex.reader.layout.LayoutRegistry; import java.io.Closeable; import java.lang.foreign.MemorySegment; @@ -57,6 +58,13 @@ public interface VortexHandle extends Closeable { /// @return the registry used to resolve encoding ids during scan ReadRegistry registry(); + /// Returns the [LayoutRegistry] this handle was opened with, dispatching full-column layout + /// subtree decode. Defaults to [LayoutRegistry#defaults()] unless a custom registry was + /// supplied at open time. + /// + /// @return the registry used to decode layout nodes during scan + LayoutRegistry layoutRegistry(); + @Override void close(); } diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java index df98349f..eaa0c460 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java @@ -6,6 +6,7 @@ import io.github.dfa1.vortex.core.io.VortexFormat; import io.github.dfa1.vortex.core.fbs.FbsPostscript; import io.github.dfa1.vortex.reader.layout.Layout; +import io.github.dfa1.vortex.reader.layout.LayoutRegistry; import java.io.IOException; import java.lang.foreign.Arena; @@ -47,11 +48,12 @@ public final class VortexHttpReader implements VortexHandle { private final DType dtype; private final Layout layout; private final ReadRegistry registry; + private final LayoutRegistry layoutRegistry; private 
VortexHttpReader( URI uri, HttpClient client, long fileSize, int version, Footer footer, DType dtype, Layout layout, - ReadRegistry registry + ReadRegistry registry, LayoutRegistry layoutRegistry ) { this.uri = uri; this.client = client; @@ -62,6 +64,7 @@ private VortexHttpReader( this.dtype = dtype; this.layout = layout; this.registry = registry; + this.layoutRegistry = layoutRegistry; } public static VortexHttpReader open(URI uri) throws IOException { @@ -83,6 +86,20 @@ public static VortexHttpReader open(URI uri, ReadRegistry registry) throws IOExc /// @return an open handle to the remote file /// @throws IOException if the file cannot be opened or parsed public static VortexHttpReader open(URI uri, ReadRegistry registry, HttpClient client) throws IOException { + return open(uri, registry, LayoutRegistry.defaults(), client); + } + + /// Opens a remote Vortex file with explicit encoding and layout registries and a + /// caller-supplied [HttpClient]. + /// + /// @param uri HTTP(S) URL of the Vortex file + /// @param registry the encoding decode registry + /// @param layoutRegistry the layout decode registry (custom layouts register here) + /// @param client HTTP client to use for all Range requests + /// @return an open handle to the remote file + /// @throws IOException if the file cannot be opened or parsed + public static VortexHttpReader open(URI uri, ReadRegistry registry, LayoutRegistry layoutRegistry, + HttpClient client) throws IOException { // Single suffix Range request — Content-Range response header gives us fileSize. // Avoids a separate HEAD round trip. 
TailFetch tf = fetchTail(uri, client); @@ -133,7 +150,7 @@ public static VortexHttpReader open(URI uri, ReadRegistry registry, HttpClient c return new VortexHttpReader( uri, client, fileSize, trailer.version(), parsed.footer(), parsed.dtype(), parsed.layout(), - registry + registry, layoutRegistry ); } @@ -312,6 +329,11 @@ public ReadRegistry registry() { return registry; } + @Override + public LayoutRegistry layoutRegistry() { + return layoutRegistry; + } + @Override public void close() { arena.close(); diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java index f07e47ad..f9a2016e 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java @@ -6,6 +6,7 @@ import io.github.dfa1.vortex.core.error.VortexException; import io.github.dfa1.vortex.core.io.VortexFormat; import io.github.dfa1.vortex.reader.layout.Layout; +import io.github.dfa1.vortex.reader.layout.LayoutRegistry; import java.io.IOException; import java.lang.foreign.Arena; @@ -34,11 +35,12 @@ public final class VortexReader implements VortexHandle { private final DType dtype; private final Layout layout; private final ReadRegistry registry; + private final LayoutRegistry layoutRegistry; private VortexReader( Arena arena, MemorySegment fileSegment, long fileSize, int version, Footer footer, DType dtype, Layout layout, - ReadRegistry registry + ReadRegistry registry, LayoutRegistry layoutRegistry ) { this.arena = arena; this.fileSegment = fileSegment; @@ -48,6 +50,7 @@ private VortexReader( this.dtype = dtype; this.layout = layout; this.registry = registry; + this.layoutRegistry = layoutRegistry; } /// Open a Vortex file. 
Memory-maps the entire file; all subsequent reads @@ -57,6 +60,19 @@ public static VortexReader open(Path path) throws IOException { } public static VortexReader open(Path path, ReadRegistry registry) throws IOException { + return open(path, registry, LayoutRegistry.defaults()); + } + + /// Opens a Vortex file with explicit encoding and layout registries. Memory-maps the entire + /// file; all subsequent reads are zero-copy slices. Call [#close()] when done. + /// + /// @param path the file to open + /// @param registry the encoding decode registry + /// @param layoutRegistry the layout decode registry (custom layouts register here) + /// @return an open handle to the file + /// @throws IOException if the file cannot be opened or parsed + public static VortexReader open(Path path, ReadRegistry registry, LayoutRegistry layoutRegistry) + throws IOException { Arena arena = Arena.ofConfined(); try (var channel = FileChannel.open(path, StandardOpenOption.READ)) { long size = channel.size(); @@ -67,7 +83,7 @@ public static VortexReader open(Path path, ReadRegistry registry) throws IOExcep // lifetime. try-with-resources closes the file descriptor while all Array // buffers remain valid zero-copy slices until arena.close() is called. 
var segment = channel.map(FileChannel.MapMode.READ_ONLY, 0, size, arena); - return parse(segment, size, arena, registry); + return parse(segment, size, arena, registry, layoutRegistry); } catch (Exception e) { arena.close(); throw e; @@ -75,7 +91,7 @@ public static VortexReader open(Path path, ReadRegistry registry) throws IOExcep } private static VortexReader parse( - MemorySegment seg, long size, Arena arena, ReadRegistry registry + MemorySegment seg, long size, Arena arena, ReadRegistry registry, LayoutRegistry layoutRegistry ) { long bodyBytes = size - VortexFormat.TRAILER_SIZE; var trailerSeg = IoBounds.slice(seg, bodyBytes, VortexFormat.TRAILER_SIZE); @@ -96,7 +112,7 @@ private static VortexReader parse( return new VortexReader( arena, seg, size, trailer.version(), parsed.footer(), parsed.dtype(), parsed.layout(), - registry + registry, layoutRegistry ); } @@ -160,6 +176,11 @@ public ReadRegistry registry() { return registry; } + @Override + public LayoutRegistry layoutRegistry() { + return layoutRegistry; + } + /// Returns the number of chunks in this file. 
/// /// Equal to the length of [ScanIterator#chunkRowCounts()] from a full scan, and the number diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/ChunkedLayoutDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/ChunkedLayoutDecoder.java new file mode 100644 index 00000000..c9a633a2 --- /dev/null +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/ChunkedLayoutDecoder.java @@ -0,0 +1,98 @@ +package io.github.dfa1.vortex.reader.layout; + +import io.github.dfa1.vortex.core.error.VortexException; +import io.github.dfa1.vortex.core.model.DType; +import io.github.dfa1.vortex.core.model.EncodingId; +import io.github.dfa1.vortex.core.model.LayoutId; +import io.github.dfa1.vortex.core.model.PType; +import io.github.dfa1.vortex.reader.array.Array; +import io.github.dfa1.vortex.reader.array.ChunkedBoolArray; +import io.github.dfa1.vortex.reader.array.ChunkedByteArray; +import io.github.dfa1.vortex.reader.array.ChunkedDoubleArray; +import io.github.dfa1.vortex.reader.array.ChunkedFloatArray; +import io.github.dfa1.vortex.reader.array.ChunkedIntArray; +import io.github.dfa1.vortex.reader.array.ChunkedLongArray; +import io.github.dfa1.vortex.reader.array.ChunkedShortArray; +import io.github.dfa1.vortex.reader.array.VarBinArray; + +import java.lang.foreign.ValueLayout; +import java.util.ArrayList; +import java.util.List; + +/// Built-in decoder for the `vortex.chunked` layout — a sequence of flat leaves decoded into the +/// zero-copy `ChunkedXxxArray` shape (ADR 0012). Extracted verbatim from `ScanIterator`'s +/// `collectFlats` + `decodeChunkedLayout`. 
+final class ChunkedLayoutDecoder implements LayoutDecoder { + + @Override + public LayoutId layoutId() { + return LayoutId.CHUNKED; + } + + @Override + public Array decode(LayoutDecodeContext ctx, Layout layout, DType dtype) { + var flats = new ArrayList(); + collectFlats(layout, flats); + return decodeChunkedLayout(ctx, flats, dtype, layout.rowCount()); + } + + /// Flattens a layout subtree into its ordered flat (and dict) leaves. A private copy of + /// `ScanIterator.collectFlats`: the scan keeps its own for chunk-shape planning, while the + /// chunked decoder needs the same flattening to build the leaf array list. + /// + /// @param layout the layout subtree to flatten + /// @param out accumulator for the flat leaves in scan order + private static void collectFlats(Layout layout, List out) { + if (layout.isFlat()) { + out.add(layout); + } else if (layout.isDict()) { + // Dict layout is a leaf chunk — decoded as a unit (values + codes). + out.add(layout); + } else if (layout.isZoned()) { + // vortex.stats wraps one child (the data layout) — pass through for data + if (!layout.children().isEmpty()) { + collectFlats(layout.children().getFirst(), out); + } + } else if (layout.isChunked()) { + // metadata[0] == 1 means children[0] is the per-chunk stats layout; skip it + int start = (layout.metadata() != null + && layout.metadata().byteSize() > 0 + && layout.metadata().get(ValueLayout.JAVA_BYTE, 0) == 1) ? 1 : 0; + for (int i = start; i < layout.children().size(); i++) { + collectFlats(layout.children().get(i), out); + } + } + } + + private static Array decodeChunkedLayout(LayoutDecodeContext ctx, List flats, DType dtype, + long totalRows) { + if (flats.isEmpty()) { + throw new VortexException(EncodingId.VORTEX_CHUNKED, "no flat children"); + } + if (flats.size() == 1) { + return FlatLayoutDecoder.decodeFlat(ctx, flats.getFirst(), dtype); + } + // ADR 0012: every primitive ptype gets the zero-copy ChunkedXxxArray shape. + // The concat path is gone. 
+ var chunkArrays = new ArrayList(flats.size()); + for (Layout flat : flats) { + chunkArrays.add(FlatLayoutDecoder.decodeFlat(ctx, flat, dtype)); + } + if (dtype instanceof DType.Bool) { + return ChunkedBoolArray.of(dtype, totalRows, chunkArrays); + } + if (dtype instanceof DType.Utf8 || dtype instanceof DType.Binary) { + return VarBinArray.ChunkedMode.of(dtype, totalRows, chunkArrays); + } + PType ptype = ((DType.Primitive) dtype).ptype(); + return switch (ptype) { + case I64, U64 -> ChunkedLongArray.of(dtype, totalRows, chunkArrays); + case I32, U32 -> ChunkedIntArray.of(dtype, totalRows, chunkArrays); + case F64 -> ChunkedDoubleArray.of(dtype, totalRows, chunkArrays); + case F32 -> ChunkedFloatArray.of(dtype, totalRows, chunkArrays); + case I16, U16 -> ChunkedShortArray.of(dtype, totalRows, chunkArrays); + case I8, U8 -> ChunkedByteArray.of(dtype, totalRows, chunkArrays); + default -> throw new VortexException("unsupported ptype for chunked layout: " + ptype); + }; + } +} diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/DictLayoutDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/DictLayoutDecoder.java new file mode 100644 index 00000000..a1de4888 --- /dev/null +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/DictLayoutDecoder.java @@ -0,0 +1,203 @@ +package io.github.dfa1.vortex.reader.layout; + +import static io.github.dfa1.vortex.core.io.PTypeIO.LE_SHORT; +import static io.github.dfa1.vortex.core.io.PTypeIO.LE_INT; +import static io.github.dfa1.vortex.core.io.PTypeIO.LE_LONG; + +import io.github.dfa1.vortex.core.error.VortexException; +import io.github.dfa1.vortex.core.model.DType; +import io.github.dfa1.vortex.core.model.EncodingId; +import io.github.dfa1.vortex.core.model.LayoutId; +import io.github.dfa1.vortex.core.model.PType; +import io.github.dfa1.vortex.reader.array.Array; +import io.github.dfa1.vortex.reader.array.DictDoubleArray; +import io.github.dfa1.vortex.reader.array.DictFloatArray; +import 
io.github.dfa1.vortex.reader.array.DictIntArray; +import io.github.dfa1.vortex.reader.array.DictLongArray; +import io.github.dfa1.vortex.reader.array.DoubleArray; +import io.github.dfa1.vortex.reader.array.FloatArray; +import io.github.dfa1.vortex.reader.array.IntArray; +import io.github.dfa1.vortex.reader.array.LongArray; +import io.github.dfa1.vortex.reader.array.MaskedArray; +import io.github.dfa1.vortex.reader.array.VarBinArray; + +import java.lang.foreign.MemorySegment; +import java.lang.foreign.SegmentAllocator; +import java.lang.foreign.ValueLayout; +import java.util.Optional; + +/// Built-in decoder for the `vortex.dict` layout — a low-cardinality column stored as dictionary +/// values plus per-row codes. Extracted verbatim from `ScanIterator.decodeDictLayout` and its +/// private helpers. +final class DictLayoutDecoder implements LayoutDecoder { + + @Override + public LayoutId layoutId() { + return LayoutId.DICT; + } + + @Override + public Array decode(LayoutDecodeContext ctx, Layout dictLayout, DType dtype) { + SegmentAllocator arena = ctx.arena(); + MemorySegment rawMeta = dictLayout.metadata(); + // DictLayoutMetadata proto (Rust format): field 1 = codes_ptype (PType varint). + // Read the varint directly to avoid field-number mismatch with the array-level DictMetadata proto. + PType codesPType = readDictLayoutCodesPType(rawMeta); + + // child[0] = values layout; child[1] = codes layout + Layout valuesLayout = dictLayout.children().get(0); + Layout codesLayout = dictLayout.children().get(1); + long n = codesLayout.rowCount(); + + Array values = ctx.decodeChild(valuesLayout, dtype); + Array codes = ctx.decodeChild(codesLayout, new DType.Primitive(codesPType, false)); + + // VarBin (string) dict: VarBinArray is a sealed interface; ofDict returns the + // lazy DictMode record (no eager expansion into per-row offsets/bytes). 
+ if (values instanceof VarBinArray.OffsetMode vb) { + // Zip-bomb guard: read the codes as a segment so we can validate the buffer + // before allocating the expansion output. For direct-mapped encodings (e.g. + // vortex.primitive), the codes buffer is mmap-bounded and can be much smaller + // than the claimed rowCount. Full-decode encodings (e.g. bitpacked) already + // wrote n * elemBytes to the arena during decodeChild above, so their buffer + // matches n. + MemorySegment codesSeg = codes.materialize(arena); + long bufferCodes = codesSeg.byteSize() / codesPType.byteSize(); + if (bufferCodes < n) { + throw new VortexException(EncodingId.VORTEX_DICT, + "dict codes: layout row_count=" + n + " exceeds buffer capacity=" + bufferCodes); + } + MemorySegment valOffsets = vb.offsetsSegment(); + PType valOffPType = vb.offsetsPtype(); + return VarBinArray.ofDict(dtype, n, vb.bytesSegment(), valOffsets, valOffPType, + codesSeg, codesPType); + } + if (dtype instanceof DType.Primitive pDtype) { + // Zip-bomb guard (lazy path): the codes Array has already been decoded above; + // its length() reflects the claimed rowCount but its backing buffer may be + // mmap-bounded. Validate by inspecting the underlying segment without forcing + // materialization of non-segment-backed codes (lazy variants). + validateDictCodesCapacity(codes, codesPType, n); + return buildLazyDictPrimitive(pDtype, n, values, codes); + } + // Non-Utf8, non-Primitive dict — e.g. extension types backed by VarBin. Fall through + // to the existing string expansion for compatibility. 
+ MemorySegment codesSegFallback = codes.materialize(arena); + long bufferCodesFallback = codesSegFallback.byteSize() / codesPType.byteSize(); + if (bufferCodesFallback < n) { + throw new VortexException(EncodingId.VORTEX_DICT, + "dict codes: layout row_count=" + n + " exceeds buffer capacity=" + bufferCodesFallback); + } + return expandDictStrings(VarBinArray.toOffsetMode((VarBinArray) values, arena), + codesSegFallback, codesPType, dtype, n, arena); + } + + /// Lazy-path zip-bomb guard. Inspects `codes`'s primary segment when available + /// (segment-backed encodings can be mmap-bounded and undersized); skips validation + /// for non-segment variants whose own decoder has already enforced length. + /// + /// @param codes the decoded codes array + /// @param codesPType code ptype reported by the dict layout metadata + /// @param n claimed dict row count + private static void validateDictCodesCapacity(Array codes, PType codesPType, long n) { + Optional maybeSeg = codes.segmentIfPresent(); + if (maybeSeg.isEmpty()) { + return; + } + long bufferCodes = maybeSeg.get().byteSize() / codesPType.byteSize(); + if (bufferCodes < n) { + throw new VortexException(EncodingId.VORTEX_DICT, + "dict codes: layout row_count=" + n + " exceeds buffer capacity=" + bufferCodes); + } + } + + /// Builds the matching `DictXxxArray` for a primitive dictionary, unwrapping + /// any [MaskedArray] layer on either side — dictionary lookups are keyed by code + /// so value-side validity is meaningless at this layer. + /// + /// @param dtype primitive logical type of dict values + /// @param n total logical row count + /// @param values dictionary values + /// @param codes per-row codes into `values` + /// @return a lazy `DictXxxArray` matching the value ptype + private static Array buildLazyDictPrimitive(DType.Primitive dtype, long n, Array values, Array codes) { + Array valuesData = values instanceof MaskedArray mv ? mv.inner() : values; + Array codesData = codes instanceof MaskedArray mc ? 
mc.inner() : codes; + PType ptype = dtype.ptype(); + return switch (ptype) { + case I64, U64 -> DictLongArray.of(dtype, n, (LongArray) valuesData, codesData); + case I32, U32 -> DictIntArray.of(dtype, n, (IntArray) valuesData, codesData); + case F64 -> DictDoubleArray.of(dtype, n, (DoubleArray) valuesData, codesData); + case F32 -> DictFloatArray.of(dtype, n, (FloatArray) valuesData, codesData); + default -> throw new VortexException(EncodingId.VORTEX_DICT, + "layout: unsupported ptype for lazy dict: " + ptype); + }; + } + + private static PType readDictLayoutCodesPType(MemorySegment rawMeta) { + // DictLayoutMetadata (Rust): field 1 = codes_ptype, wire type 0 (varint). + // Tag byte = (field_number << 3) | wire_type = (1 << 3) | 0 = 0x08. + // Proto3 omits field 1 when it holds the default value (0 = U8), so empty metadata means U8. + if (rawMeta == null || rawMeta.byteSize() == 0) { + return PType.U8; + } + byte tag = rawMeta.get(ValueLayout.JAVA_BYTE, 0); + if (tag == 0x08 && rawMeta.byteSize() > 1) { + int ordinal = rawMeta.get(ValueLayout.JAVA_BYTE, 1) & 0xFF; + PType[] values = PType.values(); + if (ordinal < values.length) { + return values[ordinal]; + } + } + return PType.U8; + } + + private static Array expandDictStrings( + VarBinArray.OffsetMode values, MemorySegment codesSegs, + PType codesPType, DType dtype, + long n, SegmentAllocator arena + ) { + MemorySegment valBytes = values.bytesSegment(); + MemorySegment valOffsets = values.offsetsSegment(); + PType valOffPType = values.offsetsPtype(); + + // First pass: total output byte length + long totalBytes = 0L; + for (long i = 0; i < n; i++) { + long code = readUnsigned(codesSegs, i, codesPType); + long start = readUnsigned(valOffsets, code, valOffPType); + long end = readUnsigned(valOffsets, code + 1, valOffPType); + totalBytes += end - start; + } + + MemorySegment outBytes = arena.allocate(totalBytes > 0 ? 
totalBytes : 1); + MemorySegment outOffsets = arena.allocate((n + 1) * 4L, 4); + outOffsets.setAtIndex(LE_INT, 0, 0); + + long bytePos = 0L; + for (long i = 0; i < n; i++) { + long code = readUnsigned(codesSegs, i, codesPType); + long start = readUnsigned(valOffsets, code, valOffPType); + long end = readUnsigned(valOffsets, code + 1, valOffPType); + long strLen = end - start; + if (strLen > 0) { + MemorySegment.copy(valBytes, start, outBytes, bytePos, strLen); + bytePos += strLen; + } + outOffsets.setAtIndex(LE_INT, i + 1, (int) bytePos); + } + + return new VarBinArray.OffsetMode(dtype, n, outBytes.asReadOnly(), outOffsets.asReadOnly(), PType.I32); + } + + private static long readUnsigned(MemorySegment seg, long idx, PType ptype) { + return switch (ptype) { + case U8 -> Byte.toUnsignedLong(seg.get(ValueLayout.JAVA_BYTE, idx)); + case U16 -> Short.toUnsignedLong(seg.get(LE_SHORT, idx * 2)); + case U32 -> Integer.toUnsignedLong(seg.getAtIndex(LE_INT, idx)); + case I32 -> seg.getAtIndex(LE_INT, idx); + case I64, U64 -> seg.getAtIndex(LE_LONG, idx); + default -> throw new VortexException(EncodingId.VORTEX_DICT, "layout: unsupported ptype " + ptype); + }; + } +} diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/FlatLayoutDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/FlatLayoutDecoder.java new file mode 100644 index 00000000..90db9d7f --- /dev/null +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/FlatLayoutDecoder.java @@ -0,0 +1,39 @@ +package io.github.dfa1.vortex.reader.layout; + +import io.github.dfa1.vortex.core.error.VortexException; +import io.github.dfa1.vortex.core.model.DType; +import io.github.dfa1.vortex.core.model.LayoutId; +import io.github.dfa1.vortex.reader.SegmentSpec; +import io.github.dfa1.vortex.reader.array.Array; + +/// Built-in decoder for the `vortex.flat` layout — a single encoded segment. Extracted verbatim +/// from `ScanIterator.decodeFlat`. 
+final class FlatLayoutDecoder implements LayoutDecoder { + + @Override + public LayoutId layoutId() { + return LayoutId.FLAT; + } + + @Override + public Array decode(LayoutDecodeContext ctx, Layout layout, DType dtype) { + return decodeFlat(ctx, layout, dtype); + } + + /// Decodes one flat leaf: resolves its single segment index to a [SegmentSpec] and delegates + /// to the file handle. Shared with [ChunkedLayoutDecoder], which decodes each of its collected + /// leaves through exactly this path (not registry dispatch), preserving the original behavior. + /// + /// @param ctx the decode context + /// @param flat the flat layout node + /// @param dtype logical type of the decoded array + /// @return the decoded [Array] + static Array decodeFlat(LayoutDecodeContext ctx, Layout flat, DType dtype) { + if (flat.segments().isEmpty()) { + throw new VortexException("no segments"); + } + int segIdx = flat.segments().getFirst(); + SegmentSpec spec = ctx.segmentSpec(segIdx); + return ctx.decodeFlatSegment(spec, dtype, flat.rowCount()); + } +} diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutDecodeContext.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutDecodeContext.java new file mode 100644 index 00000000..6d2e631e --- /dev/null +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutDecodeContext.java @@ -0,0 +1,48 @@ +package io.github.dfa1.vortex.reader.layout; + +import io.github.dfa1.vortex.core.model.DType; +import io.github.dfa1.vortex.reader.SegmentSpec; +import io.github.dfa1.vortex.reader.array.Array; + +import java.lang.foreign.SegmentAllocator; + +/// Capabilities a [LayoutDecoder] needs to decode one layout subtree. +/// +/// A context is bound to a single decode epoch — in particular a single [#arena()] — so a +/// [LayoutDecoder] never chooses where its output lands; the caller (the scan) supplies the +/// arena that owns the chunk it is filling. 
Recursion into child layouts routes back through +/// the [LayoutRegistry] via [#decodeChild(Layout, DType)], so a decoder handling one layout id +/// need not know how any other id decodes. +public interface LayoutDecodeContext { + + /// Decodes a child layout by routing it back through the [LayoutRegistry], into the same + /// arena and epoch as the current decode. This is the recursion primitive: a container + /// layout (chunked, zoned, dict) decodes its children without knowing their layout ids. + /// + /// @param child the child layout node to decode + /// @param dtype logical type the child decodes to + /// @return the decoded child [Array] + Array decodeChild(Layout child, DType dtype); + + /// Decodes the encoded array stored in a single flat segment, delegating to the underlying + /// file handle. + /// + /// @param spec the segment to read and decode + /// @param dtype logical type of the decoded array + /// @param rowCount number of logical rows in the segment + /// @return the decoded [Array] + Array decodeFlatSegment(SegmentSpec spec, DType dtype, long rowCount); + + /// Resolves a segment index (as stored on a flat [Layout]) to its [SegmentSpec] in the + /// file's segment table. + /// + /// @param index the segment index recorded on a flat layout node + /// @return the [SegmentSpec] at that index + SegmentSpec segmentSpec(int index); + + /// Returns the allocator for decode output. Its lifetime matches the chunk epoch being + /// filled, so decoders allocate expansion buffers here rather than on the heap. 
+ /// + /// @return the arena for this decode epoch + SegmentAllocator arena(); +} diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutDecoder.java new file mode 100644 index 00000000..682f6498 --- /dev/null +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutDecoder.java @@ -0,0 +1,42 @@ +package io.github.dfa1.vortex.reader.layout; + +import io.github.dfa1.vortex.core.model.DType; +import io.github.dfa1.vortex.core.model.LayoutId; +import io.github.dfa1.vortex.reader.array.Array; + +import java.util.Set; + +/// SPI for decoding one kind of layout node into an [Array]. Registered on a [LayoutRegistry], +/// mirroring how [io.github.dfa1.vortex.reader.decode.EncodingDecoder] plugs into the encoding +/// registry. +/// +/// Scope — this SPI covers full-column subtree decode only. Zone-map pruning, filtered scans, +/// and chunk iteration recognize the built-in layouts (`flat`, `chunked`, `zoned`/`stats`, +/// `dict`) directly and are not extension points; a custom layout decodes as a whole column but +/// does not participate in those optimizations. +public interface LayoutDecoder { + + /// Returns the canonical layout id this decoder handles. + /// + /// @return the primary [LayoutId] of this decoder + LayoutId layoutId(); + + /// Returns every layout id this decoder handles. Defaults to just [#layoutId()]; a decoder + /// serving a wire alias (e.g. the zoned layout under both `vortex.zoned` and its legacy + /// `vortex.stats` id) overrides this to return the full set. The [LayoutRegistry] registers + /// the decoder under each id in this set. + /// + /// @return the set of layout ids this decoder handles; never empty + default Set layoutIds() { + return Set.of(layoutId()); + } + + /// Decodes the given layout node into an [Array] of `dtype`, using `ctx` for child recursion, + /// flat-segment reads, and allocation. 
+ /// + /// @param ctx the decode context bound to the current chunk epoch + /// @param layout the layout node to decode + /// @param dtype logical type the layout decodes to + /// @return the decoded [Array] + Array decode(LayoutDecodeContext ctx, Layout layout, DType dtype); +} diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutRegistry.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutRegistry.java new file mode 100644 index 00000000..5bf9068c --- /dev/null +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutRegistry.java @@ -0,0 +1,114 @@ +package io.github.dfa1.vortex.reader.layout; + +import io.github.dfa1.vortex.core.error.VortexException; +import io.github.dfa1.vortex.core.model.DType; +import io.github.dfa1.vortex.core.model.LayoutId; +import io.github.dfa1.vortex.reader.array.Array; + +import java.util.Collections; +import java.util.Map; +import java.util.TreeMap; + +/// Read-side registry mapping [LayoutId] to [LayoutDecoder] implementations, making layout decode +/// pluggable the way [io.github.dfa1.vortex.reader.ReadRegistry] makes encodings pluggable. +/// +/// Instances are immutable after construction. Build one via [#builder()], or take the four +/// built-ins directly via [#defaults()]. +/// +/// Unlike encodings, layouts register programmatically only — there is no [java.util.ServiceLoader] +/// discovery (mirroring `ExtensionDecoder`). Unknown layout ids fail loudly: there is no +/// allow-unknown mode, matching the Rust reference. +public final class LayoutRegistry { + + // Keyed by the wire string, ordered naturally by it, mirroring ReadRegistry. Decode dispatch + // is keyed, so order is not load-bearing, but a stable order keeps the registries consistent. 
+ private final Map decoders; + + private LayoutRegistry(Map decoders) { + this.decoders = Collections.unmodifiableMap(new TreeMap<>(decoders)); + } + + /// Returns a registry populated with the four built-in layout decoders (flat, chunked, + /// zoned/stats, dict). + /// + /// @return an immutable [LayoutRegistry] with the built-in decoders registered + public static LayoutRegistry defaults() { + return builder().registerDefaults().build(); + } + + /// Returns a new [Builder]. + /// + /// @return a fresh builder + public static Builder builder() { + return new Builder(); + } + + /// Returns `true` if a decoder is registered for the given layout id. + /// + /// @param layoutId the layout id to query + /// @return `true` if a decoder is registered + public boolean hasDecoder(LayoutId layoutId) { + return decoders.containsKey(layoutId.id()); + } + + /// Decodes `layout` into an [Array] of `dtype` by dispatching to the decoder registered for + /// its [LayoutId]. An unregistered id fails loudly. + /// + /// @param ctx the decode context bound to the current chunk epoch + /// @param layout the layout node to decode + /// @param dtype logical type the layout decodes to + /// @return the decoded [Array] + /// @throws VortexException if no decoder is registered for `layout`'s id + public Array decode(LayoutDecodeContext ctx, Layout layout, DType dtype) { + LayoutDecoder decoder = decoders.get(layout.layoutId().id()); + if (decoder == null) { + throw new VortexException("cannot decode layout " + layout.layoutId()); + } + return decoder.decode(ctx, layout, dtype); + } + + /// Builder for [LayoutRegistry]. + /// + /// Not thread-safe. Build once, use everywhere — the produced [LayoutRegistry] is immutable. + public static final class Builder { + + private final Map decoders = new TreeMap<>(); + + private Builder() { + } + + /// Registers a decoder under every id it declares via [LayoutDecoder#layoutIds()], so an + /// aliased decoder (e.g. 
the zoned decoder under both `vortex.zoned` and `vortex.stats`) + /// registers once and resolves under either id. + /// + /// @param decoder the [LayoutDecoder] to register + /// @return this builder, for chaining + /// @throws VortexException if any of the decoder's ids is already registered; the earlier registration is kept + public Builder register(LayoutDecoder decoder) { + for (LayoutId id : decoder.layoutIds()) { + LayoutDecoder old = decoders.putIfAbsent(id.id(), decoder); + if (old != null) { + throw new VortexException("layout decoder %s already registered".formatted(id)); + } + } + return this; + } + + /// Registers the four built-in layout decoders (flat, chunked, zoned/stats, dict). + /// + /// @return this builder, for chaining + public Builder registerDefaults() { + return register(new FlatLayoutDecoder()) + .register(new ChunkedLayoutDecoder()) + .register(new ZonedLayoutDecoder()) + .register(new DictLayoutDecoder()); + } + + /// Builds an immutable [LayoutRegistry]. + /// + /// @return the immutable registry + public LayoutRegistry build() { + return new LayoutRegistry(decoders); + } + } +} diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/ZonedLayoutDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/ZonedLayoutDecoder.java new file mode 100644 index 00000000..3cafc57c --- /dev/null +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/ZonedLayoutDecoder.java @@ -0,0 +1,35 @@ +package io.github.dfa1.vortex.reader.layout; + +import io.github.dfa1.vortex.core.error.VortexException; +import io.github.dfa1.vortex.core.model.DType; +import io.github.dfa1.vortex.core.model.LayoutId; +import io.github.dfa1.vortex.reader.array.Array; + +import java.util.Set; + +/// Built-in decoder for the zone-map layout — both the canonical `vortex.zoned` and its legacy +/// `vortex.stats` alias wrap the data layout as `child[0]`; decode passes straight through to it, +/// the per-zone stats being a pruning optimization handled elsewhere.
Extracted verbatim from the +/// zoned branch of `ScanIterator.decodeLayout`. +final class ZonedLayoutDecoder implements LayoutDecoder { + + @Override + public LayoutId layoutId() { + return LayoutId.ZONED; + } + + @Override + public Set<LayoutId> layoutIds() { + return Set.of(LayoutId.ZONED, LayoutId.STATS); + } + + @Override + public Array decode(LayoutDecodeContext ctx, Layout layout, DType dtype) { + // A zoned node with no children is not a decodable shape — fail loudly with the same + // message the fall-through path produced when the original dispatch matched nothing. + if (layout.children().isEmpty()) { + throw new VortexException("cannot decode layout " + layout.layoutId()); + } + return ctx.decodeChild(layout.children().getFirst(), dtype); + } +} diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/layout/LayoutRegistryTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/layout/LayoutRegistryTest.java new file mode 100644 index 00000000..ebafebb9 --- /dev/null +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/layout/LayoutRegistryTest.java @@ -0,0 +1,176 @@ +package io.github.dfa1.vortex.reader.layout; + +import io.github.dfa1.vortex.core.error.VortexException; +import io.github.dfa1.vortex.core.model.DType; +import io.github.dfa1.vortex.core.model.EncodingId; +import io.github.dfa1.vortex.core.model.LayoutId; +import io.github.dfa1.vortex.core.model.PType; +import io.github.dfa1.vortex.reader.ReadRegistry; +import io.github.dfa1.vortex.reader.ScanOptions; +import io.github.dfa1.vortex.reader.VortexReader; +import io.github.dfa1.vortex.reader.array.Array; +import io.github.dfa1.vortex.reader.array.UnknownArray; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.IOException; +import java.lang.foreign.MemorySegment; +import java.net.URISyntaxException; +import java.nio.file.Path; +import java.util.List; +import java.util.Set; +import
java.util.concurrent.atomic.AtomicInteger; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.Mockito.mock; + +/// Drives the [LayoutRegistry] dispatch surface: default population (including the zoned dual +/// alias), duplicate-registration rejection, the loud unknown-layout failure, and custom-decoder +/// dispatch — the last both at the registry level and end-to-end through a real scan. +class LayoutRegistryTest { + + @ParameterizedTest + @ValueSource(strings = {"vortex.flat", "vortex.chunked", "vortex.zoned", "vortex.stats", "vortex.dict"}) + void defaults_containTheFourBuiltins_bothZonedAliasesResolve(String wireId) { + // Given — the registry populated with the four built-in decoders + LayoutRegistry sut = LayoutRegistry.defaults(); + + // When + boolean result = sut.hasDecoder(LayoutId.parse(wireId)); + + // Then — flat, chunked, dict, and BOTH zoned aliases (vortex.zoned + legacy vortex.stats) + // resolve to a decoder + assertThat(result).isTrue(); + } + + @Test + void register_duplicateId_throwsVortexException() { + // Given — a decoder claiming vortex.flat, an id the defaults already own + LayoutDecoder duplicate = stubDecoder(LayoutId.FLAT, sentinelArray()); + + // When / Then — re-registering an occupied id is rejected loudly + assertThatThrownBy(() -> LayoutRegistry.builder().registerDefaults().register(duplicate)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("already registered") + .hasMessageContaining("vortex.flat"); + } + + @Test + void decode_unknownLayoutId_throwsVortexExceptionWithExactMessage() { + // Given — a layout carrying a Custom id no decoder handles + LayoutRegistry sut = LayoutRegistry.defaults(); + Layout unknown = new Layout(new LayoutId.Custom("acme.frobnicate"), 0L, null, List.of(), List.of()); + LayoutDecodeContext ctx = mock(LayoutDecodeContext.class); + + // When / Then — no allow-unknown mode: the message 
is byte-identical to the pre-SPI throw + assertThatThrownBy(() -> sut.decode(ctx, unknown, primitive())) + .isInstanceOf(VortexException.class) + .hasMessage("cannot decode layout acme.frobnicate"); + } + + @Test + void decode_customLayoutId_dispatchesToRegisteredDecoder() { + // Given — a custom decoder registered under a Custom id, returning a sentinel array + LayoutId customId = new LayoutId.Custom("acme.custom"); + Array sentinel = sentinelArray(); + LayoutRegistry sut = LayoutRegistry.builder() + .registerDefaults() + .register(stubDecoder(customId, sentinel)) + .build(); + Layout custom = new Layout(customId, 0L, null, List.of(), List.of()); + LayoutDecodeContext ctx = mock(LayoutDecodeContext.class); + + // When + Array result = sut.decode(ctx, custom, primitive()); + + // Then — the custom decoder handled the node, proving id-keyed dispatch + assertThat(result).isSameAs(sentinel); + } + + @Test + void openWithCustomRegistry_reachesTheCustomDecoderDuringScan() throws URISyntaxException, IOException { + // Given — a LayoutRegistry whose built-ins are wrapped in counting delegators, so any decode + // dispatch during a real scan is observable. This proves the open(path, reg, layoutReg) + // overload threads the custom registry all the way into ScanIterator's decode path. 
+ AtomicInteger dispatches = new AtomicInteger(); + LayoutRegistry custom = LayoutRegistry.builder() + .register(new CountingLayoutDecoder(new FlatLayoutDecoder(), dispatches)) + .register(new CountingLayoutDecoder(new ChunkedLayoutDecoder(), dispatches)) + .register(new CountingLayoutDecoder(new ZonedLayoutDecoder(), dispatches)) + .register(new CountingLayoutDecoder(new DictLayoutDecoder(), dispatches)) + .build(); + Path fixture = fixtureFile("primitives.vortex"); + + // When — a full scan decodes every chunk through the custom registry + long rows = 0L; + try (var reader = VortexReader.open(fixture, ReadRegistry.loadAll(), custom); + var iter = reader.scan(ScanOptions.all())) { + while (iter.hasNext()) { + try (var chunk = iter.next()) { + rows += chunk.rowCount(); + } + } + } + + // Then — the scan produced rows and routed every layout decode through the custom registry + assertThat(rows).isPositive(); + assertThat(dispatches.get()).isPositive(); + } + + // ── helpers ─────────────────────────────────────────────────────────────── + + private static DType primitive() { + return new DType.Primitive(PType.I32, false); + } + + /// A concrete stand-in [Array] — the sealed interface cannot be mocked, and these tests only + /// need object identity, never its contents. 
+ private static Array sentinelArray() { + return new UnknownArray(EncodingId.parse("stub"), primitive(), 0L, null, + new MemorySegment[0], new Array[0]); + } + + private static LayoutDecoder stubDecoder(LayoutId id, Array result) { + return new LayoutDecoder() { + @Override + public LayoutId layoutId() { + return id; + } + + @Override + public Array decode(LayoutDecodeContext ctx, Layout layout, DType dtype) { + return result; + } + }; + } + + private Path fixtureFile(String name) throws URISyntaxException { + var url = getClass().getResource("/fixtures/" + name); + assertThat(url).as("fixture not found: " + name).isNotNull(); + return Path.of(url.toURI()); + } + + /// Wraps a built-in [LayoutDecoder], tallying every dispatch before delegating, so a real scan + /// can assert its layout decodes flowed through this registry. + private record CountingLayoutDecoder(LayoutDecoder delegate, AtomicInteger count) + implements LayoutDecoder { + + @Override + public LayoutId layoutId() { + return delegate.layoutId(); + } + + @Override + public Set layoutIds() { + return delegate.layoutIds(); + } + + @Override + public Array decode(LayoutDecodeContext ctx, Layout layout, DType dtype) { + count.incrementAndGet(); + return delegate.decode(ctx, layout, dtype); + } + } +} From dd196f17c2cb2f80b91b9eb44a2a2485e00ec186 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 4 Jul 2026 13:39:18 +0200 Subject: [PATCH 07/12] fix(reader): honor custom decoders under chunked; harden SPI edges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review findings on the LayoutDecoder SPI: - ChunkedLayoutDecoder decoded its leaves via a direct static FlatLayoutDecoder call, silently bypassing the registry — a custom decoder registered for a leaf id was not honored under a chunked parent, making the SPI partially decorative. Leaves now route through ctx.decodeChild; the end-to-end test asserts the flat delegator itself fires during a real scan. 
Integration oracle confirms identical behavior for built-ins (dict leaves under chunked included). - ScanLayoutContext.segmentSpec and DictLayoutDecoder child access now guard malformed indexes/arity with VortexException instead of leaking IndexOutOfBoundsException from untrusted input. Co-Authored-By: Claude Fable 5 --- .../dfa1/vortex/reader/ScanIterator.java | 8 +++++++- .../reader/layout/ChunkedLayoutDecoder.java | 6 ++++-- .../reader/layout/DictLayoutDecoder.java | 5 +++++ .../reader/layout/FlatLayoutDecoder.java | 18 +++--------------- .../reader/layout/LayoutRegistryTest.java | 17 ++++++++++------- 5 files changed, 29 insertions(+), 25 deletions(-) diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java b/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java index 8b8df4bf..e0a8a029 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java @@ -824,7 +824,13 @@ public Array decodeFlatSegment(SegmentSpec spec, DType dtype, long rowCount) { @Override public SegmentSpec segmentSpec(int index) { - return file.footer().segmentSpecs().get(index); + List specs = file.footer().segmentSpecs(); + if (index < 0 || index >= specs.size()) { + // Untrusted input: a malformed flat layout may carry any segment index. 
+ throw new VortexException("segment index " + index + + " out of bounds (segmentSpecs.size=" + specs.size() + ")"); + } + return specs.get(index); } } diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/ChunkedLayoutDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/ChunkedLayoutDecoder.java index c9a633a2..be9fb0e2 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/ChunkedLayoutDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/ChunkedLayoutDecoder.java @@ -70,13 +70,15 @@ private static Array decodeChunkedLayout(LayoutDecodeContext ctx, List f throw new VortexException(EncodingId.VORTEX_CHUNKED, "no flat children"); } if (flats.size() == 1) { - return FlatLayoutDecoder.decodeFlat(ctx, flats.getFirst(), dtype); + return ctx.decodeChild(flats.getFirst(), dtype); } // ADR 0012: every primitive ptype gets the zero-copy ChunkedXxxArray shape. // The concat path is gone. var chunkArrays = new ArrayList(flats.size()); for (Layout flat : flats) { - chunkArrays.add(FlatLayoutDecoder.decodeFlat(ctx, flat, dtype)); + // Registry dispatch, not a direct decodeFlat call — a custom decoder registered for + // a leaf's layout id must be honored under a chunked parent too. 
+ chunkArrays.add(ctx.decodeChild(flat, dtype)); } if (dtype instanceof DType.Bool) { return ChunkedBoolArray.of(dtype, totalRows, chunkArrays); diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/DictLayoutDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/DictLayoutDecoder.java index a1de4888..ce5edee0 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/DictLayoutDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/DictLayoutDecoder.java @@ -45,6 +45,11 @@ public Array decode(LayoutDecodeContext ctx, Layout dictLayout, DType dtype) { PType codesPType = readDictLayoutCodesPType(rawMeta); // child[0] = values layout; child[1] = codes layout + if (dictLayout.children().size() < 2) { + // Untrusted input: a malformed dict layout may carry any child count. + throw new VortexException(EncodingId.VORTEX_DICT, + "expected 2 children (values, codes), got " + dictLayout.children().size()); + } Layout valuesLayout = dictLayout.children().get(0); Layout codesLayout = dictLayout.children().get(1); long n = codesLayout.rowCount(); diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/FlatLayoutDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/FlatLayoutDecoder.java index 90db9d7f..4e97f92f 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/FlatLayoutDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/FlatLayoutDecoder.java @@ -17,23 +17,11 @@ public LayoutId layoutId() { @Override public Array decode(LayoutDecodeContext ctx, Layout layout, DType dtype) { - return decodeFlat(ctx, layout, dtype); - } - - /// Decodes one flat leaf: resolves its single segment index to a [SegmentSpec] and delegates - /// to the file handle. Shared with [ChunkedLayoutDecoder], which decodes each of its collected - /// leaves through exactly this path (not registry dispatch), preserving the original behavior. 
- /// - /// @param ctx the decode context - /// @param flat the flat layout node - /// @param dtype logical type of the decoded array - /// @return the decoded [Array] - static Array decodeFlat(LayoutDecodeContext ctx, Layout flat, DType dtype) { - if (flat.segments().isEmpty()) { + if (layout.segments().isEmpty()) { throw new VortexException("no segments"); } - int segIdx = flat.segments().getFirst(); + int segIdx = layout.segments().getFirst(); SegmentSpec spec = ctx.segmentSpec(segIdx); - return ctx.decodeFlatSegment(spec, dtype, flat.rowCount()); + return ctx.decodeFlatSegment(spec, dtype, layout.rowCount()); } } diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/layout/LayoutRegistryTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/layout/LayoutRegistryTest.java index ebafebb9..5211085f 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/layout/LayoutRegistryTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/layout/LayoutRegistryTest.java @@ -94,12 +94,13 @@ void openWithCustomRegistry_reachesTheCustomDecoderDuringScan() throws URISyntax // Given — a LayoutRegistry whose built-ins are wrapped in counting delegators, so any decode // dispatch during a real scan is observable. This proves the open(path, reg, layoutReg) // overload threads the custom registry all the way into ScanIterator's decode path. 
- AtomicInteger dispatches = new AtomicInteger(); + AtomicInteger flatDispatches = new AtomicInteger(); + AtomicInteger otherDispatches = new AtomicInteger(); LayoutRegistry custom = LayoutRegistry.builder() - .register(new CountingLayoutDecoder(new FlatLayoutDecoder(), dispatches)) - .register(new CountingLayoutDecoder(new ChunkedLayoutDecoder(), dispatches)) - .register(new CountingLayoutDecoder(new ZonedLayoutDecoder(), dispatches)) - .register(new CountingLayoutDecoder(new DictLayoutDecoder(), dispatches)) + .register(new CountingLayoutDecoder(new FlatLayoutDecoder(), flatDispatches)) + .register(new CountingLayoutDecoder(new ChunkedLayoutDecoder(), otherDispatches)) + .register(new CountingLayoutDecoder(new ZonedLayoutDecoder(), otherDispatches)) + .register(new CountingLayoutDecoder(new DictLayoutDecoder(), otherDispatches)) .build(); Path fixture = fixtureFile("primitives.vortex"); @@ -114,9 +115,11 @@ void openWithCustomRegistry_reachesTheCustomDecoderDuringScan() throws URISyntax } } - // Then — the scan produced rows and routed every layout decode through the custom registry + // Then — the scan produced rows, and the FLAT delegator itself fired: leaf decode under a + // container layout goes through registry dispatch too (a custom flat decoder is honored), + // not through a direct built-in call that would bypass the registry. 
assertThat(rows).isPositive(); - assertThat(dispatches.get()).isPositive(); + assertThat(flatDispatches.get()).isPositive(); } // ── helpers ─────────────────────────────────────────────────────────────── From 10ad42644bd5a9e75dff25a58741c7235bde9e9d Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 4 Jul 2026 13:39:53 +0200 Subject: [PATCH 08/12] docs(changelog): LayoutDecoder SPI Co-Authored-By: Claude Fable 5 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68a0fe40..7f908912 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `Compute.filteredSum(filterColumn, predicate, aggColumn)` fuses a filter and a sum into a single scan — a row folds into the total only when the predicate selects it (a null filter row is excluded) and the aggregate value is non-null — with no intermediate selection bitmap. It matches a hand-written fused loop and is ~1.5× faster than the two-pass `filter` + `sum`. ([57d2225b](https://github.com/dfa1/vortex-java/commit/57d2225b)) - `Compute.filteredAggregate(chunk, filter, aggColumn)` fuses a whole multi-column `RowFilter` (an n-ary `AND` of column-bound predicate leaves) and folds the selected rows' `SUM`/`MIN`/`MAX`/non-null count over an aggregate column in a single pass — the multi-column counterpart of `filteredSum`, and the row-level kernel behind the Calcite boundary-chunk aggregate push-down. A `null` aggregate column counts selected rows only (`COUNT(*)`). ([2ba54888](https://github.com/dfa1/vortex-java/commit/2ba54888)) - `core.model.LayoutId` — typed layout identity with the same sealed shape as `EncodingId` (`WellKnown` constants plus `Custom`; layouts are runtime-pluggable in the reference implementation). 
The reader now recognizes `vortex.zoned`, the current canonical zone-map layout id in the Rust reference, alongside the legacy `vortex.stats` alias it keeps writing — files from current Rust writers scan and prune correctly. ([7df3a0db](https://github.com/dfa1/vortex-java/commit/7df3a0db)) +- Layout decode is pluggable: `LayoutDecoder` + `LayoutRegistry` (`reader.layout`) mirror the encoding registry — `LayoutRegistry.builder().registerDefaults().register(custom).build()` passed to the new `VortexReader.open(path, readRegistry, layoutRegistry)` / `VortexHttpReader` overloads dispatches every layout decode, container children included, through the registry. Programmatic registration only (no service file); unknown layouts fail loudly. Zone-map pruning and filtered scans recognize the built-in layouts only. ([fc488d04](https://github.com/dfa1/vortex-java/commit/fc488d04), [dd196f17](https://github.com/dfa1/vortex-java/commit/dd196f17)) ### Changed From 3a657a320d8a7d8d49b5bfe5533f5ee32e0be4e2 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 4 Jul 2026 13:48:55 +0200 Subject: [PATCH 09/12] refactor(reader): FlatSegmentDecoder is SerializedArrayDecoder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Renamed after its exact Rust counterpart (SerializedArray in vortex-array/src/serde.rs: "a parsed but not-yet-decoded deserialized array" whose decode() resolves the encoding id against the spec table and consults the registry). "Flat" is a layout concept and "segment" a byte-range concept — the unit this class decodes is one serialized array message. VortexHandle's decodeFlatSegment follows as decodeSegment, next to rawSegment. Pitest target FQN and living docs updated; released changelog entries and ADR 0001 stay as written. 
Co-Authored-By: Claude Fable 5 --- .../dfa1/vortex/cli/tui/VortexInspectorTui.java | 4 ++-- docs/testing.md | 2 +- reader/pom.xml | 2 +- .../github/dfa1/vortex/reader/ScanIterator.java | 6 +++--- ...tDecoder.java => SerializedArrayDecoder.java} | 4 ++-- .../github/dfa1/vortex/reader/VortexHandle.java | 2 +- .../dfa1/vortex/reader/VortexHttpReader.java | 4 ++-- .../github/dfa1/vortex/reader/VortexReader.java | 4 ++-- .../vortex/reader/layout/FlatLayoutDecoder.java | 2 +- .../reader/layout/LayoutDecodeContext.java | 2 +- .../reader/ArrayNodeDepthBombSecurityTest.java | 10 +++++----- ...va => SerializedArrayBoundsSecurityTest.java} | 4 ++-- ...ava => SerializedArrayDecoderDecodeTest.java} | 8 ++++---- .../dfa1/vortex/writer/WriterZoneMapTest.java | 16 ++++++++-------- 14 files changed, 35 insertions(+), 35 deletions(-) rename reader/src/main/java/io/github/dfa1/vortex/reader/{FlatSegmentDecoder.java => SerializedArrayDecoder.java} (98%) rename reader/src/test/java/io/github/dfa1/vortex/reader/{FlatSegmentBoundsSecurityTest.java => SerializedArrayBoundsSecurityTest.java} (96%) rename reader/src/test/java/io/github/dfa1/vortex/reader/{FlatSegmentDecoderDecodeTest.java => SerializedArrayDecoderDecodeTest.java} (93%) diff --git a/cli/src/main/java/io/github/dfa1/vortex/cli/tui/VortexInspectorTui.java b/cli/src/main/java/io/github/dfa1/vortex/cli/tui/VortexInspectorTui.java index 57983efe..fe011c2c 100644 --- a/cli/src/main/java/io/github/dfa1/vortex/cli/tui/VortexInspectorTui.java +++ b/cli/src/main/java/io/github/dfa1/vortex/cli/tui/VortexInspectorTui.java @@ -607,7 +607,7 @@ private void runDictLoad(InspectorTree.Node dictNode) { int segIdx = values.segments().getFirst(); SegmentSpec spec = tree.segmentSpecs().get(segIdx); io.github.dfa1.vortex.reader.array.Array arr = - handle.decodeFlatSegment(spec, dtype, values.rowCount(), arena); + handle.decodeSegment(spec, dtype, values.rowCount(), arena); int n = (int) Math.min(arr.length(), DATA_PREVIEW_ROWS); List out = new 
ArrayList<>(n); for (int i = 0; i < n; i++) { @@ -722,7 +722,7 @@ private List decodeStatsFlat( } int segIdx = flat.segments().getFirst(); SegmentSpec spec = tree.segmentSpecs().get(segIdx); - Array arr = handle.decodeFlatSegment(spec, statsDtype, flat.rowCount(), arena); + Array arr = handle.decodeSegment(spec, statsDtype, flat.rowCount(), arena); return InspectorRender.formatStatsArray(arr, statsDtype); } diff --git a/docs/testing.md b/docs/testing.md index 0e8fb9e5..ac4a12f8 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -113,7 +113,7 @@ bounds/parse classes** — not the whole codebase. It measures whether the tests ```bash ./mvnw -pl core -P pitest verify # IoBounds, PTypeIO ./mvnw -pl reader -am -P pitest verify -DskipITs # Footer, Trailer, PostscriptParser, - # SegmentSpec, Layout, FlatSegmentDecoder + # SegmentSpec, Layout, SerializedArrayDecoder ./mvnw -pl writer -am -P pitest verify -DskipITs # ChunkImpl, WriteRegistry ``` diff --git a/reader/pom.xml b/reader/pom.xml index 5753227e..043143eb 100644 --- a/reader/pom.xml +++ b/reader/pom.xml @@ -90,7 +90,7 @@ io.github.dfa1.vortex.reader.PostscriptParser io.github.dfa1.vortex.reader.SegmentSpec io.github.dfa1.vortex.reader.layout.Layout - io.github.dfa1.vortex.reader.FlatSegmentDecoder + io.github.dfa1.vortex.reader.SerializedArrayDecoder diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java b/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java index e0a8a029..73da1de1 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/ScanIterator.java @@ -410,7 +410,7 @@ private List decodeZoneTable(String column) { long nZones = statsFlat.rowCount(); SegmentSpec spec = file.footer().segmentSpecs().get(segIdx); try (Arena tableArena = Arena.ofConfined()) { - Array decoded = file.decodeFlatSegment(spec, statsDtype, nZones, tableArena); + Array decoded = file.decodeSegment(spec, statsDtype, 
nZones, tableArena); if (!(decoded instanceof StructArray table)) { return null; } @@ -818,8 +818,8 @@ public Array decodeChild(Layout child, DType dtype) { } @Override - public Array decodeFlatSegment(SegmentSpec spec, DType dtype, long rowCount) { - return file.decodeFlatSegment(spec, dtype, rowCount, arena); + public Array decodeSegment(SegmentSpec spec, DType dtype, long rowCount) { + return file.decodeSegment(spec, dtype, rowCount, arena); } @Override diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/FlatSegmentDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/SerializedArrayDecoder.java similarity index 98% rename from reader/src/main/java/io/github/dfa1/vortex/reader/FlatSegmentDecoder.java rename to reader/src/main/java/io/github/dfa1/vortex/reader/SerializedArrayDecoder.java index ba5b2777..17a6cf16 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/FlatSegmentDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/SerializedArrayDecoder.java @@ -23,7 +23,7 @@ /// /// [ReadRegistry] is pure dispatch; this class owns all file-format knowledge: /// FlatBuffer parsing, buffer-offset arithmetic, and encoding-spec lookup. -final class FlatSegmentDecoder { +final class SerializedArrayDecoder { /// Hard cap on array-node recursion depth. The encoded array tree nests through child nodes /// (validity, patches, run-ends, dictionary codes/values, …); a crafted or self-referential @@ -37,7 +37,7 @@ final class FlatSegmentDecoder { /// Creates a decoder backed by the given registry. 
/// /// @param registry the registry used to dispatch to concrete decoder impls - public FlatSegmentDecoder(ReadRegistry registry) { + public SerializedArrayDecoder(ReadRegistry registry) { this.registry = registry; } diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java index 4a11d66d..91dc871a 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHandle.java @@ -32,7 +32,7 @@ public interface VortexHandle extends Closeable { /// @param rowCount number of logical rows in the segment /// @param arena allocator for decode output; lifetime matches the caller's chunk epoch /// @return the decoded array - Array decodeFlatSegment(SegmentSpec spec, DType dtype, long rowCount, SegmentAllocator arena); + Array decodeSegment(SegmentSpec spec, DType dtype, long rowCount, SegmentAllocator arena); /// Returns a read-only view of the bytes backing the given segment spec. /// Writes through the returned segment throw `UnsupportedOperationException`. 
diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java index eaa0c460..a0c1c880 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexHttpReader.java @@ -288,13 +288,13 @@ public long fileSize() { // ── HTTP helpers ────────────────────────────────────────────────────────── @Override - public io.github.dfa1.vortex.reader.array.Array decodeFlatSegment( + public io.github.dfa1.vortex.reader.array.Array decodeSegment( io.github.dfa1.vortex.reader.SegmentSpec spec, DType dtype, long rowCount, java.lang.foreign.SegmentAllocator arenaOut ) { MemorySegment seg = rawSegment(spec); - return new FlatSegmentDecoder(registry) + return new SerializedArrayDecoder(registry) .decode(seg, footer.arraySpecs(), dtype, rowCount, arenaOut); } diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java index f9a2016e..d0fd0ff5 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/VortexReader.java @@ -317,13 +317,13 @@ private ArrayStats readFlatStats(Layout flat) { } @Override - public io.github.dfa1.vortex.reader.array.Array decodeFlatSegment( + public io.github.dfa1.vortex.reader.array.Array decodeSegment( io.github.dfa1.vortex.reader.SegmentSpec spec, DType dtype, long rowCount, java.lang.foreign.SegmentAllocator arena ) { MemorySegment seg = IoBounds.slice(fileSegment, spec.offset(), spec.length()).asReadOnly(); - return new FlatSegmentDecoder(registry) + return new SerializedArrayDecoder(registry) .decode(seg, footer.arraySpecs(), dtype, rowCount, arena); } diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/FlatLayoutDecoder.java 
b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/FlatLayoutDecoder.java index 4e97f92f..dde1f476 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/FlatLayoutDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/FlatLayoutDecoder.java @@ -22,6 +22,6 @@ public Array decode(LayoutDecodeContext ctx, Layout layout, DType dtype) { } int segIdx = layout.segments().getFirst(); SegmentSpec spec = ctx.segmentSpec(segIdx); - return ctx.decodeFlatSegment(spec, dtype, layout.rowCount()); + return ctx.decodeSegment(spec, dtype, layout.rowCount()); } } diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutDecodeContext.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutDecodeContext.java index 6d2e631e..8c06435c 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutDecodeContext.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutDecodeContext.java @@ -31,7 +31,7 @@ public interface LayoutDecodeContext { /// @param dtype logical type of the decoded array /// @param rowCount number of logical rows in the segment /// @return the decoded [Array] - Array decodeFlatSegment(SegmentSpec spec, DType dtype, long rowCount); + Array decodeSegment(SegmentSpec spec, DType dtype, long rowCount); /// Resolves a segment index (as stored on a flat [Layout]) to its [SegmentSpec] in the /// file's segment table. 
diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/ArrayNodeDepthBombSecurityTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/ArrayNodeDepthBombSecurityTest.java index 682d99c2..d39c5511 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/ArrayNodeDepthBombSecurityTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/ArrayNodeDepthBombSecurityTest.java @@ -15,10 +15,10 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; /// Adversarial tests for the encoded-array-tree recursion in -/// [FlatSegmentDecoder]'s `convertArrayNode`. +/// [SerializedArrayDecoder]'s `convertArrayNode`. /// /// The decoder walks the array node tree recursively (validity, patches, run-ends, dictionary -/// codes/values, …). Without the [FlatSegmentDecoder#MAX_ARRAY_TREE_DEPTH] cap a crafted segment +/// codes/values, …). Without the [SerializedArrayDecoder#MAX_ARRAY_TREE_DEPTH] cap a crafted segment /// with thousands of nested children produces a [StackOverflowError] — an `Error` that escapes the /// "malformed input must surface as [VortexException]" contract (ADR 0003). /// @@ -29,7 +29,7 @@ class ArrayNodeDepthBombSecurityTest { private static final DType DTYPE = DType.I32; - private final FlatSegmentDecoder sut = new FlatSegmentDecoder(ReadRegistry.empty()); + private final SerializedArrayDecoder sut = new SerializedArrayDecoder(ReadRegistry.empty()); @Test void arrayTreeAtDepthLimit_clearsGuard() { @@ -38,7 +38,7 @@ void arrayTreeAtDepthLimit_clearsGuard() { // depth == limit there, and `limit > limit` is false. The walk completes and only then // fails because the empty registry has no decoder. Kills `depth >` relaxed to `>=`, // which would wrongly reject this legal max-depth tree with the depth message. 
- byte[] fb = deeplyNestedArrayFlatBuffer(FlatSegmentDecoder.MAX_ARRAY_TREE_DEPTH); + byte[] fb = deeplyNestedArrayFlatBuffer(SerializedArrayDecoder.MAX_ARRAY_TREE_DEPTH); MemorySegment seg = wrapAsSegment(fb, arena); // When / Then @@ -53,7 +53,7 @@ void arrayTreeOneOverDepthLimit_throwsVortexException() { try (Arena arena = Arena.ofConfined()) { // Given — one level deeper: the deepest node reaches limit + 1, tripping the guard // before any StackOverflowError can escape. - byte[] fb = deeplyNestedArrayFlatBuffer(FlatSegmentDecoder.MAX_ARRAY_TREE_DEPTH + 1); + byte[] fb = deeplyNestedArrayFlatBuffer(SerializedArrayDecoder.MAX_ARRAY_TREE_DEPTH + 1); MemorySegment seg = wrapAsSegment(fb, arena); // When / Then — must surface as VortexException, not StackOverflowError diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/FlatSegmentBoundsSecurityTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/SerializedArrayBoundsSecurityTest.java similarity index 96% rename from reader/src/test/java/io/github/dfa1/vortex/reader/FlatSegmentBoundsSecurityTest.java rename to reader/src/test/java/io/github/dfa1/vortex/reader/SerializedArrayBoundsSecurityTest.java index 9eeb49cf..803c48ca 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/FlatSegmentBoundsSecurityTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/SerializedArrayBoundsSecurityTest.java @@ -21,12 +21,12 @@ /// Both the trailing length field and each buffer descriptor's offset/length come straight /// from untrusted file bytes. After ADR 0003 Phase E, every malformed value must surface as /// a [VortexException], never a raw `IndexOutOfBoundsException` from `MemorySegment.asSlice`. 
-class FlatSegmentBoundsSecurityTest { +class SerializedArrayBoundsSecurityTest { private static final DType DTYPE = DType.I32; - private final FlatSegmentDecoder sut = new FlatSegmentDecoder(ReadRegistry.empty()); + private final SerializedArrayDecoder sut = new SerializedArrayDecoder(ReadRegistry.empty()); @Test void segmentSmallerThanLengthField_throwsVortexException() { diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/FlatSegmentDecoderDecodeTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/SerializedArrayDecoderDecodeTest.java similarity index 93% rename from reader/src/test/java/io/github/dfa1/vortex/reader/FlatSegmentDecoderDecodeTest.java rename to reader/src/test/java/io/github/dfa1/vortex/reader/SerializedArrayDecoderDecodeTest.java index 1014a6dd..b6b01cef 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/FlatSegmentDecoderDecodeTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/SerializedArrayDecoderDecodeTest.java @@ -17,7 +17,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.catchThrowable; -/// Successful flat-segment decode path — complements [FlatSegmentBoundsSecurityTest] (which only +/// Successful flat-segment decode path — complements [SerializedArrayBoundsSecurityTest] (which only /// drives the rejection paths). A buffer descriptor with non-zero padding exercises the offset /// walk, and an unknown encoding id exercises the unknown-id passthrough through an /// allow-unknown registry. Together these pin two otherwise-untested spots: @@ -25,12 +25,12 @@ /// negative offset and fails, so a clean decode proves the addition. /// - the unknown-id node construction: mishandling an unresolvable id there yields a /// node the decode would not turn into an `UnknownArray`. 
-class FlatSegmentDecoderDecodeTest { +class SerializedArrayDecoderDecodeTest { @Test void decode_unknownEncodingWithBufferPadding_returnsUnknownArray() { ReadRegistry registry = ReadRegistry.builder().allowUnknown().build(); - FlatSegmentDecoder sut = new FlatSegmentDecoder(registry); + SerializedArrayDecoder sut = new SerializedArrayDecoder(registry); try (Arena arena = Arena.ofConfined()) { // Given — a flat segment whose single buffer carries 8 bytes of leading padding and @@ -56,7 +56,7 @@ void decode_unknownEncodingWithBufferPadding_returnsUnknownArray() { @Test void decode_blankEncodingId_throwsVortexException() { ReadRegistry registry = ReadRegistry.builder().allowUnknown().build(); - FlatSegmentDecoder sut = new FlatSegmentDecoder(registry); + SerializedArrayDecoder sut = new SerializedArrayDecoder(registry); try (Arena arena = Arena.ofConfined()) { // Given — a zero-length FlatBuffer string in the spec table decodes to "", which diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java index 9e9cb2ac..61c82cdc 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/WriterZoneMapTest.java @@ -108,7 +108,7 @@ void zoneMaps_statsPayloadDecodesPerZoneMinMax(@TempDir Path tmp) throws IOExcep Layout zonesFlat = reader.layout().children().get(0).children().get(1); SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); try (Arena arena = Arena.ofConfined()) { - StructArray stats = (StructArray) reader.decodeFlatSegment(spec, numericStatsTableDtype(), 3, arena); + StructArray stats = (StructArray) reader.decodeSegment(spec, numericStatsTableDtype(), 3, arena); LongArray max = (LongArray) ((MaskedArray) stats.field("max")).inner(); LongArray min = (LongArray) ((MaskedArray) stats.field("min")).inner(); LongArray sum = (LongArray) ((MaskedArray) 
stats.field("sum")).inner(); @@ -153,7 +153,7 @@ void zoneMaps_nullableColumn_recordsPerZoneNullCount(@TempDir Path tmp) throws I Layout zonesFlat = reader.layout().children().get(0).children().get(1); SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); try (Arena arena = Arena.ofConfined()) { - StructArray stats = (StructArray) reader.decodeFlatSegment(spec, numericStatsTableDtype(), 2, arena); + StructArray stats = (StructArray) reader.decodeSegment(spec, numericStatsTableDtype(), 2, arena); LongArray nullCount = (LongArray) ((MaskedArray) stats.field("null_count")).inner(); LongArray sum = (LongArray) ((MaskedArray) stats.field("sum")).inner(); @@ -259,7 +259,7 @@ void utf8Column_emitsStringMinMaxZoneMap(@TempDir Path tmp) throws IOException { SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); try (Arena arena = Arena.ofConfined()) { StructArray stats = - (StructArray) reader.decodeFlatSegment(spec, utf8StatsTableDtype(), 2, arena); + (StructArray) reader.decodeSegment(spec, utf8StatsTableDtype(), 2, arena); VarBinArray max = (VarBinArray) ((MaskedArray) stats.field("max")).inner(); VarBinArray min = (VarBinArray) ((MaskedArray) stats.field("min")).inner(); assertThat(min.getString(0)).isEqualTo("apple"); @@ -296,7 +296,7 @@ void extensionColumn_emitsStorageMinMaxZoneMap(@TempDir Path tmp) throws IOExcep Layout zonesFlat = column.children().get(1); SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); try (Arena arena = Arena.ofConfined()) { - StructArray stats = (StructArray) reader.decodeFlatSegment(spec, statsTableDtype(), 2, arena); + StructArray stats = (StructArray) reader.decodeSegment(spec, statsTableDtype(), 2, arena); LongArray max = (LongArray) ((MaskedArray) stats.field("max")).inner(); LongArray min = (LongArray) ((MaskedArray) stats.field("min")).inner(); assertThat(min.getLong(0)).isEqualTo(10); @@ -335,7 +335,7 @@ void 
dictColumn_emitsStringMinMaxZoneMapWrappingDict(@TempDir Path tmp) throws I SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); try (Arena arena = Arena.ofConfined()) { StructArray stats = - (StructArray) reader.decodeFlatSegment(spec, utf8StatsTableDtype(), 2, arena); + (StructArray) reader.decodeSegment(spec, utf8StatsTableDtype(), 2, arena); VarBinArray max = (VarBinArray) ((MaskedArray) stats.field("max")).inner(); VarBinArray min = (VarBinArray) ((MaskedArray) stats.field("min")).inner(); assertThat(min.getString(0)).isEqualTo("a"); @@ -371,7 +371,7 @@ void primitiveDictColumn_emitsNumericMinMaxZoneMapWrappingDict(@TempDir Path tmp Layout zonesFlat = column.children().get(1); SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); try (Arena arena = Arena.ofConfined()) { - StructArray stats = (StructArray) reader.decodeFlatSegment(spec, numericStatsTableDtype(), 2, arena); + StructArray stats = (StructArray) reader.decodeSegment(spec, numericStatsTableDtype(), 2, arena); LongArray max = (LongArray) ((MaskedArray) stats.field("max")).inner(); LongArray min = (LongArray) ((MaskedArray) stats.field("min")).inner(); LongArray sum = (LongArray) ((MaskedArray) stats.field("sum")).inner(); @@ -410,7 +410,7 @@ void zoneMaps_f64StatsPayloadDecodesPerZoneMinMaxSum(@TempDir Path tmp) throws I Layout zonesFlat = reader.layout().children().get(0).children().get(1); SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); try (Arena arena = Arena.ofConfined()) { - StructArray stats = (StructArray) reader.decodeFlatSegment(spec, f64StatsTableDtype(), 3, arena); + StructArray stats = (StructArray) reader.decodeSegment(spec, f64StatsTableDtype(), 3, arena); DoubleArray max = (DoubleArray) ((MaskedArray) stats.field("max")).inner(); DoubleArray min = (DoubleArray) ((MaskedArray) stats.field("min")).inner(); DoubleArray sum = (DoubleArray) ((MaskedArray) 
stats.field("sum")).inner(); @@ -446,7 +446,7 @@ void zoneMaps_perTypeStatsDecodePerZoneMinMax(PType ptype, @TempDir Path tmp) th Layout zonesFlat = reader.layout().children().get(0).children().get(1); SegmentSpec spec = reader.footer().segmentSpecs().get(zonesFlat.segments().getFirst()); try (Arena arena = Arena.ofConfined()) { - StructArray stats = (StructArray) reader.decodeFlatSegment(spec, perTypeStatsTableDtype(ptype), 2, arena); + StructArray stats = (StructArray) reader.decodeSegment(spec, perTypeStatsTableDtype(ptype), 2, arena); MemorySegment min = ((MaskedArray) stats.field("min")).inner().materialize(arena); MemorySegment max = ((MaskedArray) stats.field("max")).inner().materialize(arena); assertThat(readStat(min, ptype, 0)).as("min zone 0").isEqualTo(0.0); From 030061db651c47723b6ff39577d4b399329007b8 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 4 Jul 2026 13:49:44 +0200 Subject: [PATCH 10/12] docs(reader): cite the Rust SerializedArray counterpart Co-Authored-By: Claude Fable 5 --- .../dfa1/vortex/reader/SerializedArrayDecoder.java | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/SerializedArrayDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/SerializedArrayDecoder.java index 17a6cf16..c6a0003b 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/SerializedArrayDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/SerializedArrayDecoder.java @@ -15,10 +15,15 @@ import java.lang.foreign.SegmentAllocator; import java.util.List; -/// Parses a flat segment from the memory-mapped file region and dispatches to the -/// appropriate decoder via the [ReadRegistry]. +/// Decodes one serialized array message — the wire unit a flat layout's segment stores — by +/// parsing its FlatBuffer scaffolding and dispatching to the appropriate decoder via the +/// [ReadRegistry]. 
/// -/// Flat segment wire format: +/// Named after the Rust reference's `SerializedArray` (`vortex-array/src/serde.rs`): "a parsed +/// but not-yet-decoded deserialized array" whose `decode` resolves the encoding id against the +/// spec table and consults the registry — exactly this class's contract. +/// +/// Serialized array wire format: /// `buffer_data... | FlatBuffer(Array) | u32 LE = FlatBuffer byte length` /// /// [ReadRegistry] is pure dispatch; this class owns all file-format knowledge: From 2ca759d1ed37a5838a9210aaa53637e9bb425379 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 4 Jul 2026 13:55:59 +0200 Subject: [PATCH 11/12] refactor(reader): registries keyed by typed ids MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ReadRegistry and LayoutRegistry map keys become EncodingId/LayoutId: since ArrayNode and Layout carry parsed typed ids, string-keyed dispatch just round-tripped through the wire form. Strings at the boundary, types inside. TreeMap orders by wire string via comparator — the sealed ids are not Comparable, and a Custom key must not throw. Co-Authored-By: Claude Fable 5 --- .../dfa1/vortex/reader/ReadRegistry.java | 26 +++++++++++-------- .../vortex/reader/layout/LayoutRegistry.java | 23 +++++++++------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/ReadRegistry.java b/reader/src/main/java/io/github/dfa1/vortex/reader/ReadRegistry.java index 4f541ffa..af71adb1 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/ReadRegistry.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/ReadRegistry.java @@ -10,6 +10,7 @@ import java.lang.foreign.MemorySegment; import java.util.Collections; +import java.util.Comparator; import java.util.Map; import java.util.ServiceLoader; import java.util.TreeMap; @@ -20,14 +21,17 @@ /// via the [#loadAll()] and [#empty()] convenience factories. 
public final class ReadRegistry { - private final Map decoders; + // Keyed by the typed id — ArrayNode already carries a parsed EncodingId, so dispatch never + // round-trips through the wire string ("strings at the boundary, types inside"). Ordered by + // that string (EncodingId is not Comparable; a Custom key must not throw); order is not + // load-bearing, but stable ordering keeps the registries consistent. + private final Map decoders; private final boolean allowUnknown; - private ReadRegistry(Map decoders, boolean allowUnknown) { - // Keyed by the wire string, ordered naturally by it, mirroring WriteRegistry. Decode - // dispatch is keyed, so order is not load-bearing here, but a stable order keeps the two - // registries consistent. - this.decoders = Collections.unmodifiableMap(new TreeMap<>(decoders)); + private ReadRegistry(Map decoders, boolean allowUnknown) { + var sorted = new TreeMap(Comparator.comparing(EncodingId::id)); + sorted.putAll(decoders); + this.decoders = Collections.unmodifiableMap(sorted); this.allowUnknown = allowUnknown; } @@ -65,7 +69,7 @@ public boolean isAllowUnknown() { /// @param encodingId the encoding id to query /// @return `true` if a decoder is registered public boolean hasDecoder(EncodingId encodingId) { - return decoders.containsKey(encodingId.id()); + return decoders.containsKey(encodingId); } /// Decodes the array described by `ctx`. 
@@ -74,7 +78,7 @@ public boolean hasDecoder(EncodingId encodingId) { /// @return the decoded [Array] public Array decode(DecodeContext ctx) { ArrayNode node = ctx.node(); - EncodingDecoder decoder = decoders.get(node.encodingId().id()); + EncodingDecoder decoder = decoders.get(node.encodingId()); if (decoder != null) { return decoder.decode(ctx); } @@ -90,7 +94,7 @@ public Array decode(DecodeContext ctx) { /// @return the primary [MemorySegment] of the decoded array public MemorySegment decodeAsSegment(DecodeContext ctx) { ArrayNode node = ctx.node(); - EncodingDecoder decoder = decoders.get(node.encodingId().id()); + EncodingDecoder decoder = decoders.get(node.encodingId()); if (decoder != null) { return decoder.decode(ctx).materialize(ctx.arena()); } @@ -121,7 +125,7 @@ private static UnknownArray decodeUnknown(DecodeContext ctx, ArrayNode node) { /// Not thread-safe. Build once, use everywhere — the produced [ReadRegistry] is immutable. public static final class Builder { - private final Map decoders = new TreeMap<>(); + private final Map decoders = new TreeMap<>(Comparator.comparing(EncodingId::id)); private boolean allowUnknown = false; private Builder() { @@ -133,7 +137,7 @@ private Builder() { /// @return this builder, for chaining /// @throws VortexException if a decoder for the same id is already registered public Builder register(EncodingDecoder decoder) { - EncodingDecoder old = decoders.put(decoder.encodingId().id(), decoder); + EncodingDecoder old = decoders.put(decoder.encodingId(), decoder); if (old != null) { throw new VortexException("decoder %s already registered".formatted(decoder.encodingId())); } diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutRegistry.java b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutRegistry.java index 5bf9068c..7efb4179 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutRegistry.java +++ 
b/reader/src/main/java/io/github/dfa1/vortex/reader/layout/LayoutRegistry.java @@ -6,6 +6,7 @@ import io.github.dfa1.vortex.reader.array.Array; import java.util.Collections; +import java.util.Comparator; import java.util.Map; import java.util.TreeMap; @@ -20,12 +21,16 @@ /// allow-unknown mode, matching the Rust reference. public final class LayoutRegistry { - // Keyed by the wire string, ordered naturally by it, mirroring ReadRegistry. Decode dispatch - // is keyed, so order is not load-bearing, but a stable order keeps the registries consistent. - private final Map decoders; + // Keyed by the typed id — Layout already carries a parsed LayoutId, so dispatch never + // round-trips through the wire string. Ordered by that string (LayoutId is not Comparable; + // a Custom key must not throw): order is not load-bearing, but stable ordering keeps the + // registries consistent. + private final Map decoders; - private LayoutRegistry(Map decoders) { - this.decoders = Collections.unmodifiableMap(new TreeMap<>(decoders)); + private LayoutRegistry(Map decoders) { + var sorted = new TreeMap(Comparator.comparing(LayoutId::id)); + sorted.putAll(decoders); + this.decoders = Collections.unmodifiableMap(sorted); } /// Returns a registry populated with the four built-in layout decoders (flat, chunked, @@ -48,7 +53,7 @@ public static Builder builder() { /// @param layoutId the layout id to query /// @return `true` if a decoder is registered public boolean hasDecoder(LayoutId layoutId) { - return decoders.containsKey(layoutId.id()); + return decoders.containsKey(layoutId); } /// Decodes `layout` into an [Array] of `dtype` by dispatching to the decoder registered for @@ -60,7 +65,7 @@ public boolean hasDecoder(LayoutId layoutId) { /// @return the decoded [Array] /// @throws VortexException if no decoder is registered for `layout`'s id public Array decode(LayoutDecodeContext ctx, Layout layout, DType dtype) { - LayoutDecoder decoder = decoders.get(layout.layoutId().id()); + 
LayoutDecoder decoder = decoders.get(layout.layoutId()); if (decoder == null) { throw new VortexException("cannot decode layout " + layout.layoutId()); } @@ -72,7 +77,7 @@ public Array decode(LayoutDecodeContext ctx, Layout layout, DType dtype) { /// Not thread-safe. Build once, use everywhere — the produced [LayoutRegistry] is immutable. public static final class Builder { - private final Map decoders = new TreeMap<>(); + private final Map decoders = new TreeMap<>(Comparator.comparing(LayoutId::id)); private Builder() { } @@ -86,7 +91,7 @@ private Builder() { /// @throws VortexException if any of the decoder's ids is already registered public Builder register(LayoutDecoder decoder) { for (LayoutId id : decoder.layoutIds()) { - LayoutDecoder old = decoders.put(id.id(), decoder); + LayoutDecoder old = decoders.put(id, decoder); if (old != null) { throw new VortexException("layout decoder %s already registered".formatted(id)); } From 8012c13d9ef75bffca97b9e0636e9d921fa1a79a Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 4 Jul 2026 13:57:15 +0200 Subject: [PATCH 12/12] docs: move ADRs from docs/adr to top-level adr/ Relative links rewritten: docs pages point at ../adr/, ADR upward references drop one level, ADR links into docs/ gain the prefix. 
Co-Authored-By: Claude Fable 5 --- CHANGELOG.md | 8 ++++---- TODO.md | 16 ++++++++-------- .../0001-split-read-and-write-runtimes.md | 6 +++--- .../0002-pluggable-dtype-layout-compute.md | 6 +++--- .../0003-vortex-exception-sanitization.md | 6 +++--- .../0004-resource-caps-read-options.md | 6 +++--- {docs/adr => adr}/0005-vector-api-adoption.md | 8 ++++---- {docs/adr => adr}/0006-benchmark-publishing.md | 6 +++--- {docs/adr => adr}/0007-pco-encode.md | 0 .../0008-domain-primitives-unsigned-integers.md | 6 +++--- {docs/adr => adr}/0009-write-api-ergonomics.md | 0 {docs/adr => adr}/0010-lazy-decode.md | 4 ++-- .../0011-writer-memorysegment-overload.md | 0 .../0012-zero-copy-layout-decoding.md | 2 +- {docs/adr => adr}/0013-compute-primitives.md | 0 .../0014-variant-encoding-strategy.md | 0 .../0015-drop-materialized-fallbacks.md | 2 +- {docs/adr => adr}/0016-vortex-arrow-bridge.md | 4 ++-- .../0017-in-house-flatbuffers-codegen.md | 0 {docs/adr => adr}/0018-calcite-sql-adapter.md | 0 .../0019-columnar-transducer-facade.md | 0 {docs/adr => adr}/ADR.md | 0 {docs/adr => adr}/template.md | 0 .../io/github/dfa1/vortex/core/io/IoBounds.java | 2 +- docs/compatibility.md | 4 ++-- 25 files changed, 43 insertions(+), 43 deletions(-) rename {docs/adr => adr}/0001-split-read-and-write-runtimes.md (99%) rename {docs/adr => adr}/0002-pluggable-dtype-layout-compute.md (98%) rename {docs/adr => adr}/0003-vortex-exception-sanitization.md (99%) rename {docs/adr => adr}/0004-resource-caps-read-options.md (97%) rename {docs/adr => adr}/0005-vector-api-adoption.md (96%) rename {docs/adr => adr}/0006-benchmark-publishing.md (97%) rename {docs/adr => adr}/0007-pco-encode.md (100%) rename {docs/adr => adr}/0008-domain-primitives-unsigned-integers.md (98%) rename {docs/adr => adr}/0009-write-api-ergonomics.md (100%) rename {docs/adr => adr}/0010-lazy-decode.md (99%) rename {docs/adr => adr}/0011-writer-memorysegment-overload.md (100%) rename {docs/adr => adr}/0012-zero-copy-layout-decoding.md 
(99%) rename {docs/adr => adr}/0013-compute-primitives.md (100%) rename {docs/adr => adr}/0014-variant-encoding-strategy.md (100%) rename {docs/adr => adr}/0015-drop-materialized-fallbacks.md (98%) rename {docs/adr => adr}/0016-vortex-arrow-bridge.md (98%) rename {docs/adr => adr}/0017-in-house-flatbuffers-codegen.md (100%) rename {docs/adr => adr}/0018-calcite-sql-adapter.md (100%) rename {docs/adr => adr}/0019-columnar-transducer-facade.md (100%) rename {docs/adr => adr}/ADR.md (100%) rename {docs/adr => adr}/template.md (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f908912..90385150 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -170,7 +170,7 @@ A hardening release: no new file-format capability, but a large step up in verif - Zero-warning rule: `-Xlint:all -Werror` across all modules. The `classfile` lint (which only flags missing annotation class files inside third-party Arrow bytecode) is scoped off in the two Arrow-using modules only. ([dab467e5](https://github.com/dfa1/vortex-java/commit/dab467e5), [43f6f840](https://github.com/dfa1/vortex-java/commit/43f6f840)) - Mutation testing (PIT): opt-in `pitest` profiles in core, reader, and writer, scoped to the bounds/parse classes (`IoBounds`, `PTypeIO`, `WriteRegistry`, `ChunkImpl`, …), with common config hoisted into the parent POM. ([46904b24](https://github.com/dfa1/vortex-java/commit/46904b24), [ed8c98a1](https://github.com/dfa1/vortex-java/commit/ed8c98a1), [1200c76b](https://github.com/dfa1/vortex-java/commit/1200c76b), [840cc46a](https://github.com/dfa1/vortex-java/commit/840cc46a)) -- SonarCloud: generated `fbs/` and `proto/` sources excluded from analysis (machine output, not hand-maintained); the deliberate per-width SIMD-loop duplication is documented in [ADR 0005](docs/adr/0005-vector-api-adoption.md) rather than refactored away. Code smells dropped 857→394; coverage ~81%, all ratings A, zero bugs/vulnerabilities. 
([6c591293](https://github.com/dfa1/vortex-java/commit/6c591293)) +- SonarCloud: generated `fbs/` and `proto/` sources excluded from analysis (machine output, not hand-maintained); the deliberate per-width SIMD-loop duplication is documented in [ADR 0005](adr/0005-vector-api-adoption.md) rather than refactored away. Code smells dropped 857→394; coverage ~81%, all ratings A, zero bugs/vulnerabilities. ([6c591293](https://github.com/dfa1/vortex-java/commit/6c591293)) ### Tests @@ -185,7 +185,7 @@ Read and write Vortex Variant (semi-structured, JSON-shaped) columns from Java. ### Added -- Writer: `vortex.variant` encoder. Encodes a variant column as the canonical `vortex.variant` container over `core_storage` — an all-equal column becomes a single `vortex.constant`, a row-varying column a `vortex.chunked` of per-run constants — with an optional row-aligned typed `shredded` child recorded in `VariantMetadata.shredded_dtype`. Input is `VariantData(List)` with `.constant(n, v)` / `.shredded(...)` factories. Java↔Rust (JNI) round-trip verified for constant, row-varying, and shredded columns. Scalar values only — arbitrary nested objects need `vortex.parquet.variant` (deferred, [ADR 0014](docs/adr/0014-variant-encoding-strategy.md)). ([35da529d](https://github.com/dfa1/vortex-java/commit/35da529d), [e4e44980](https://github.com/dfa1/vortex-java/commit/e4e44980), [4566dca0](https://github.com/dfa1/vortex-java/commit/4566dca0)) +- Writer: `vortex.variant` encoder. Encodes a variant column as the canonical `vortex.variant` container over `core_storage` — an all-equal column becomes a single `vortex.constant`, a row-varying column a `vortex.chunked` of per-run constants — with an optional row-aligned typed `shredded` child recorded in `VariantMetadata.shredded_dtype`. Input is `VariantData(List)` with `.constant(n, v)` / `.shredded(...)` factories. Java↔Rust (JNI) round-trip verified for constant, row-varying, and shredded columns. 
Scalar values only — arbitrary nested objects need `vortex.parquet.variant` (deferred, [ADR 0014](adr/0014-variant-encoding-strategy.md)). ([35da529d](https://github.com/dfa1/vortex-java/commit/35da529d), [e4e44980](https://github.com/dfa1/vortex-java/commit/e4e44980), [4566dca0](https://github.com/dfa1/vortex-java/commit/4566dca0)) - Reader: variant columns now decode Java-side. `ConstantEncodingDecoder` and `ChunkedEncodingDecoder` handle `DType.Variant` (materializing the inner-typed array); `VariantEncodingDecoder` wraps the result as `VariantArray`, exposing `coreStorage()` and `shredded()`. ([76e4c741](https://github.com/dfa1/vortex-java/commit/76e4c741), [4566dca0](https://github.com/dfa1/vortex-java/commit/4566dca0)) ### Security @@ -200,7 +200,7 @@ Read and write Vortex Variant (semi-structured, JSON-shaped) columns from Java. ### Changed -- Decode shape: transform encodings now decode **lazy-only**. The eager `Materialized*Array` fallbacks were removed from `vortex.zigzag` (all PTypes + broadcast, [cd59fefa](https://github.com/dfa1/vortex-java/commit/cd59fefa)), `fastlanes.for` (all integer PTypes, [d7953e1f](https://github.com/dfa1/vortex-java/commit/d7953e1f)), `vortex.alp` (broadcast-without-patches, [deab8067](https://github.com/dfa1/vortex-java/commit/deab8067)), `vortex.constant` (Decimal → `LazyConstantDecimalArray`, [a6a9611e](https://github.com/dfa1/vortex-java/commit/a6a9611e)), `vortex.runend` (Bool → `LazyRunEndBoolArray`, [0bbcb81f](https://github.com/dfa1/vortex-java/commit/0bbcb81f)), `vortex.sparse` (Bool → `LazySparseBoolArray`, [db2e955b](https://github.com/dfa1/vortex-java/commit/db2e955b)), and `fastlanes.rle` (validity → `OffsetBoolArray`, empty → `LazyConstantXxxArray`, [5e83a5c3](https://github.com/dfa1/vortex-java/commit/5e83a5c3)). 
Decompression encodings (`bitpacked`, `pco`, `zstd`, `fsst`, `delta`, `patched`), the primitive base, the `vortex.dict` encoding-level path, and the `vortex.alp` patches path stay Materialized by design. See [ADR 0015](docs/adr/0015-drop-materialized-fallbacks.md). +- Decode shape: transform encodings now decode **lazy-only**. The eager `Materialized*Array` fallbacks were removed from `vortex.zigzag` (all PTypes + broadcast, [cd59fefa](https://github.com/dfa1/vortex-java/commit/cd59fefa)), `fastlanes.for` (all integer PTypes, [d7953e1f](https://github.com/dfa1/vortex-java/commit/d7953e1f)), `vortex.alp` (broadcast-without-patches, [deab8067](https://github.com/dfa1/vortex-java/commit/deab8067)), `vortex.constant` (Decimal → `LazyConstantDecimalArray`, [a6a9611e](https://github.com/dfa1/vortex-java/commit/a6a9611e)), `vortex.runend` (Bool → `LazyRunEndBoolArray`, [0bbcb81f](https://github.com/dfa1/vortex-java/commit/0bbcb81f)), `vortex.sparse` (Bool → `LazySparseBoolArray`, [db2e955b](https://github.com/dfa1/vortex-java/commit/db2e955b)), and `fastlanes.rle` (validity → `OffsetBoolArray`, empty → `LazyConstantXxxArray`, [5e83a5c3](https://github.com/dfa1/vortex-java/commit/5e83a5c3)). Decompression encodings (`bitpacked`, `pco`, `zstd`, `fsst`, `delta`, `patched`), the primitive base, the `vortex.dict` encoding-level path, and the `vortex.alp` patches path stay Materialized by design. See [ADR 0015](adr/0015-drop-materialized-fallbacks.md). - **Breaking — sealed `Array` permits changed.** `DecimalArray` is now a `non-sealed` family interface (decimal arrays moved from `implements Array` to `implements DecimalArray`), so decimal joins the per-dtype family layer. Downstream exhaustive `switch` over `Array` must add a `case DecimalArray`. 
([a6a9611e](https://github.com/dfa1/vortex-java/commit/a6a9611e)) - **Breaking — `Array` API.** `Array.truncate(rows)` renamed to `Array.limited(rows)` and made an abstract operation implemented by every array (composites slice their children); raw-segment access moved off the `ArraySegments` utility onto `Array.materialize(SegmentAllocator)` and `Array.segmentIfPresent()`. ([87ab65e2](https://github.com/dfa1/vortex-java/commit/87ab65e2), [4d9ac1f8](https://github.com/dfa1/vortex-java/commit/4d9ac1f8), [332b067e](https://github.com/dfa1/vortex-java/commit/332b067e), [32a35e03](https://github.com/dfa1/vortex-java/commit/32a35e03)) - CSV import reports progress every 10K rows instead of per-chunk. ([07a056e7](https://github.com/dfa1/vortex-java/commit/07a056e7)) @@ -211,7 +211,7 @@ Read and write Vortex Variant (semi-structured, JSON-shaped) columns from Java. ### Documentation -- [ADR 0016](docs/adr/0016-vortex-arrow-bridge.md): captures `vortex-arrow` bridge interop options (separate module / Arrow C-Data / none); deferred until a concrete downstream need. ([a6126f29](https://github.com/dfa1/vortex-java/commit/a6126f29)) +- [ADR 0016](adr/0016-vortex-arrow-bridge.md): captures `vortex-arrow` bridge interop options (separate module / Arrow C-Data / none); deferred until a concrete downstream need. ([a6126f29](https://github.com/dfa1/vortex-java/commit/a6126f29)) ### Tests diff --git a/TODO.md b/TODO.md index 1abf9b5c..7af1028a 100644 --- a/TODO.md +++ b/TODO.md @@ -8,10 +8,10 @@ ## Performance -- [ ] **Benchmark publishing** — drop CI workflow, add `bench-publish` script; see [ADR-0006](docs/adr/0006-benchmark-publishing.md). +- [ ] **Benchmark publishing** — drop CI workflow, add `bench-publish` script; see [ADR-0006](adr/0006-benchmark-publishing.md). 
- [ ] Performance tests must be peer-reviewed - [ ] Run performance tests on other machines (I have access only to Apple M5) -- [ ] **Vector API adoption** — deferred; see [ADR-0005](docs/adr/0005-vector-api-adoption.md) for adoption criteria and candidate loops. +- [ ] **Vector API adoption** — deferred; see [ADR-0005](adr/0005-vector-api-adoption.md) for adoption criteria and candidate loops. ## Security @@ -44,7 +44,7 @@ Per-encoding gotchas: ### Resource caps -- [ ] **Implement `ResourceLimits` + `ReadOptions`** — see [ADR-0004](docs/adr/0004-resource-caps-read-options.md) for design, defaults, and enforcement points. Also covers Pco page/bin caps. +- [ ] **Implement `ResourceLimits` + `ReadOptions`** — see [ADR-0004](adr/0004-resource-caps-read-options.md) for design, defaults, and enforcement points. Also covers Pco page/bin caps. ### Fuzz infrastructure @@ -70,23 +70,23 @@ Per-encoding gotchas: ## Tooling -- [ ] Optional `vortex-arrow` bridge module for Arrow ecosystem interop — see [ADR-0016](docs/adr/0016-vortex-arrow-bridge.md) +- [ ] Optional `vortex-arrow` bridge module for Arrow ecosystem interop — see [ADR-0016](adr/0016-vortex-arrow-bridge.md) ## API - [ ] **Error messages — structural sanitization of `VortexException`** — Phase E (bounds typing via `IoBounds`) shipped; remaining is Phases A–D (the `Sanitize` - helper + `VortexError` catalog). See [ADR-0003](docs/adr/0003-vortex-exception-sanitization.md) + helper + `VortexError` catalog). See [ADR-0003](adr/0003-vortex-exception-sanitization.md) for design and phasing. - [ ] Use domain primitives (`UInt32`, `UInt64`, etc.) 
as value classes via Project Valhalla instead of raw `long`/`int` - - See [ADR-0008](docs/adr/0008-domain-primitives-unsigned-integers.md) and https://dfa1.github.io/articles/rethink-domain-primitives-with-valhalla + - See [ADR-0008](adr/0008-domain-primitives-unsigned-integers.md) and https://dfa1.github.io/articles/rethink-domain-primitives-with-valhalla - Candidates: `PType` integer kinds, buffer offsets, row indices, byte lengths - Goal: type-safety at zero cost (value class = no heap alloc, no boxing) ## Compute - [ ] **Compute primitives — encoded-domain specialization & façade** — the remaining ADR-0013 - follow-ups now the fused kernels have shipped. See [ADR-0013](docs/adr/0013-compute-primitives.md). + follow-ups now the fused kernels have shipped. See [ADR-0013](adr/0013-compute-primitives.md). Done: §4 `Predicate`; §5 `RowFilter` unified over `Predicate`; §6 zone-map aggregate push-down in both tiers — the whole-zone `ZoneReducer` fold wired into `VortexAggregatePushDownRule` (rewrites a whole-table `MIN`/`MAX`/`COUNT`/`SUM`/`AVG` to a single-row `Values`, auto-registered over a bare @@ -100,7 +100,7 @@ Per-encoding gotchas: residual leaves tested per match). Multi-fork numbers: `fusedFilteredSumDict` 762 → 38 ms/op ≈ 20×; `fusedFilteredAggregateDict` 983 → 46 ms/op ≈ 22×; `fusedFilteredAggregateMulti` (2-leaf `AND` × 2 aggregates) 2269 → 201 ms/op ≈ 11×. - Next: the columnar transducer façade — [ADR-0019](docs/adr/0019-columnar-transducer-facade.md) + Next: the columnar transducer façade — [ADR-0019](adr/0019-columnar-transducer-facade.md) drafted (Proposed): declarative column-bound stages compiled to one fused pass; the remaining measured lever is the multi-aggregate single scan (≈ 2×) plus composition ergonomics for the Calcite boundary tier; review, then implement. 
diff --git a/docs/adr/0001-split-read-and-write-runtimes.md b/adr/0001-split-read-and-write-runtimes.md similarity index 99% rename from docs/adr/0001-split-read-and-write-runtimes.md rename to adr/0001-split-read-and-write-runtimes.md index ef1b915a..3b17d8a8 100644 --- a/docs/adr/0001-split-read-and-write-runtimes.md +++ b/adr/0001-split-read-and-write-runtimes.md @@ -353,6 +353,6 @@ of CI / integration-test fallout, plus reviewer time. Not a weekend. - [PR #27 — `sec(parser): BoundedSegment + audit trail for untrusted asSlice`](https://github.com/dfa1/vortex-java/pull/27) - [Phase 1–4 commits — BoundedSegment introduction and migration](https://github.com/dfa1/vortex-java/pull/27/commits) -- [SECURITY.md — the contract this work hardens](../../SECURITY.md) -- [CLAUDE.md — current "three touch-points" rule for adding an encoding](../../CLAUDE.md) -- [TODO.md — parser hardening backlog](../../TODO.md) +- [SECURITY.md — the contract this work hardens](../SECURITY.md) +- [CLAUDE.md — current "three touch-points" rule for adding an encoding](../CLAUDE.md) +- [TODO.md — parser hardening backlog](../TODO.md) diff --git a/docs/adr/0002-pluggable-dtype-layout-compute.md b/adr/0002-pluggable-dtype-layout-compute.md similarity index 98% rename from docs/adr/0002-pluggable-dtype-layout-compute.md rename to adr/0002-pluggable-dtype-layout-compute.md index 064f786a..6660e609 100644 --- a/docs/adr/0002-pluggable-dtype-layout-compute.md +++ b/adr/0002-pluggable-dtype-layout-compute.md @@ -3,7 +3,7 @@ - **Status:** Deferred — awaiting a real downstream consumer use case - **Date:** 2026-06-11 - **Deciders:** project maintainer -- **Related:** [ADR 0001 — Split read and write runtimes](0001-split-read-and-write-runtimes.md), [TODO.md §Audit runtime pluggability vs Rust impl](../../TODO.md) +- **Related:** [ADR 0001 — Split read and write runtimes](0001-split-read-and-write-runtimes.md), [TODO.md §Audit runtime pluggability vs Rust impl](../TODO.md) ## Context @@ -197,10 +197,10 
@@ Until those four boxes can be ticked, the deferral stands. ## References -- [TODO.md §"Audit runtime pluggability vs Rust impl"](../../TODO.md) +- [TODO.md §"Audit runtime pluggability vs Rust impl"](../TODO.md) (line ~215, dated 2026-06-04) - [ADR 0001 — Split read and write runtimes out of `core`](0001-split-read-and-write-runtimes.md) -- [docs/compatibility.md — Known wire-format gaps](../compatibility.md) +- [docs/compatibility.md — Known wire-format gaps](../docs/compatibility.md) (notes `DType::Union` as one new variant Java does not yet decode) - Rust upstream: [`VortexSession`](https://docs.rs/vortex/latest/vortex/session/struct.VortexSession.html) diff --git a/docs/adr/0003-vortex-exception-sanitization.md b/adr/0003-vortex-exception-sanitization.md similarity index 99% rename from docs/adr/0003-vortex-exception-sanitization.md rename to adr/0003-vortex-exception-sanitization.md index 20d6bbf5..889c137c 100644 --- a/docs/adr/0003-vortex-exception-sanitization.md +++ b/adr/0003-vortex-exception-sanitization.md @@ -5,7 +5,7 @@ - **Deciders:** project maintainer - **Related:** [ADR 0001 — Split read and write runtimes](0001-split-read-and-write-runtimes.md), [ADR 0004 — Resource caps and `ReadOptions`](0004-resource-caps-read-options.md), - [SECURITY.md](../../SECURITY.md) + [SECURITY.md](../SECURITY.md) ## Context @@ -415,7 +415,7 @@ programmatically inspect exception types, this decision can be revisited. 
## References -- [SECURITY.md — injection threat model](../../SECURITY.md) +- [SECURITY.md — injection threat model](../SECURITY.md) - [PR #27 — `BoundedSegment` + audit trail for untrusted `asSlice`](https://github.com/dfa1/vortex-java/pull/27) - [ADR 0001 — Split read and write runtimes](0001-split-read-and-write-runtimes.md) -- [TODO.md §"Error messages — structural sanitization"](../../TODO.md) +- [TODO.md §"Error messages — structural sanitization"](../TODO.md) diff --git a/docs/adr/0004-resource-caps-read-options.md b/adr/0004-resource-caps-read-options.md similarity index 97% rename from docs/adr/0004-resource-caps-read-options.md rename to adr/0004-resource-caps-read-options.md index 36fdd356..054a65c9 100644 --- a/docs/adr/0004-resource-caps-read-options.md +++ b/adr/0004-resource-caps-read-options.md @@ -3,7 +3,7 @@ - **Status:** Accepted — implementation pending - **Date:** 2026-06-13 - **Deciders:** project maintainer -- **Related:** [SECURITY.md](../../SECURITY.md), [TODO.md §Resource caps](../../TODO.md) +- **Related:** [SECURITY.md](../SECURITY.md), [TODO.md §Resource caps](../TODO.md) ## Context @@ -167,6 +167,6 @@ a user-facing query limit, not a security cap. 
## References -- [SECURITY.md — zip-bomb and depth-bomb mitigations](../../SECURITY.md) -- [TODO.md §Resource caps](../../TODO.md) +- [SECURITY.md — zip-bomb and depth-bomb mitigations](../SECURITY.md) +- [TODO.md §Resource caps](../TODO.md) - [ADR 0003 — VortexException sanitization](0003-vortex-exception-sanitization.md) diff --git a/docs/adr/0005-vector-api-adoption.md b/adr/0005-vector-api-adoption.md similarity index 96% rename from docs/adr/0005-vector-api-adoption.md rename to adr/0005-vector-api-adoption.md index ac243a32..3f47590b 100644 --- a/docs/adr/0005-vector-api-adoption.md +++ b/adr/0005-vector-api-adoption.md @@ -3,8 +3,8 @@ - **Status:** Deferred — adoption gated on API finalization + benchmark evidence - **Date:** 2026-06-13 - **Deciders:** project maintainer -- **Related:** [CLAUDE.md §Memory model — Hot-loop rule](../../CLAUDE.md), - [TODO.md §Vector API](../../TODO.md) +- **Related:** [CLAUDE.md §Memory model — Hot-loop rule](../CLAUDE.md), + [TODO.md §Vector API](../TODO.md) ## Context @@ -150,9 +150,9 @@ loop structure. 
## References -- [CLAUDE.md §Hot-loop rule](../../CLAUDE.md) — the primary vectorization +- [CLAUDE.md §Hot-loop rule](../CLAUDE.md) — the primary vectorization strategy in use today -- [TODO.md §Vector API items](../../TODO.md) +- [TODO.md §Vector API items](../TODO.md) - JEP 469: Vector API (8th Incubator) — https://openjdk.org/jeps/469 - JMH benchmark harness: `./bench` command, `JavaVsJniReadBenchmark` diff --git a/docs/adr/0006-benchmark-publishing.md b/adr/0006-benchmark-publishing.md similarity index 97% rename from docs/adr/0006-benchmark-publishing.md rename to adr/0006-benchmark-publishing.md index 80b773ab..6a4ed3ab 100644 --- a/docs/adr/0006-benchmark-publishing.md +++ b/adr/0006-benchmark-publishing.md @@ -3,7 +3,7 @@ - **Status:** Accepted — CI workflow deleted; `bench-publish` script pending - **Date:** 2026-06-13 - **Deciders:** project maintainer -- **Related:** [TODO.md §Performance](../../TODO.md), +- **Related:** [TODO.md §Performance](../TODO.md), `.github/workflows/benchmark.yml` ## Context @@ -156,5 +156,5 @@ longer updated. 
multi-source comparison - [benchmark-action/github-action-benchmark](https://github.com/benchmark-action/github-action-benchmark) — the tool being retired -- [TODO.md §Publish benchmarks](../../TODO.md) -- [TODO.md §Publish reproducible perf artifacts](../../TODO.md) +- [TODO.md §Publish benchmarks](../TODO.md) +- [TODO.md §Publish reproducible perf artifacts](../TODO.md) diff --git a/docs/adr/0007-pco-encode.md b/adr/0007-pco-encode.md similarity index 100% rename from docs/adr/0007-pco-encode.md rename to adr/0007-pco-encode.md diff --git a/docs/adr/0008-domain-primitives-unsigned-integers.md b/adr/0008-domain-primitives-unsigned-integers.md similarity index 98% rename from docs/adr/0008-domain-primitives-unsigned-integers.md rename to adr/0008-domain-primitives-unsigned-integers.md index e765ba91..2007b858 100644 --- a/docs/adr/0008-domain-primitives-unsigned-integers.md +++ b/adr/0008-domain-primitives-unsigned-integers.md @@ -3,7 +3,7 @@ - **Status:** Proposed - **Date:** 2026-06-13 - **Deciders:** project maintainer -- **Related:** [TODO.md — domain primitives / Valhalla item](../../TODO.md), +- **Related:** [TODO.md — domain primitives / Valhalla item](../TODO.md), [ADR 0005 — Vector API adoption](0005-vector-api-adoption.md) ## Context @@ -20,7 +20,7 @@ as their widened signed counterparts (`int`, `int`, `long`, `long`) with no enforcement of the unsigned range and no indication at the call site that the value must be treated as unsigned. -The [TODO.md](../../TODO.md) item reads: +The [TODO.md](../TODO.md) item reads: > Use domain primitives (`UInt32`, `UInt64`, etc.) as value classes via > Project Valhalla instead of raw `long`/`int`. @@ -155,7 +155,7 @@ additional tooling. 
## References -- [TODO.md — domain primitives / Valhalla item](../../TODO.md) +- [TODO.md — domain primitives / Valhalla item](../TODO.md) - [Rethink Domain Primitives with Valhalla](https://dfa1.github.io/articles/rethink-domain-primitives-with-valhalla) - JEP 401: Value Classes and Objects — https://openjdk.org/jeps/401 - [ADR 0005 — Vector API adoption](0005-vector-api-adoption.md) — analogous diff --git a/docs/adr/0009-write-api-ergonomics.md b/adr/0009-write-api-ergonomics.md similarity index 100% rename from docs/adr/0009-write-api-ergonomics.md rename to adr/0009-write-api-ergonomics.md diff --git a/docs/adr/0010-lazy-decode.md b/adr/0010-lazy-decode.md similarity index 99% rename from docs/adr/0010-lazy-decode.md rename to adr/0010-lazy-decode.md index 063980af..0aefc7d2 100644 --- a/docs/adr/0010-lazy-decode.md +++ b/adr/0010-lazy-decode.md @@ -8,7 +8,7 @@ - **Superseded by:** — - **Related:** [ADR 0005 — Vector API adoption](0005-vector-api-adoption.md), [ADR 0012 — Zero-copy layout decoding: lazy Chunked / Dict](0012-zero-copy-layout-decoding.md), - [CLAUDE.md §Memory model](../../CLAUDE.md) + [CLAUDE.md §Memory model](../CLAUDE.md) ## Context @@ -651,7 +651,7 @@ across the lazy storage types from ADRs 0010 and 0012. - [ADR 0005](0005-vector-api-adoption.md) — Vector API is an optimization on top of an eager loop; lazy makes most of those loops conditional, changing what is even worth vectorizing. -- [CLAUDE.md §Memory model — Encoding output allocation rule](../../CLAUDE.md) +- [CLAUDE.md §Memory model — Encoding output allocation rule](../CLAUDE.md) — current rule mandates arena allocation for decode output. Phase 1 changes this rule: lazy arrays do not allocate decode output, they hold the input. 
diff --git a/docs/adr/0011-writer-memorysegment-overload.md b/adr/0011-writer-memorysegment-overload.md similarity index 100% rename from docs/adr/0011-writer-memorysegment-overload.md rename to adr/0011-writer-memorysegment-overload.md diff --git a/docs/adr/0012-zero-copy-layout-decoding.md b/adr/0012-zero-copy-layout-decoding.md similarity index 99% rename from docs/adr/0012-zero-copy-layout-decoding.md rename to adr/0012-zero-copy-layout-decoding.md index a1e27efd..14fe4347 100644 --- a/docs/adr/0012-zero-copy-layout-decoding.md +++ b/adr/0012-zero-copy-layout-decoding.md @@ -369,7 +369,7 @@ What was **not** shipped (intentional): - [ADR 0010 — Lazy decode for 1:1 transform encodings](0010-lazy-decode.md) - [ADR 0001 — Split read and write runtimes out of core](0001-split-read-and-write-runtimes.md) -- [CLAUDE.md §Memory model](../../CLAUDE.md) — hot-loop rule, encoding output allocation rule +- [CLAUDE.md §Memory model](../CLAUDE.md) — hot-loop rule, encoding output allocation rule - Rust reference: - `vortex-array/src/arrays/chunked/array.rs` — ChunkedArray storage and `find_chunk_idx` - `vortex-array/src/arrays/dict/array.rs` — DictArray storage diff --git a/docs/adr/0013-compute-primitives.md b/adr/0013-compute-primitives.md similarity index 100% rename from docs/adr/0013-compute-primitives.md rename to adr/0013-compute-primitives.md diff --git a/docs/adr/0014-variant-encoding-strategy.md b/adr/0014-variant-encoding-strategy.md similarity index 100% rename from docs/adr/0014-variant-encoding-strategy.md rename to adr/0014-variant-encoding-strategy.md diff --git a/docs/adr/0015-drop-materialized-fallbacks.md b/adr/0015-drop-materialized-fallbacks.md similarity index 98% rename from docs/adr/0015-drop-materialized-fallbacks.md rename to adr/0015-drop-materialized-fallbacks.md index ebc84af6..053fc8d6 100644 --- a/docs/adr/0015-drop-materialized-fallbacks.md +++ b/adr/0015-drop-materialized-fallbacks.md @@ -51,7 +51,7 @@ A Materialized branch may be deleted 
when **all** of these hold: (`RustWritesJavaReadsIntegrationTest`, `RustJavaReaderComparisonIntegrationTest`). 3. The encoding's `Decode shape` row in - [`docs/compatibility.md`](../compatibility.md) reads `Lazy / Lazy`. + [`docs/compatibility.md`](../docs/compatibility.md) reads `Lazy / Lazy`. 4. At least one production-shaped workload has decoded the encoding via the Lazy path — the NYC Yellow Taxi fixture set is the canonical stand-in until a broader corpus exists. diff --git a/docs/adr/0016-vortex-arrow-bridge.md b/adr/0016-vortex-arrow-bridge.md similarity index 98% rename from docs/adr/0016-vortex-arrow-bridge.md rename to adr/0016-vortex-arrow-bridge.md index e20c21be..7840157a 100644 --- a/docs/adr/0016-vortex-arrow-bridge.md +++ b/adr/0016-vortex-arrow-bridge.md @@ -3,7 +3,7 @@ - **Status:** Proposed — gated on a concrete downstream interop need - **Date:** 2026-06-18 - **Deciders:** project maintainer -- **Related:** [TODO.md §Tooling](../../TODO.md), +- **Related:** [TODO.md §Tooling](../TODO.md), [ADR 0010 — Lazy decode](0010-lazy-decode.md), [ADR 0012 — Zero-copy layout decoding](0012-zero-copy-layout-decoding.md) @@ -182,7 +182,7 @@ replacement. 
## References -- [TODO.md §Tooling — vortex-arrow bridge](../../TODO.md) +- [TODO.md §Tooling — vortex-arrow bridge](../TODO.md) - Arrow Java (`arrow-vector`) — https://arrow.apache.org/docs/java/ - Arrow C-Data Interface — https://arrow.apache.org/docs/format/CDataInterface.html - CLAUDE.md §Code style — no `sun.misc.Unsafe` / internal JDK APIs diff --git a/docs/adr/0017-in-house-flatbuffers-codegen.md b/adr/0017-in-house-flatbuffers-codegen.md similarity index 100% rename from docs/adr/0017-in-house-flatbuffers-codegen.md rename to adr/0017-in-house-flatbuffers-codegen.md diff --git a/docs/adr/0018-calcite-sql-adapter.md b/adr/0018-calcite-sql-adapter.md similarity index 100% rename from docs/adr/0018-calcite-sql-adapter.md rename to adr/0018-calcite-sql-adapter.md diff --git a/docs/adr/0019-columnar-transducer-facade.md b/adr/0019-columnar-transducer-facade.md similarity index 100% rename from docs/adr/0019-columnar-transducer-facade.md rename to adr/0019-columnar-transducer-facade.md diff --git a/docs/adr/ADR.md b/adr/ADR.md similarity index 100% rename from docs/adr/ADR.md rename to adr/ADR.md diff --git a/docs/adr/template.md b/adr/template.md similarity index 100% rename from docs/adr/template.md rename to adr/template.md diff --git a/core/src/main/java/io/github/dfa1/vortex/core/io/IoBounds.java b/core/src/main/java/io/github/dfa1/vortex/core/io/IoBounds.java index ba1d98b1..cd081262 100644 --- a/core/src/main/java/io/github/dfa1/vortex/core/io/IoBounds.java +++ b/core/src/main/java/io/github/dfa1/vortex/core/io/IoBounds.java @@ -18,7 +18,7 @@ /// misuse and should throw [IndexOutOfBoundsException] via /// [java.util.Objects#checkIndex(long, long)], not a `VortexException`. /// -/// See ADR 0003 (`docs/adr/0003-vortex-exception-sanitization.md`). +/// See ADR 0003 (`adr/0003-vortex-exception-sanitization.md`). 
public final class IoBounds { private IoBounds() { diff --git a/docs/compatibility.md b/docs/compatibility.md index f989fb76..c29ef0df 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -33,7 +33,7 @@ resolves only the standalone decoders in `reader`; no encoder class is loaded. |------|------------|-------------| | `DType::Union` (`fbs.DType.Type.Union = 12`) | Rust 0.71.0 | ❌ Decode throws `VortexException("unsupported DType typeType=12")`. No `DType.Union` variant in Java's sealed type. | | `vortex.onpair` experimental string encoding | Rust 0.74.0 | ❌ Not registered. Files using it fail to decode unless `Registry.allowUnknown()` is enabled. | -| `vortex.variant` arbitrary nested objects | Rust (`vortex.parquet.variant`) | ⚠️ Java encodes/decodes variant columns of **typed scalar** values (constant / chunked-of-constants core, optional shredded child); Java↔Rust round-trip verified. Arbitrary nested JSON objects and real path-based shredding need the `vortex.parquet.variant` physical encoding — deferred ([ADR 0014](adr/0014-variant-encoding-strategy.md)). | +| `vortex.variant` arbitrary nested objects | Rust (`vortex.parquet.variant`) | ⚠️ Java encodes/decodes variant columns of **typed scalar** values (constant / chunked-of-constants core, optional shredded child); Java↔Rust round-trip verified. Arbitrary nested JSON objects and real path-based shredding need the `vortex.parquet.variant` physical encoding — deferred ([ADR 0014](../adr/0014-variant-encoding-strategy.md)). | | Arrow extension array import affecting Variant shape | Rust 0.74.0 (#8125) | Untested. Re-run integration fixtures against v0.74.0 once published. | ## Encodings @@ -77,7 +77,7 @@ resolves only the standalone decoders in `reader`; no encoder class is loaded. 
### Decode shape -Per [ADR 0010](adr/0010-lazy-decode.md) and [ADR 0012](adr/0012-zero-copy-layout-decoding.md), each +Per [ADR 0010](../adr/0010-lazy-decode.md) and [ADR 0012](../adr/0012-zero-copy-layout-decoding.md), each decoder falls into one of three shapes: - **Zero-copy** — output is a view over the memory-mapped file (or a wrapper over child arrays).