|
31 | 31 | import java.nio.charset.StandardCharsets; |
32 | 32 | import java.util.ArrayList; |
33 | 33 | import java.util.Collections; |
34 | | -import java.util.HashMap; |
35 | 34 | import java.util.Iterator; |
36 | 35 | import java.util.LinkedList; |
37 | 36 | import java.util.List; |
|
97 | 96 | import org.apache.iceberg.Table; |
98 | 97 | import org.apache.iceberg.avro.Avro; |
99 | 98 | import org.apache.iceberg.catalog.TableIdentifier; |
100 | | -import org.apache.iceberg.data.Record; |
101 | 99 | import org.apache.iceberg.exceptions.AlreadyExistsException; |
102 | 100 | import org.apache.iceberg.exceptions.NoSuchTableException; |
103 | | -import org.apache.iceberg.io.CloseableIterable; |
104 | 101 | import org.apache.iceberg.io.InputFile; |
105 | 102 | import org.apache.iceberg.io.OutputFile; |
106 | 103 | import org.apache.iceberg.mapping.MappingUtil; |
|
111 | 108 | import org.apache.iceberg.transforms.Transform; |
112 | 109 | import org.apache.iceberg.types.Conversions; |
113 | 110 | import org.apache.iceberg.types.Type; |
114 | | -import org.apache.iceberg.types.TypeUtil; |
115 | | -import org.apache.iceberg.types.Types; |
116 | 111 | import org.apache.parquet.hadoop.ParquetFileReader; |
117 | 112 | import org.apache.parquet.hadoop.metadata.FileMetaData; |
118 | 113 | import org.apache.parquet.hadoop.metadata.ParquetMetadata; |
@@ -573,11 +568,6 @@ private String getPartitionFromFilePath(String filePath) { |
573 | 568 | * determine the partition. We also cannot fall back to a "null" partition, because that will |
574 | 569 | * also get skipped by most queries. |
575 | 570 | * |
576 | | - * <p>The Bucket partition transform is an exceptional case because it is not monotonic, meaning |
577 | | - * it's not enough to just compare the min and max values. There may be a middle value somewhere |
578 | | - * that gets hashed to a different value. For this transform, we'll need to read all the values |
579 | | - * in the column to ensure they all get transformed to the same partition value. |
580 | | - * |
581 | 571 | * <p>In these cases, we output the DataFile to the DLQ, because assigning an incorrect |
582 | 572 | * partition may lead to it being incorrectly ignored by downstream queries. |
583 | 573 | */ |
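
The removed javadoc paragraph above is the key constraint: bucket is not monotonic, so agreeing min/max stats do not guarantee a single partition. A minimal, self-contained sketch of that point, using a stand-in hash rather than Iceberg's Murmur3-based bucket implementation:

```java
public class BucketNotMonotonic {

  // Stand-in hash, NOT Iceberg's implementation (Iceberg buckets with a
  // Murmur3_32 hash and then takes the result modulo the bucket count).
  static int bucket(int value, int numBuckets) {
    int h = Integer.hashCode(value * 0x9E3779B9);
    return (h & Integer.MAX_VALUE) % numBuckets;
  }

  public static void main(String[] args) {
    int n = 8;
    // Search for a counterexample: min and max share a bucket while a
    // value strictly between them lands in a different bucket.
    for (int min = 0; min < 100; min++) {
      for (int max = min + 2; max < 100; max++) {
        if (bucket(min, n) != bucket(max, n)) {
          continue;
        }
        for (int mid = min + 1; mid < max; mid++) {
          if (bucket(mid, n) != bucket(min, n)) {
            System.out.printf(
                "min=%d and max=%d share bucket %d, but mid=%d maps to bucket %d%n",
                min, max, bucket(min, n), mid, bucket(mid, n));
            return;
          }
        }
      }
    }
  }
}
```

This is why the deleted code further down had to scan every record in the bucket-transformed columns instead of relying on footer metrics alone.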
@@ -614,22 +604,9 @@ static String getPartitionFromMetrics(Metrics metrics, InputFile inputFile, Tabl |
614 | 604 |
|
615 | 605 | PartitionKey pk = new PartitionKey(table.spec(), table.schema()); |
616 | 606 |
|
617 | | - HashMap<Integer, PartitionField> bucketPartitions = new HashMap<>(); |
| 607 | + // read metadata from footer and set partition based on min/max transformed values |
618 | 608 | for (int i = 0; i < fields.size(); i++) { |
619 | 609 | PartitionField field = fields.get(i); |
620 | | - Transform<?, ?> transform = field.transform(); |
621 | | - if (transform.toString().contains("bucket[")) { |
622 | | - bucketPartitions.put(i, field); |
623 | | - } |
624 | | - } |
625 | | - |
626 | | - // first, read only metadata for the non-bucket partition types |
627 | | - for (int i = 0; i < fields.size(); i++) { |
628 | | - PartitionField field = fields.get(i); |
629 | | - // skip bucket partitions (we will process them below) |
630 | | - if (bucketPartitions.containsKey(i)) { |
631 | | - continue; |
632 | | - } |
633 | 610 | Type type = table.schema().findType(field.sourceId()); |
634 | 611 | Transform<?, ?> transform = field.transform(); |
635 | 612 |
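
The per-field loop body that decodes and compares the bounds is elided between this hunk and the next. A hedged sketch of its assumed shape, built from the names that do appear in the surrounding context (`transformValue`, `metrics`, `pk`, and `UnknownPartitionException` are from this file; the exact flow and message are assumptions):

```java
// Assumed shape of the elided per-field body (not the exact upstream code).
ByteBuffer lowerBound = metrics.lowerBounds().get(field.sourceId());
ByteBuffer upperBound = metrics.upperBounds().get(field.sourceId());
Object lowerTransformedValue =
    transformValue(transform, type, Conversions.fromByteBuffer(type, lowerBound));
Object upperTransformedValue =
    transformValue(transform, type, Conversions.fromByteBuffer(type, upperBound));

// A monotonic transform maps the whole [min, max] range to one partition
// value only if both endpoints transform to the same value; the surviving
// pk.set(i, lowerTransformedValue) below then records it.
if (!Objects.deepEquals(lowerTransformedValue, upperTransformedValue)) {
  throw new UnknownPartitionException(
      "File contains values spanning multiple partitions for column id " + field.sourceId());
}
```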
|
@@ -658,54 +635,6 @@ static String getPartitionFromMetrics(Metrics metrics, InputFile inputFile, Tabl |
658 | 635 | pk.set(i, lowerTransformedValue); |
659 | 636 | } |
660 | 637 |
|
661 | | - // bucket transform needs extra processing (see java doc above) |
662 | | - if (!bucketPartitions.isEmpty()) { |
663 | | - // Optimize by only reading bucket-transformed columns into memory |
664 | | - org.apache.iceberg.Schema bucketCols = |
665 | | - TypeUtil.select( |
666 | | - table.schema(), |
667 | | - bucketPartitions.values().stream() |
668 | | - .map(PartitionField::sourceId) |
669 | | - .collect(Collectors.toSet())); |
670 | | - |
671 | | - // Keep one instance of the transformed value per column. Use this to |
672 | | - // compare against each record's transformed value. |
673 | | - // Values in the same column must yield the same transformed value; |
674 | | - // otherwise we cannot determine a partition from this file. |
675 | | - |
676 | | - Map<Integer, Object> transformedValues = new HashMap<>(); |
677 | | - |
678 | | - // Do a one-time read of the file and compare all bucket-transformed columns |
679 | | - try (CloseableIterable<Record> reader = ReadUtils.createReader(inputFile, bucketCols)) { |
680 | | - for (Record record : reader) { |
681 | | - for (Map.Entry<Integer, PartitionField> entry : bucketPartitions.entrySet()) { |
682 | | - int partitionIndex = entry.getKey(); |
683 | | - PartitionField partitionField = entry.getValue(); |
684 | | - Transform<?, ?> transform = partitionField.transform(); |
685 | | - Types.NestedField field = table.schema().findField(partitionField.sourceId()); |
686 | | - Object value = record.getField(field.name()); |
687 | | - |
688 | | - // set initial transformed value for this column |
689 | | - @Nullable Object transformedValue = transformedValues.get(partitionIndex); |
690 | | - Object currentTransformedValue = transformValue(transform, field.type(), value); |
691 | | - if (transformedValue == null) { |
692 | | - transformedValues.put(partitionIndex, checkStateNotNull(currentTransformedValue)); |
693 | | - continue; |
694 | | - } |
695 | | - |
696 | | - if (!Objects.deepEquals(currentTransformedValue, transformedValue)) { |
697 | | - throw new UnknownPartitionException( |
698 | | - "Found records with conflicting transformed values, for column: " |
699 | | - + field.name()); |
700 | | - } |
701 | | - } |
702 | | - } |
703 | | - } |
704 | | - |
705 | | - for (Map.Entry<Integer, Object> partitionCol : transformedValues.entrySet()) { |
706 | | - pk.set(partitionCol.getKey(), partitionCol.getValue()); |
707 | | - } |
708 | | - } |
709 | 638 | return pk.toPath(); |
710 | 639 | } |
711 | 640 | } |
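
Per the javadoc, a file whose partition cannot be determined is routed to the DLQ rather than being given a guessed (and query-invisible) partition. A hypothetical caller-side sketch, assuming the truncated signature above ends with a `Table` parameter; `appendToTable` and `sendToDlq` are made-up stand-ins for the surrounding pipeline:

```java
// Hypothetical caller: route unpartitionable files to the DLQ instead of
// guessing, since a wrong partition can hide the file from queries.
try {
  String partitionPath = getPartitionFromMetrics(metrics, inputFile, table);
  appendToTable(table, inputFile, partitionPath); // hypothetical downstream step
} catch (UnknownPartitionException e) {
  sendToDlq(inputFile, e); // hypothetical dead-letter handling
}
```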
|