Skip to content

Commit 51e4ab0

Browse files
authored
HIVE-29547: Iceberg: [V3] Handle nested and missing Struct while using defaults (#6409)
1 parent f3496f3 commit 51e4ab0

10 files changed

Lines changed: 222 additions & 81 deletions

File tree

iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ Type convertType(TypeInfo typeInfo, String defaultValue) {
163163
StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
164164
List<Types.NestedField> fields =
165165
convertInternal(structTypeInfo.getAllStructFieldNames(), structTypeInfo.getAllStructFieldTypeInfos(),
166-
HiveSchemaUtil.getDefaultValuesMap(defaultValue), Collections.emptyList());
166+
HiveSchemaUtil.getDefaultValuesMap(null, defaultValue), Collections.emptyList());
167167
return Types.StructType.of(fields);
168168
case MAP:
169169
MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo;

iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java

Lines changed: 80 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -25,18 +25,17 @@
2525
import java.util.Map;
2626
import java.util.Objects;
2727
import java.util.Optional;
28-
import java.util.Set;
2928
import java.util.stream.Collectors;
30-
import org.apache.commons.lang3.StringUtils;
3129
import org.apache.hadoop.hive.metastore.api.FieldSchema;
3230
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
3331
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
32+
import org.apache.hive.iceberg.com.fasterxml.jackson.core.type.TypeReference;
33+
import org.apache.hive.iceberg.com.fasterxml.jackson.databind.ObjectMapper;
3434
import org.apache.iceberg.PartitionSpec;
3535
import org.apache.iceberg.Schema;
3636
import org.apache.iceberg.data.GenericRecord;
3737
import org.apache.iceberg.data.Record;
3838
import org.apache.iceberg.expressions.Literal;
39-
import org.apache.iceberg.relocated.com.google.common.base.Splitter;
4039
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
4140
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
4241
import org.apache.iceberg.types.Conversions;
@@ -48,6 +47,8 @@
4847

4948
public final class HiveSchemaUtil {
5049

50+
private static final ObjectMapper MAPPER = new ObjectMapper();
51+
5152
private HiveSchemaUtil() {
5253
}
5354

@@ -237,7 +238,7 @@ private static void getDefaultValDiff(Types.NestedField field, Map<String, Strin
237238
difference.addDefaultChanged(field, expectedDefault);
238239
}
239240
} else if (field.type().isStructType()) {
240-
Map<String, String> structDefaults = getDefaultValuesMap(defaultStr);
241+
Map<String, String> structDefaults = getDefaultValuesMap(field.type().asStructType().fields(), defaultStr);
241242

242243
for (Types.NestedField nested : field.type().asStructType().fields()) {
243244
getDefaultValDiff(nested, structDefaults, difference);
@@ -413,31 +414,48 @@ public static String convertToTypeString(Type type) {
413414
}
414415
}
415416

416-
public static void setDefaultValues(Record record, List<Types.NestedField> fields, Set<String> missingColumns) {
417-
for (Types.NestedField field : fields) {
418-
Object fieldValue = record.getField(field.name());
417+
public static void setDefaultValues(Record record, List<Types.NestedField> missingFields) {
418+
for (Types.NestedField field : missingFields) {
419+
if (field.type().isStructType()) {
420+
// Attempt to build the nested struct with its defaults
421+
Record nestedRecord = buildStructWithDefaults(field.type().asStructType());
422+
if (nestedRecord != null) {
423+
record.setField(field.name(), nestedRecord);
424+
}
425+
} else if (field.writeDefault() != null) {
426+
Object defaultValue = convertToWriteType(field.writeDefault(), field.type());
427+
record.setField(field.name(), defaultValue);
428+
}
429+
}
430+
}
419431

420-
if (fieldValue == null) {
421-
boolean isMissing = missingColumns.contains(field.name());
422-
423-
if (isMissing) {
424-
if (field.type().isStructType()) {
425-
// Create struct and apply defaults to all nested fields
426-
Record nestedRecord = GenericRecord.create(field.type().asStructType());
427-
record.setField(field.name(), nestedRecord);
428-
// For nested fields, we consider ALL fields as "missing" to apply defaults
429-
setDefaultValuesForNestedStruct(nestedRecord, field.type().asStructType().fields());
430-
} else if (field.writeDefault() != null) {
431-
Object defaultValue = convertToWriteType(field.writeDefault(), field.type());
432-
record.setField(field.name(), defaultValue);
433-
}
432+
/**
433+
* Recursively builds a struct populated with write defaults.
434+
*
* @return A populated Record, or null if no nested fields have defaults.
435+
*/
436+
private static Record buildStructWithDefaults(Types.StructType structType) {
437+
Record nestedRecord = GenericRecord.create(structType);
438+
boolean hasAnyDefault = false;
439+
440+
for (Types.NestedField field : structType.fields()) {
441+
if (field.writeDefault() != null) {
442+
Object defaultValue = convertToWriteType(field.writeDefault(), field.type());
443+
nestedRecord.setField(field.name(), defaultValue);
444+
hasAnyDefault = true;
445+
} else if (field.type().isStructType()) {
446+
// Recursively process deeper nested structs
447+
Record deeperRecord = buildStructWithDefaults(field.type().asStructType());
448+
449+
// If the deeper struct has defaults, attach it and flag this current struct as populated
450+
if (deeperRecord != null) {
451+
nestedRecord.setField(field.name(), deeperRecord);
452+
hasAnyDefault = true;
434453
}
435-
// Explicit NULLs remain NULL
436-
} else if (field.type().isStructType() && fieldValue instanceof Record) {
437-
// For existing structs, apply defaults to any null nested fields
438-
setDefaultValuesForNestedStruct((Record) fieldValue, field.type().asStructType().fields());
439454
}
440455
}
456+
457+
// If no fields (or nested fields) had defaults, return null to avoid an empty struct
458+
return hasAnyDefault ? nestedRecord : null;
441459
}
442460

443461
/**
@@ -483,11 +501,10 @@ private static void setDefaultValuesForNestedStruct(Record record, List<Types.Ne
483501
for (Types.NestedField field : fields) {
484502
Object fieldValue = record.getField(field.name());
485503

486-
if (fieldValue == null && field.writeDefault() != null) {
487-
// Always apply default to null fields in nested structs
504+
if (field.writeDefault() != null) {
488505
Object defaultValue = convertToWriteType(field.writeDefault(), field.type());
489506
record.setField(field.name(), defaultValue);
490-
} else if (field.type().isStructType() && fieldValue instanceof Record) {
507+
} else if (field.type().isStructType()) {
491508
// Recursively process nested structs
492509
setDefaultValuesForNestedStruct((Record) fieldValue, field.type().asStructType().fields());
493510
}
@@ -532,12 +549,43 @@ public static Object convertToWriteType(Object value, Type type) {
532549
return value; // fallback
533550
}
534551

535-
public static Map<String, String> getDefaultValuesMap(String defaultValue) {
536-
if (StringUtils.isEmpty(defaultValue)) {
552+
public static Map<String, String> getDefaultValuesMap(List<Types.NestedField> fields, String json) {
553+
if (json == null || json.trim().isEmpty()) {
537554
return Collections.emptyMap();
538555
}
539-
// For Struct, the default value is expected to be in key:value format
540-
return Splitter.on(',').trimResults().withKeyValueSeparator(':').split(stripQuotes(defaultValue));
556+
// 1. Strip the surrounding single or double quotes passed by Hive
557+
String cleanJson = stripQuotes(json);
558+
559+
// 2. Check for NULL on the cleaned string
560+
if (cleanJson.equalsIgnoreCase("NULL")) {
561+
if (fields == null || fields.isEmpty()) {
562+
return Collections.emptyMap();
563+
}
564+
return fields.stream().collect(Collectors.toMap(Types.NestedField::name, f -> "NULL"));
565+
}
566+
567+
Map<String, Object> parsed;
568+
Map<String, String> result = Maps.newHashMap();
569+
try {
570+
// 3. Pass the cleaned JSON to the mapper
571+
parsed = MAPPER.readValue(cleanJson, new TypeReference<>() {
572+
});
573+
574+
for (Map.Entry<String, Object> entry : parsed.entrySet()) {
575+
Object value = entry.getValue();
576+
577+
if (value == null) {
578+
result.put(entry.getKey(), "NULL");
579+
} else if (value instanceof Map || value instanceof List) {
580+
result.put(entry.getKey(), MAPPER.writeValueAsString(value));
581+
} else {
582+
result.put(entry.getKey(), value.toString());
583+
}
584+
}
585+
} catch (Exception e) {
586+
throw new IllegalArgumentException("Invalid default values JSON: " + json, e);
587+
}
588+
return result;
541589
}
542590

543591
public static String stripQuotes(String val) {

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -890,7 +890,8 @@ private void handleDefaultValues(Map<String, String> defaultValues, Map<String,
890890
Type fieldType =
891891
columns.stream().filter(col -> col.name().equalsIgnoreCase(simpleName)).findFirst().get().type();
892892
if (fieldType.isStructType()) {
893-
Map<String, String> structDefaults = HiveSchemaUtil.getDefaultValuesMap(field.getValue());
893+
Map<String, String> structDefaults =
894+
HiveSchemaUtil.getDefaultValuesMap(fieldType.asStructType().fields(), field.getValue());
894895
handleDefaultValues(structDefaults, renameMapping, fieldType.asStructType().fields(), qualifiedName + ".");
895896
} else {
896897
Object defaultValue = HiveSchemaUtil.getDefaultValue(field.getValue(), fieldType);

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergDefaultWriter.java

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,7 @@
4242
abstract class HiveIcebergDefaultWriter extends HiveIcebergWriterBase {
4343

4444
private final int currentSpecId;
45-
private final Set<String> missingColumns;
46-
private final List<NestedField> missingOrStructFields;
45+
private final List<NestedField> missingFields;
4746

4847
HiveIcebergDefaultWriter(
4948
Table table,
@@ -54,12 +53,12 @@ abstract class HiveIcebergDefaultWriter extends HiveIcebergWriterBase {
5453

5554
Schema schema = table.schema();
5655
this.currentSpecId = table.spec().specId();
57-
this.missingColumns =
56+
Set<String> missingColumns =
5857
context != null && context.missingColumns() != null ? context.missingColumns() : Set.of();
5958

60-
this.missingOrStructFields =
59+
this.missingFields =
6160
schema.columns().stream()
62-
.filter(field -> missingColumns.contains(field.name()) || field.type().isStructType())
61+
.filter(field -> missingColumns.contains(field.name()))
6362
.toList();
6463
}
6564

@@ -76,7 +75,7 @@ private static PartitioningWriter<Record, DataWriteResult> newDataWriterWithVari
7675
}
7776

7877
protected final void applyDefaultValues(Record record) {
79-
HiveSchemaUtil.setDefaultValues(record, missingOrStructFields, missingColumns);
78+
HiveSchemaUtil.setDefaultValues(record, missingFields);
8079
}
8180

8281
protected final void write(Record record) {

iceberg/iceberg-handler/src/test/queries/positive/iceberg_alter_default_column.q

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ TBLPROPERTIES ('format-version'='3');
55

66
INSERT INTO ice_t (id) VALUES (1);
77

8-
ALTER TABLE ice_t ADD COLUMNS (point STRUCT<x:INT, y:INT> DEFAULT 'x:100,y:99',
8+
ALTER TABLE ice_t ADD COLUMNS (point STRUCT<x:INT, y:INT> DEFAULT '{"x":100,"y":99}',
99
name STRING DEFAULT 'unknown',
1010
age INT DEFAULT 25,
1111
salary DOUBLE DEFAULT 50000.0,
@@ -20,7 +20,7 @@ INSERT INTO ice_t (id) VALUES (2);
2020
SELECT * FROM ice_t ORDER BY id;
2121

2222
ALTER TABLE ice_t REPLACE COLUMNS (id INT,
23-
point STRUCT<x:INT, y:INT> DEFAULT 'x:100,y:99',
23+
point STRUCT<x:INT, y:INT> DEFAULT '{"x":100,"y":99}',
2424
name STRING DEFAULT 'unknown',
2525
age INT DEFAULT 25,
2626
salary DOUBLE DEFAULT 50000.0,
@@ -32,7 +32,7 @@ ALTER TABLE ice_t REPLACE COLUMNS (id INT,
3232
SELECT * FROM ice_t ORDER BY id;
3333

3434
-- change default of a field of Struct column
35-
ALTER TABLE ice_t CHANGE COLUMN point point STRUCT<x:INT, y:INT> DEFAULT 'x:100,y:88';
35+
ALTER TABLE ice_t CHANGE COLUMN point point STRUCT<x:INT, y:INT> DEFAULT '{"x":100,"y":88}';
3636

3737
-- rename and change default value of age column
3838
ALTER TABLE ice_t CHANGE COLUMN age age_new int DEFAULT 21;
@@ -42,7 +42,7 @@ INSERT INTO ice_t (id) VALUES (3);
4242
SELECT * FROM ice_t ORDER BY id;
4343

4444
-- Rename the struct column with default changes
45-
ALTER TABLE ice_t CHANGE COLUMN point point_new STRUCT<x:INT, y:INT> DEFAULT 'x:55,y:88';
45+
ALTER TABLE ice_t CHANGE COLUMN point point_new STRUCT<x:INT, y:INT> DEFAULT '{"x":55,"y":88}';
4646

4747
INSERT INTO ice_t (id) VALUES (4);
4848

iceberg/iceberg-handler/src/test/queries/positive/iceberg_default_column.q

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
CREATE TABLE t3 (
22
id INT,
3-
point STRUCT<x:INT, y:INT> DEFAULT 'x:100,y:99',
3+
point STRUCT<x:INT, y:INT> DEFAULT '{"x":100,"y":99}',
44
name STRING DEFAULT 'unknown',
55
age INT DEFAULT 25,
66
salary DOUBLE DEFAULT 50000.0,
@@ -13,7 +13,7 @@ CREATE TABLE t3 (
1313
STORED BY ICEBERG
1414
TBLPROPERTIES ('format-version'='3');
1515

16-
-- Case 1: Partial struct with explicit null field
16+
-- Case 1: Partial struct with explicit null field, the null should be preserved
1717
INSERT INTO t3 (id, point) VALUES (2, named_struct('x', CAST(null AS INT), 'y', 7));
1818

1919
-- Case 2: Only ID specified (all defaults should apply)
@@ -25,13 +25,38 @@ INSERT INTO t3 (id, name) VALUES (4, NULL);
2525
-- Case 4: Mixed scenario - some fields provided, some missing
2626
INSERT INTO t3 (id, name, age) VALUES (5, 'custom_name', 30);
2727

28-
-- Case 5: Complex struct with nested nulls
28+
-- Case 5: Complex struct with nested nulls, the null should be preserved
2929
INSERT INTO t3 (id, point) VALUES (6, named_struct('x', CAST(null AS INT), 'y', CAST(null AS INT)));
3030

3131
-- Case 6: ALTER AND SET NULL and 'null' string
3232
ALTER TABLE t3 CHANGE COLUMN age age int DEFAULT null;
3333
ALTER TABLE t3 CHANGE COLUMN name name string DEFAULT 'null';
34+
ALTER TABLE t3 CHANGE COLUMN point point STRUCT<x:INT, y:INT> DEFAULT null;
3435

3536
INSERT INTO t3 (id) VALUES (7);
3637

38+
-- Case 7: Add a nested struct field with default value
39+
ALTER TABLE t3 ADD COLUMNS (person STRUCT<
40+
name: STRING,
41+
address: STRUCT<
42+
street: STRING,
43+
city: STRING
44+
>
45+
> DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}'
46+
);
47+
48+
INSERT INTO t3 (id) VALUES (8);
49+
-- Partial struct with explicit null field, the null should be preserved
50+
INSERT INTO t3 (id, person)
51+
VALUES (
52+
9,
53+
named_struct(
54+
'name', CAST(NULL AS STRING),
55+
'address', named_struct(
56+
'street', CAST(NULL AS STRING),
57+
'city', 'Bangalore'
58+
)
59+
)
60+
);
61+
3762
SELECT * FROM t3 ORDER BY id;

iceberg/iceberg-handler/src/test/queries/positive/iceberg_initial_default.q

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ TBLPROPERTIES ('format-version'='3');
55

66
INSERT INTO ice_parq (id) VALUES (1);
77

8-
ALTER TABLE ice_parq ADD COLUMNS (point STRUCT<x:INT, y:INT> DEFAULT 'x:100,y:99',
8+
ALTER TABLE ice_parq ADD COLUMNS (point STRUCT<x:INT, y:INT> DEFAULT '{"x":100,"y":99}',
99
name STRING DEFAULT 'unknown',
1010
age INT DEFAULT 25,
1111
salary DOUBLE DEFAULT 50000.0,
@@ -20,7 +20,7 @@ INSERT INTO ice_parq (id) VALUES (2);
2020
SELECT * FROM ice_parq ORDER BY id;
2121

2222
-- change default of a field of Struct column
23-
ALTER TABLE ice_parq CHANGE COLUMN point point STRUCT<x:INT, y:INT> DEFAULT 'x:100,y:88';
23+
ALTER TABLE ice_parq CHANGE COLUMN point point STRUCT<x:INT, y:INT> DEFAULT '{"x":100,"y":88}';
2424

2525
-- rename and change default value of age column
2626
ALTER TABLE ice_parq CHANGE COLUMN age age_new int DEFAULT 21;
@@ -37,7 +37,7 @@ TBLPROPERTIES ('format-version'='3');
3737

3838
INSERT INTO ice_avro (id) VALUES (1);
3939

40-
ALTER TABLE ice_avro ADD COLUMNS (point STRUCT<x:INT, y:INT> DEFAULT 'x:100,y:99',
40+
ALTER TABLE ice_avro ADD COLUMNS (point STRUCT<x:INT, y:INT> DEFAULT '{"x":100,"y":99}',
4141
name STRING DEFAULT 'unknown',
4242
age INT DEFAULT 25,
4343
salary DOUBLE DEFAULT 50000.0,
@@ -52,7 +52,7 @@ INSERT INTO ice_avro (id) VALUES (2);
5252
SELECT * FROM ice_avro ORDER BY id;
5353

5454
-- change default of a field of Struct column
55-
ALTER TABLE ice_avro CHANGE COLUMN point point STRUCT<x:INT, y:INT> DEFAULT 'x:100,y:88';
55+
ALTER TABLE ice_avro CHANGE COLUMN point point STRUCT<x:INT, y:INT> DEFAULT '{"x":100,"y":88}';
5656

5757
-- rename and change default value of age column
5858
ALTER TABLE ice_avro CHANGE COLUMN age age_new int DEFAULT 21;
@@ -69,7 +69,7 @@ TBLPROPERTIES ('format-version'='3');
6969

7070
INSERT INTO ice_orc (id) VALUES (1);
7171

72-
ALTER TABLE ice_orc ADD COLUMNS (point STRUCT<x:INT, y:INT> DEFAULT 'x:100,y:99',
72+
ALTER TABLE ice_orc ADD COLUMNS (point STRUCT<x:INT, y:INT> DEFAULT '{"x":100,"y":99}',
7373
name STRING DEFAULT 'unknown',
7474
age INT DEFAULT 25,
7575
salary DOUBLE DEFAULT 50000.0,
@@ -84,7 +84,7 @@ INSERT INTO ice_orc (id) VALUES (2);
8484
SELECT * FROM ice_orc ORDER BY id;
8585

8686
-- change default of a field of Struct column
87-
ALTER TABLE ice_orc CHANGE COLUMN point point STRUCT<x:INT, y:INT> DEFAULT 'x:100,y:88';
87+
ALTER TABLE ice_orc CHANGE COLUMN point point STRUCT<x:INT, y:INT> DEFAULT '{"x":100,"y":88}';
8888

8989
-- rename and change default value of age column
9090
ALTER TABLE ice_orc CHANGE COLUMN age age_new int DEFAULT 21;

0 commit comments

Comments
 (0)