From f663e0e87c2982478e47d24399c252ccc364ffd0 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 15 Jun 2026 10:34:02 -0700 Subject: [PATCH 01/22] feat(schema): represent, serialize and validate v3 column default values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First of a multi-part split of column default value support (#730) — the schema foundation the read and evolution paths build on. Purely additive; no read/write behavior change on its own. - SchemaField carries `initial-default` / `write-default` (immutable std::shared_ptr) with copy-preserving WithInitialDefault / WithWriteDefault modifiers; getters return optional. - JSON serde reads/writes `initial-default` / `write-default` via the existing single-value serialization. - Schema::Validate rejects default values below format v3 and validates they are non-null primitive literals matching the field type. - Generic schema projection maps a column missing from a data file with an initial-default to FieldProjection::Kind::kDefault. Read-path application (Parquet/Avro) and schema evolution follow in separate PRs. See #731 for the full end-to-end proof-of-concept. --- src/iceberg/json_serde.cc | 32 ++++- src/iceberg/schema.cc | 26 +++- src/iceberg/schema_field.cc | 79 ++++++++++- src/iceberg/schema_field.h | 41 +++++- src/iceberg/schema_util.cc | 6 +- .../test/resources/TableMetadataV3Valid.json | 123 ++++++++++++++++++ src/iceberg/test/schema_json_test.cc | 45 +++++++ src/iceberg/test/schema_test.cc | 56 ++++++++ src/iceberg/test/schema_util_test.cc | 53 ++++++++ 9 files changed, 451 insertions(+), 10 deletions(-) create mode 100644 src/iceberg/test/resources/TableMetadataV3Valid.json diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index c72b7da57..4552db883 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -27,6 +27,8 @@ #include #include "iceberg/constants.h" +#include "iceberg/expression/json_serde_internal.h" +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/name_mapping.h" #include "iceberg/partition_field.h" @@ -298,6 +300,15 @@ nlohmann::json ToJson(const SchemaField& field) { if (!field.doc().empty()) { json[kDoc] = field.doc(); } + // Defaults are validated to be primitive literals matching the field type, so + // single-value serialization cannot fail here. + if (field.initial_default().has_value()) { + ICEBERG_ASSIGN_OR_THROW(json[kInitialDefault], + ToJson(field.initial_default()->get())); + } + if (field.write_default().has_value()) { + ICEBERG_ASSIGN_OR_THROW(json[kWriteDefault], ToJson(field.write_default()->get())); + } return json; } @@ -310,7 +321,6 @@ nlohmann::json ToJson(const Type& type) { nlohmann::json fields_json = nlohmann::json::array(); for (const auto& field : struct_type.fields()) { fields_json.push_back(ToJson(field)); - // TODO(gangwu): add default values } json[kFields] = fields_json; return json; @@ -552,9 +562,27 @@ Result> FieldFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE(auto name, GetJsonValue(json, kName)); ICEBERG_ASSIGN_OR_RAISE(auto required, GetJsonValue(json, kRequired)); ICEBERG_ASSIGN_OR_RAISE(auto doc, GetJsonValueOrDefault(json, kDoc)); + ICEBERG_ASSIGN_OR_RAISE(std::optional initial_default_json, + GetJsonValueOptional(json, kInitialDefault)); + ICEBERG_ASSIGN_OR_RAISE(std::optional write_default_json, + GetJsonValueOptional(json, kWriteDefault)); + + std::shared_ptr initial_default; + if (initial_default_json.has_value()) { + ICEBERG_ASSIGN_OR_RAISE(Literal literal, + LiteralFromJson(*initial_default_json, type.get())); + initial_default = std::make_shared(std::move(literal)); + } + std::shared_ptr write_default; + if (write_default_json.has_value()) { + ICEBERG_ASSIGN_OR_RAISE(Literal literal, + LiteralFromJson(*write_default_json, type.get())); + write_default = std::make_shared(std::move(literal)); + } return std::make_unique(field_id, std::move(name), std::move(type), - !required, doc); + !required, doc, std::move(initial_default), + std::move(write_default)); } Result> SchemaFromJson(const nlohmann::json& json) { diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index fcac43c78..d6347251c 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -116,9 +116,15 @@ std::shared_ptr ReassignTypeIds(const std::shared_ptr& type, SchemaField ReassignField(const SchemaField& field, int32_t new_id, const Schema::GetId& get_id, Schema::IdMap& ids_to_reassigned, Schema::IdMap& ids_to_original) { - return {new_id, std::string(field.name()), + // Reassigning IDs only rewrites the field ID and nested type IDs; share the field's + // (immutable) default values rather than copying them. + return {new_id, + std::string(field.name()), ReassignTypeIds(field.type(), get_id, ids_to_reassigned, ids_to_original), - field.optional(), std::string(field.doc())}; + field.optional(), + std::string(field.doc()), + field.initial_default_ptr(), + field.write_default_ptr()}; } std::vector ReassignIds(std::vector fields, @@ -447,7 +453,21 @@ Status Schema::Validate(int32_t format_version) const { } } - // TODO(GuoTao.yu): Check default values when they are supported + // Only the initial-default is gated on format version: it changes how existing + // data files are read (rows written before the column existed materialize this + // value), so it requires the v3 reader contract. A write-default only affects + // values written going forward and does not reinterpret existing data. + if (field.initial_default().has_value() && + format_version < TableMetadata::kMinFormatVersionDefaultValues) { + return InvalidSchema( + "Invalid initial default for {}: non-null default ({}) is not supported " + "until v{}", + field.name(), field.initial_default()->get(), + TableMetadata::kMinFormatVersionDefaultValues); + } + if (field.initial_default().has_value() || field.write_default().has_value()) { + ICEBERG_RETURN_UNEXPECTED(field.Validate()); + } } return {}; diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index 206915ec2..8c3fb9be6 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -21,19 +21,26 @@ #include #include +#include +#include "iceberg/expression/literal.h" #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep +#include "iceberg/util/macros.h" namespace iceberg { SchemaField::SchemaField(int32_t field_id, std::string_view name, - std::shared_ptr type, bool optional, std::string_view doc) + std::shared_ptr type, bool optional, std::string_view doc, + std::shared_ptr initial_default, + std::shared_ptr write_default) : field_id_(field_id), name_(name), type_(std::move(type)), optional_(optional), - doc_(doc) {} + doc_(doc), + initial_default_(std::move(initial_default)), + write_default_(std::move(write_default)) {} SchemaField SchemaField::MakeOptional(int32_t field_id, std::string_view name, std::shared_ptr type, std::string_view doc) { @@ -55,6 +62,51 @@ bool SchemaField::optional() const { return optional_; } std::string_view SchemaField::doc() const { return doc_; } +std::optional> SchemaField::initial_default() + const { + if (initial_default_ == nullptr) { + return std::nullopt; + } + return std::cref(*initial_default_); +} + +std::optional> SchemaField::write_default() const { + if (write_default_ == nullptr) { + return std::nullopt; + } + return std::cref(*write_default_); +} + +const std::shared_ptr& SchemaField::initial_default_ptr() const { + return initial_default_; +} + +const std::shared_ptr& SchemaField::write_default_ptr() const { + return write_default_; +} + +namespace { + +Status ValidateDefault(const SchemaField& field, const Literal& value, + std::string_view kind) { + if (value.IsNull() || value.IsAboveMax() || value.IsBelowMin()) { + return InvalidSchema("Invalid {} value for {}: must be a non-null value", kind, + field.name()); + } + if (field.type() == nullptr || !field.type()->is_primitive()) { + return InvalidSchema( + "Invalid {} value for {}: default values are only supported for primitive types", + kind, field.name()); + } + if (*value.type() != *field.type()) { + return InvalidSchema("{} of field {} has type {} but expected {}", kind, field.name(), + *value.type(), *field.type()); + } + return {}; +} + +} // namespace + Status SchemaField::Validate() const { if (name_.empty()) [[unlikely]] { return InvalidSchema("SchemaField cannot have empty name"); @@ -62,6 +114,13 @@ Status SchemaField::Validate() const { if (type_ == nullptr) [[unlikely]] { return InvalidSchema("SchemaField cannot have null type"); } + if (initial_default_ != nullptr) { + ICEBERG_RETURN_UNEXPECTED( + ValidateDefault(*this, *initial_default_, "initial-default")); + } + if (write_default_ != nullptr) { + ICEBERG_RETURN_UNEXPECTED(ValidateDefault(*this, *write_default_, "write-default")); + } return {}; } @@ -72,9 +131,23 @@ std::string SchemaField::ToString() const { return result; } +namespace { + +bool DefaultEquals(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) { + if (lhs == nullptr || rhs == nullptr) { + return lhs == rhs; + } + return *lhs == *rhs; +} + +} // namespace + bool SchemaField::Equals(const SchemaField& other) const { return field_id_ == other.field_id_ && name_ == other.name_ && *type_ == *other.type_ && - optional_ == other.optional_; + optional_ == other.optional_ && + DefaultEquals(initial_default_, other.initial_default_) && + DefaultEquals(write_default_, other.write_default_); } } // namespace iceberg diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index fd20226a5..e90425ca7 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -24,7 +24,9 @@ /// type (e.g. a struct). #include +#include #include +#include #include #include @@ -46,8 +48,14 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { /// \param[in] type The field type. /// \param[in] optional Whether values of this field are required or nullable. /// \param[in] doc Optional documentation string for the field. + /// \param[in] initial_default The v3 `initial-default` value, or null if absent. The + /// field shares ownership of the (immutable) value. + /// \param[in] write_default The v3 `write-default` value, or null if absent. The field + /// shares ownership of the (immutable) value. SchemaField(int32_t field_id, std::string_view name, std::shared_ptr type, - bool optional, std::string_view doc = {}); + bool optional, std::string_view doc = {}, + std::shared_ptr initial_default = nullptr, + std::shared_ptr write_default = nullptr); /// \brief Construct an optional (nullable) field. static SchemaField MakeOptional(int32_t field_id, std::string_view name, @@ -71,6 +79,32 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { /// \brief Get the field documentation. std::string_view doc() const; + /// \brief Get the default value for this field used when reading rows written + /// before the field existed (v3 `initial-default`). Empty if absent. + /// + /// The returned reference is a non-owning view into a value owned by this field; + /// it remains valid for the lifetime of this SchemaField. + [[nodiscard]] std::optional> initial_default() + const; + + /// \brief Get the default value for this field used when a writer does not + /// supply a value (v3 `write-default`). Empty if absent. + /// + /// The returned reference is a non-owning view into a value owned by this field; + /// it remains valid for the lifetime of this SchemaField. + [[nodiscard]] std::optional> write_default() + const; + + /// \brief Get the shared owning pointer to the `initial-default` value, or null if + /// absent. Prefer initial_default() for reading; this exists so a rebuilt field can + /// share the (immutable) value rather than copy it. + [[nodiscard]] const std::shared_ptr& initial_default_ptr() const; + + /// \brief Get the shared owning pointer to the `write-default` value, or null if + /// absent. Prefer write_default() for reading; this exists so a rebuilt field can + /// share the (immutable) value rather than copy it. + [[nodiscard]] const std::shared_ptr& write_default_ptr() const; + [[nodiscard]] std::string ToString() const override; Status Validate() const; @@ -100,6 +134,11 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { std::shared_ptr type_; bool optional_; std::string doc_; + // Default values are owned by this field and never mutated after being set; copies + // of the field share the same payload (reference-counted) instead of deep-copying, + // like `type_` above. Sharing is unobservable because the payload is immutable. + std::shared_ptr initial_default_; + std::shared_ptr write_default_; }; } // namespace iceberg diff --git a/src/iceberg/schema_util.cc b/src/iceberg/schema_util.cc index 4ff678fc6..5ff828258 100644 --- a/src/iceberg/schema_util.cc +++ b/src/iceberg/schema_util.cc @@ -172,10 +172,14 @@ Result ProjectNested(const Type& expected_type, const Type& sou iter->second.local_index, prune_source)); } else if (MetadataColumns::IsMetadataColumn(field_id)) { child_projection.kind = FieldProjection::Kind::kMetadata; + } else if (expected_field.initial_default().has_value()) { + // Rows written before the field existed assume its `initial-default` value. + child_projection.kind = FieldProjection::Kind::kDefault; + child_projection.from = expected_field.initial_default()->get(); } else if (expected_field.optional()) { child_projection.kind = FieldProjection::Kind::kNull; } else { - // TODO(gangwu): support default value for v3 and constant value + // TODO(gangwu): support constant value return InvalidSchema("Missing required field: {}", expected_field.ToString()); } result.children.emplace_back(std::move(child_projection)); diff --git a/src/iceberg/test/resources/TableMetadataV3Valid.json b/src/iceberg/test/resources/TableMetadataV3Valid.json new file mode 100644 index 000000000..712d4b5bd --- /dev/null +++ b/src/iceberg/test/resources/TableMetadataV3Valid.json @@ -0,0 +1,123 @@ +{ + "format-version": 3, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "next-row-id": 0, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 1, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + } + ] + }, + { + "type": "struct", + "schema-id": 1, + "identifier-field-ids": [ + 1, + 2 + ], + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": 3055729675574597004, + "snapshots": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/1.avro" + }, + { + "snapshot-id": 3055729675574597004, + "parent-snapshot-id": 3051729675574597004, + "timestamp-ms": 1555100955770, + "sequence-number": 1, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/2.avro", + "schema-id": 1 + } + ], + "snapshot-log": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770 + }, + { + "snapshot-id": 3055729675574597004, + "timestamp-ms": 1555100955770 + } + ], + "metadata-log": [] +} diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index 08275a45c..71fc474f4 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -24,6 +24,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/schema.h" #include "iceberg/schema_field.h" @@ -137,6 +138,50 @@ TEST(SchemaJsonTest, RoundTrip) { ASSERT_EQ(dumped_json, json); } +TEST(SchemaJsonTest, FieldWithDefaultValuesRoundTrip) { + constexpr std::string_view json = + R"({"fields":[{"id":1,"initial-default":42,"name":"id","required":true,"type":"int","write-default":7},{"id":2,"initial-default":"n/a","name":"name","required":false,"type":"string"}],"schema-id":1,"type":"struct"})"; + + ICEBERG_UNWRAP_OR_FAIL(auto schema, SchemaFromJson(nlohmann::json::parse(json))); + ASSERT_EQ(schema->fields().size(), 2); + + const auto& field1 = schema->fields()[0]; + ASSERT_TRUE(field1.initial_default().has_value()); + ASSERT_EQ(field1.initial_default()->get(), Literal::Int(42)); + ASSERT_TRUE(field1.write_default().has_value()); + ASSERT_EQ(field1.write_default()->get(), Literal::Int(7)); + + const auto& field2 = schema->fields()[1]; + ASSERT_TRUE(field2.initial_default().has_value()); + ASSERT_EQ(field2.initial_default()->get(), Literal::String("n/a")); + ASSERT_FALSE(field2.write_default().has_value()); + + ASSERT_EQ(ToJson(*schema).dump(), json); +} + +TEST(SchemaJsonTest, FieldWithMismatchedDefaultValueFails) { + constexpr std::string_view json = + R"({"fields":[{"id":1,"initial-default":"oops","name":"id","required":true,"type":"int"}],"schema-id":1,"type":"struct"})"; + + auto result = SchemaFromJson(nlohmann::json::parse(json)); + ASSERT_FALSE(result.has_value()); +} + +TEST(SchemaJsonTest, NestedFieldWithDefaultValuesRoundTrip) { + constexpr std::string_view json = + R"({"fields":[{"id":1,"name":"person","required":true,"type":{"fields":[{"id":2,"initial-default":18,"name":"age","required":true,"type":"int","write-default":21}],"type":"struct"}}],"schema-id":1,"type":"struct"})"; + + ICEBERG_UNWRAP_OR_FAIL(auto schema, SchemaFromJson(nlohmann::json::parse(json))); + const auto& person = schema->fields()[0]; + const auto& nested = dynamic_cast(*person.type()).fields()[0]; + ASSERT_TRUE(nested.initial_default().has_value()); + ASSERT_EQ(nested.initial_default()->get(), Literal::Int(18)); + ASSERT_TRUE(nested.write_default().has_value()); + ASSERT_EQ(nested.write_default()->get(), Literal::Int(21)); + + ASSERT_EQ(ToJson(*schema).dump(), json); +} + TEST(SchemaJsonTest, UnknownFieldRoundTrip) { constexpr std::string_view json = R"({"fields":[{"id":1,"name":"mystery","required":false,"type":"unknown"}],"schema-id":1,"type":"struct"})"; diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index 8f1b20035..28f7eb9a0 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -25,6 +25,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/result.h" #include "iceberg/schema_field.h" #include "iceberg/table_metadata.h" @@ -133,6 +134,61 @@ TEST(SchemaTest, ValidateRejectsV3TypesBeforeFormatV3) { iceberg::IsOk()); } +TEST(SchemaTest, ValidateRejectsInitialDefaultBeforeFormatV3) { + iceberg::Schema schema({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, + std::make_shared(iceberg::Literal::Int(42)))}); + + auto status = schema.Validate(2); + ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); + EXPECT_THAT(status, iceberg::HasErrorMessage("is not supported until v3")); + + EXPECT_THAT(schema.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + iceberg::IsOk()); +} + +TEST(SchemaTest, ValidateDoesNotVersionGateWriteDefault) { + // A write-default does not reinterpret existing data, so it is not gated on + // format version: a write-default alone is accepted below v3. + iceberg::Schema schema({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, /*initial_default=*/nullptr, + std::make_shared(iceberg::Literal::Int(7)))}); + + EXPECT_THAT(schema.Validate(2), iceberg::IsOk()); +} + +TEST(SchemaTest, ValidateRejectsMismatchedDefaultValue) { + iceberg::Schema schema({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, /*initial_default=*/nullptr, + std::make_shared(iceberg::Literal::String("oops")))}); + + auto status = schema.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion); + ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); + EXPECT_THAT(status, iceberg::HasErrorMessage("write-default")); +} + +TEST(SchemaTest, ReassignIdsPreservesDefaultValues) { + // Reassigning field IDs rebuilds each SchemaField, so the rebuild must carry the + // default values over to the field with the new ID. + std::vector fields; + fields.push_back(iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, + std::make_shared(iceberg::Literal::Int(42)), + std::make_shared(iceberg::Literal::Int(7)))); + auto reassign_id = [](int32_t old_id) { return old_id + 1000; }; + + iceberg::Schema schema(std::move(fields), iceberg::Schema::kInitialSchemaId, + reassign_id); + + ASSERT_EQ(schema.fields().size(), 1); + const iceberg::SchemaField& field = schema.fields()[0]; + EXPECT_EQ(field.field_id(), 1001); + ASSERT_TRUE(field.initial_default().has_value()); + EXPECT_EQ(field.initial_default()->get(), iceberg::Literal::Int(42)); + ASSERT_TRUE(field.write_default().has_value()); + EXPECT_EQ(field.write_default()->get(), iceberg::Literal::Int(7)); +} + TEST(SchemaTest, ValidateRejectsInvalidUnknownFields) { iceberg::Schema required_unknown_schema( {iceberg::SchemaField(1, "mystery", iceberg::unknown(), false)}); diff --git a/src/iceberg/test/schema_util_test.cc b/src/iceberg/test/schema_util_test.cc index ee075006f..9a3eff887 100644 --- a/src/iceberg/test/schema_util_test.cc +++ b/src/iceberg/test/schema_util_test.cc @@ -24,6 +24,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/metadata_columns.h" #include "iceberg/schema.h" #include "iceberg/schema_field.h" @@ -179,6 +180,58 @@ TEST(SchemaUtilTest, ProjectMissingRequiredField) { ASSERT_THAT(projection_result, HasErrorMessage("Missing required field")); } +TEST(SchemaUtilTest, ProjectMissingRequiredFieldWithInitialDefault) { + Schema source_schema = CreateFlatSchema(); + Schema expected_schema({ + SchemaField::MakeRequired(/*field_id=*/1, "id", iceberg::int64()), + SchemaField(/*field_id=*/10, "extra", iceberg::int32(), /*optional=*/false, + /*doc=*/{}, std::make_shared(Literal::Int(42))), + }); + + auto projection_result = + Project(expected_schema, source_schema, /*prune_source=*/false); + ASSERT_THAT(projection_result, IsOk()); + + const auto& projection = *projection_result; + ASSERT_EQ(projection.fields.size(), 2); + AssertProjectedField(projection.fields[0], 0); + ASSERT_EQ(projection.fields[1].kind, FieldProjection::Kind::kDefault); + ASSERT_EQ(std::get(projection.fields[1].from), Literal::Int(42)); +} + +TEST(SchemaUtilTest, ProjectMissingOptionalFieldWithInitialDefault) { + // An optional field with an initial-default reads the default, not null. + Schema source_schema = CreateFlatSchema(); + Schema expected_schema({ + SchemaField::MakeRequired(/*field_id=*/1, "id", iceberg::int64()), + SchemaField(/*field_id=*/10, "extra", iceberg::string(), /*optional=*/true, + /*doc=*/{}, std::make_shared(Literal::String("n/a"))), + }); + + auto projection_result = + Project(expected_schema, source_schema, /*prune_source=*/false); + ASSERT_THAT(projection_result, IsOk()); + + const auto& projection = *projection_result; + ASSERT_EQ(projection.fields.size(), 2); + ASSERT_EQ(projection.fields[1].kind, FieldProjection::Kind::kDefault); + ASSERT_EQ(std::get(projection.fields[1].from), Literal::String("n/a")); +} + +TEST(SchemaUtilTest, ProjectPresentFieldIgnoresInitialDefault) { + // initial-default only applies when the field is missing from the data file. + Schema source_schema = CreateFlatSchema(); + Schema expected_schema({ + SchemaField(/*field_id=*/1, "id", iceberg::int64(), /*optional=*/false, + /*doc=*/{}, std::make_shared(Literal::Long(-1))), + }); + + auto projection_result = + Project(expected_schema, source_schema, /*prune_source=*/false); + ASSERT_THAT(projection_result, IsOk()); + AssertProjectedField(projection_result->fields[0], 0); +} + TEST(SchemaUtilTest, ProjectMetadataColumn) { Schema source_schema = CreateFlatSchema(); Schema expected_schema({ From 511d9c8e619494e57955ad06325b058527c1b2de Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 20 Jun 2026 10:20:11 -0700 Subject: [PATCH 02/22] test: remove unused TableMetadataV3Valid.json resource This file was added by this PR but is referenced by no test (table-metadata resources are loaded by explicit name via ReadTableMetadataFromResource) and carries no column defaults, so it is dead. Remove it. --- .../test/resources/TableMetadataV3Valid.json | 123 ------------------ 1 file changed, 123 deletions(-) delete mode 100644 src/iceberg/test/resources/TableMetadataV3Valid.json diff --git a/src/iceberg/test/resources/TableMetadataV3Valid.json b/src/iceberg/test/resources/TableMetadataV3Valid.json deleted file mode 100644 index 712d4b5bd..000000000 --- a/src/iceberg/test/resources/TableMetadataV3Valid.json +++ /dev/null @@ -1,123 +0,0 @@ -{ - "format-version": 3, - "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", - "location": "s3://bucket/test/location", - "last-sequence-number": 34, - "next-row-id": 0, - "last-updated-ms": 1602638573590, - "last-column-id": 3, - "current-schema-id": 1, - "schemas": [ - { - "type": "struct", - "schema-id": 0, - "fields": [ - { - "id": 1, - "name": "x", - "required": true, - "type": "long" - } - ] - }, - { - "type": "struct", - "schema-id": 1, - "identifier-field-ids": [ - 1, - 2 - ], - "fields": [ - { - "id": 1, - "name": "x", - "required": true, - "type": "long" - }, - { - "id": 2, - "name": "y", - "required": true, - "type": "long", - "doc": "comment" - }, - { - "id": 3, - "name": "z", - "required": true, - "type": "long" - } - ] - } - ], - "default-spec-id": 0, - "partition-specs": [ - { - "spec-id": 0, - "fields": [ - { - "name": "x", - "transform": "identity", - "source-id": 1, - "field-id": 1000 - } - ] - } - ], - "last-partition-id": 1000, - "default-sort-order-id": 3, - "sort-orders": [ - { - "order-id": 3, - "fields": [ - { - "transform": "identity", - "source-id": 2, - "direction": "asc", - "null-order": "nulls-first" - }, - { - "transform": "bucket[4]", - "source-id": 3, - "direction": "desc", - "null-order": "nulls-last" - } - ] - } - ], - "properties": {}, - "current-snapshot-id": 3055729675574597004, - "snapshots": [ - { - "snapshot-id": 3051729675574597004, - "timestamp-ms": 1515100955770, - "sequence-number": 0, - "summary": { - "operation": "append" - }, - "manifest-list": "s3://a/b/1.avro" - }, - { - "snapshot-id": 3055729675574597004, - "parent-snapshot-id": 3051729675574597004, - "timestamp-ms": 1555100955770, - "sequence-number": 1, - "summary": { - "operation": "append" - }, - "manifest-list": "s3://a/b/2.avro", - "schema-id": 1 - } - ], - "snapshot-log": [ - { - "snapshot-id": 3051729675574597004, - "timestamp-ms": 1515100955770 - }, - { - "snapshot-id": 3055729675574597004, - "timestamp-ms": 1555100955770 - } - ], - "metadata-log": [] -} From 9690529a959c40b4f10eacca24c2eb66cc83e2f0 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 20 Jun 2026 10:35:28 -0700 Subject: [PATCH 03/22] test(json_serde): cover default-value serde in json_serde_test Add SchemaField default-value coverage directly to json_serde_test (the binary that exercises json_serde.cc): one test pins the initial-default / write-default wire format (incl. their absence when unset), and one round-trips a field with both defaults through ToJson -> FieldFromJson for every primitive type, exercising the non-trivial date/timestamp/decimal/uuid/binary encodings the default path reuses. --- src/iceberg/test/json_serde_test.cc | 69 +++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index 0dac226e9..3a57375dc 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -23,10 +23,12 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/name_mapping.h" #include "iceberg/partition_spec.h" #include "iceberg/schema.h" +#include "iceberg/schema_field.h" #include "iceberg/snapshot.h" #include "iceberg/sort_field.h" #include "iceberg/sort_order.h" @@ -35,9 +37,11 @@ #include "iceberg/table_update.h" #include "iceberg/test/matchers.h" #include "iceberg/transform.h" +#include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" // IWYU pragma: keep #include "iceberg/util/timepoint.h" +#include "iceberg/util/uuid.h" namespace iceberg { @@ -71,6 +75,11 @@ Result> FromJsonHelper(const nlohmann::json& json) return NameMappingFromJson(json); } +template <> +Result> FromJsonHelper(const nlohmann::json& json) { + return FieldFromJson(json); +} + // Helper function to reduce duplication in testing template void TestJsonConversion(const T& obj, const nlohmann::json& expected_json) { @@ -85,6 +94,66 @@ void TestJsonConversion(const T& obj, const nlohmann::json& expected_json) { } // namespace +// Pins the wire format produced by ToJson(SchemaField) / FieldFromJson for +// `initial-default` and `write-default`, including the absence of the keys when a +// field carries no defaults. +TEST(JsonInternalTest, SchemaFieldDefaultValues) { + // Both defaults present. + SchemaField with_both(/*field_id=*/1, "id", int32(), /*optional=*/false, /*doc=*/{}, + std::make_shared(Literal::Int(42)), + std::make_shared(Literal::Int(7))); + TestJsonConversion( + with_both, + R"({"id":1,"name":"id","required":true,"type":"int","initial-default":42,"write-default":7})"_json); + + // Only an initial-default; write-default must not appear in the JSON. + SchemaField initial_only(/*field_id=*/2, "name", string(), /*optional=*/true, + /*doc=*/{}, + std::make_shared(Literal::String("n/a")), + /*write_default=*/nullptr); + TestJsonConversion( + initial_only, + R"({"id":2,"name":"name","required":false,"type":"string","initial-default":"n/a"})"_json); + + // No defaults; neither key may appear. + SchemaField no_defaults(/*field_id=*/3, "plain", int32(), /*optional=*/false); + TestJsonConversion(no_defaults, + R"({"id":3,"name":"plain","required":true,"type":"int"})"_json); +} + +// Round-trips a field carrying both defaults through ToJson -> FieldFromJson for +// every primitive type, exercising the per-type single-value serialization the +// default path reuses (date/timestamp/decimal/uuid/binary have non-trivial wire +// encodings). +TEST(JsonInternalTest, SchemaFieldDefaultValuesRoundTripAllTypes) { + ICEBERG_UNWRAP_OR_FAIL(auto uuid_value, + Uuid::FromString("f79c3e09-677c-4bbd-a479-3f349cb785e7")); + std::vector, Literal>> cases; + cases.emplace_back(boolean(), Literal::Boolean(true)); + cases.emplace_back(int32(), Literal::Int(-7)); + cases.emplace_back(int64(), Literal::Long(1234567890123LL)); + cases.emplace_back(float32(), Literal::Float(1.5f)); + cases.emplace_back(float64(), Literal::Double(2.5)); + cases.emplace_back(date(), Literal::Date(19738)); + cases.emplace_back(timestamp(), Literal::Timestamp(1719446400000000LL)); + cases.emplace_back(string(), Literal::String("hello")); + cases.emplace_back(decimal(9, 2), Literal::Decimal(12345, 9, 2)); + cases.emplace_back(fixed(3), Literal::Fixed({0x01, 0x02, 0x03})); + cases.emplace_back(binary(), Literal::Binary({0xDE, 0xAD, 0xBE, 0xEF})); + cases.emplace_back(uuid(), Literal::UUID(uuid_value)); + + int32_t field_id = 1; + for (const auto& [type, literal] : cases) { + SchemaField field(field_id++, "f", type, /*optional=*/false, /*doc=*/{}, + std::make_shared(literal), + std::make_shared(literal)); + auto json = ToJson(field); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, FieldFromJson(json)); + EXPECT_EQ(field, *parsed) + << "round-trip mismatch for type " << type->ToString() << ", json=" << json.dump(); + } +} + TEST(JsonInternalTest, SortField) { auto identity_transform = Transform::Identity(); From d15a0feefd0cf213805427966eadbb1c1b960dc7 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 20 Jun 2026 11:45:58 -0700 Subject: [PATCH 04/22] test(json_serde): fix clang-format in default-value tests --- src/iceberg/test/json_serde_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index f8023535c..416d0e520 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -150,8 +150,8 @@ TEST(JsonInternalTest, SchemaFieldDefaultValuesRoundTripAllTypes) { std::make_shared(literal)); auto json = ToJson(field); ICEBERG_UNWRAP_OR_FAIL(auto parsed, FieldFromJson(json)); - EXPECT_EQ(field, *parsed) - << "round-trip mismatch for type " << type->ToString() << ", json=" << json.dump(); + EXPECT_EQ(field, *parsed) << "round-trip mismatch for type " << type->ToString() + << ", json=" << json.dump(); } } From 180a9f969b985cfdb6ba160bab711bfe2a057d1c Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 22 Jun 2026 23:19:34 -0700 Subject: [PATCH 05/22] fix(schema): align v3 default-value validation/serde with spec and Java Addresses wgtmac's review feedback: - Enforce UTC offset for timestamptz / timestamptz_ns default values in FieldFromJson. The shared timestamp parser accepts any offset and silently normalizes to UTC, so C++ could accept default metadata Java rejects and rewrite the offset on write. Added TemporalUtils::IsUtcOffset, which reuses the existing timezone-suffix parser. - ValidateDefault now casts the default literal to the field type (Literal::CastTo) instead of requiring an exact type match, matching Java's Types.NestedField (e.g. an int default on a long field). Uncastable or out-of-range defaults are still rejected. - Documented that non-primitive (struct/list/map) defaults remain unsupported, matching Java's current model. - Extended the json_serde round-trip tests with time/timestamptz/timestamp_ns/ timestamptz_ns and a negative case for non-UTC timestamptz defaults; added a schema test for the cast-to-field-type behavior. --- build-rest-tests.sh | 35 +++++++++++++++++++++++++++++ src/iceberg/json_serde.cc | 34 ++++++++++++++++++++++++++++ src/iceberg/schema_field.cc | 17 +++++++++++--- src/iceberg/test/json_serde_test.cc | 21 +++++++++++++++++ src/iceberg/test/schema_test.cc | 17 ++++++++++++++ src/iceberg/util/temporal_util.cc | 5 +++++ src/iceberg/util/temporal_util.h | 12 ++++++++++ 7 files changed, 138 insertions(+), 3 deletions(-) create mode 100755 build-rest-tests.sh diff --git a/build-rest-tests.sh b/build-rest-tests.sh new file mode 100755 index 000000000..40ca233f4 --- /dev/null +++ b/build-rest-tests.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# +# Build and run the REST catalog unit tests (target: rest_catalog_test, which +# includes rest_json_serde_test.cc and rest_file_io_test.cc). +# +# Fast path: uses Homebrew LLVM (C++23 + libc++) and Homebrew apache-arrow, so +# Arrow is found via find_package instead of being compiled from source. +# +# One-time install: +# brew install cmake ninja llvm apache-arrow +# +set -euo pipefail +cd "$(dirname "$0")" + +LLVM_PREFIX="$(brew --prefix llvm)" +export SDKROOT="${SDKROOT:-$(xcrun --show-sdk-path)}" # help LLVM clang find the macOS SDK + +# NOTE: RelWithDebInfo (not Debug) on purpose. The vendored nanoarrow applies +# `-Werror -Wpedantic` only in Debug ($<$:...>), and Homebrew's +# Clang 22 flags nanoarrow's use of `__COUNTER__` as a C2y extension, which then +# becomes a fatal error. Non-Debug configs drop those flags. +cmake -S . -B build -G Ninja \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER="${LLVM_PREFIX}/bin/clang++" \ + -DCMAKE_C_COMPILER="${LLVM_PREFIX}/bin/clang" \ + -DCMAKE_PREFIX_PATH="$(brew --prefix apache-arrow);$(brew --prefix)" \ + -DICEBERG_BUILD_REST=ON \ + -DICEBERG_BUILD_TESTS=ON + # If the Homebrew Arrow version mismatches and fails to compile, force the + # vendored Arrow 24.0.0 build instead by adding: + # -DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER + +cmake --build build --target rest_catalog_test -j + +ctest --test-dir build -R rest_catalog_test --output-on-failure diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 45da0bd27..24fb591e1 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -51,6 +51,7 @@ #include "iceberg/util/json_util_internal.h" #include "iceberg/util/macros.h" #include "iceberg/util/string_util.h" +#include "iceberg/util/temporal_util.h" #include "iceberg/util/timepoint.h" namespace iceberg { @@ -564,6 +565,35 @@ Result> TypeFromJson(const nlohmann::json& json) { } } +namespace { + +// The spec's JSON single-value form for `timestamptz` / `timestamptz_ns` default +// values requires a UTC offset. The shared timestamp parser accepts any offset and +// silently normalizes to UTC, which would let C++ accept default metadata that Java +// rejects and then rewrite the offset on serialization. Enforce UTC for these +// defaults at parse time, where the original offset is still visible. +Status ValidateTimestamptzDefaultIsUtc(const Type& type, const nlohmann::json& value) { + const auto type_id = type.type_id(); + if (type_id != TypeId::kTimestampTz && type_id != TypeId::kTimestampTzNs) { + return {}; + } + if (!value.is_string()) { + // Let LiteralFromJson report the type mismatch. + return {}; + } + const auto str = value.get(); + ICEBERG_ASSIGN_OR_RAISE(bool is_utc, TemporalUtils::IsUtcOffset(str)); + if (!is_utc) { + return JsonParseError( + "Invalid timestamptz default '{}' for {}: default values must use UTC " + "(offset 'Z' or '+00:00')", + str, type.ToString()); + } + return {}; +} + +} // namespace + Result> FieldFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE( auto type, GetJsonValue(json, kType).and_then(TypeFromJson)); @@ -578,12 +608,16 @@ Result> FieldFromJson(const nlohmann::json& json) { std::shared_ptr initial_default; if (initial_default_json.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + ValidateTimestamptzDefaultIsUtc(*type, *initial_default_json)); ICEBERG_ASSIGN_OR_RAISE(Literal literal, LiteralFromJson(*initial_default_json, type.get())); initial_default = std::make_shared(std::move(literal)); } std::shared_ptr write_default; if (write_default_json.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + ValidateTimestamptzDefaultIsUtc(*type, *write_default_json)); ICEBERG_ASSIGN_OR_RAISE(Literal literal, LiteralFromJson(*write_default_json, type.get())); write_default = std::make_shared(std::move(literal)); diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index 8c3fb9be6..d2a7a8e4f 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -93,14 +93,25 @@ Status ValidateDefault(const SchemaField& field, const Literal& value, return InvalidSchema("Invalid {} value for {}: must be a non-null value", kind, field.name()); } + // Defaults are only supported on primitive fields. The spec also permits JSON + // single-value defaults for struct/list/map (e.g. an empty struct `{}` whose + // sub-field defaults live in field metadata); that matches the current Java model's + // gap and is left as a follow-up. if (field.type() == nullptr || !field.type()->is_primitive()) { return InvalidSchema( "Invalid {} value for {}: default values are only supported for primitive types", kind, field.name()); } - if (*value.type() != *field.type()) { - return InvalidSchema("{} of field {} has type {} but expected {}", kind, field.name(), - *value.type(), *field.type()); + // Match Java (Types.NestedField), which casts the default literal to the field type + // instead of requiring an exact type match (e.g. an int default on a long field, or + // a string default on a date/timestamp/uuid field). Reject only defaults that cannot + // be cast to the field type or fall outside its range (CastTo signals out-of-range as + // an above-max/below-min sentinel). + auto field_type = std::static_pointer_cast(field.type()); + auto cast = value.CastTo(field_type); + if (!cast.has_value() || cast->IsAboveMax() || cast->IsBelowMin()) { + return InvalidSchema("{} of field {} has type {} that cannot be cast to {}", kind, + field.name(), *value.type(), *field.type()); } return {}; } diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index 416d0e520..3951a806c 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -136,7 +136,11 @@ TEST(JsonInternalTest, SchemaFieldDefaultValuesRoundTripAllTypes) { cases.emplace_back(float32(), Literal::Float(1.5f)); cases.emplace_back(float64(), Literal::Double(2.5)); cases.emplace_back(date(), Literal::Date(19738)); + cases.emplace_back(time(), Literal::Time(43200000000LL)); cases.emplace_back(timestamp(), Literal::Timestamp(1719446400000000LL)); + cases.emplace_back(timestamp_tz(), Literal::TimestampTz(1719446400000000LL)); + cases.emplace_back(timestamp_ns(), Literal::TimestampNs(1719446400000000123LL)); + cases.emplace_back(timestamptz_ns(), Literal::TimestampTzNs(1719446400000000123LL)); cases.emplace_back(string(), Literal::String("hello")); cases.emplace_back(decimal(9, 2), Literal::Decimal(12345, 9, 2)); cases.emplace_back(fixed(3), Literal::Fixed({0x01, 0x02, 0x03})); @@ -155,6 +159,23 @@ TEST(JsonInternalTest, SchemaFieldDefaultValuesRoundTripAllTypes) { } } +// The spec only permits UTC offsets for timestamptz / timestamptz_ns default values. +// A non-UTC offset (which the shared parser would silently normalize) must be rejected, +// while the UTC form is accepted. +TEST(JsonInternalTest, SchemaFieldRejectsNonUtcTimestamptzDefault) { + auto non_utc = nlohmann::json::parse( + R"({"id":1,"name":"ts","required":true,"type":"timestamptz","initial-default":"2024-06-27T05:00:00+05:00"})"); + EXPECT_FALSE(FieldFromJson(non_utc).has_value()); + + auto non_utc_ns = nlohmann::json::parse( + R"({"id":1,"name":"ts","required":true,"type":"timestamptz_ns","write-default":"2024-06-27T05:00:00-08:00"})"); + EXPECT_FALSE(FieldFromJson(non_utc_ns).has_value()); + + auto utc = nlohmann::json::parse( + R"({"id":1,"name":"ts","required":true,"type":"timestamptz","initial-default":"2024-06-27T00:00:00+00:00"})"); + EXPECT_TRUE(FieldFromJson(utc).has_value()); +} + TEST(JsonInternalTest, SortField) { auto identity_transform = Transform::Identity(); diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index 28f7eb9a0..d07cf71a8 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -167,6 +167,23 @@ TEST(SchemaTest, ValidateRejectsMismatchedDefaultValue) { EXPECT_THAT(status, iceberg::HasErrorMessage("write-default")); } +TEST(SchemaTest, ValidateCastsDefaultToFieldType) { + // Matching Java, the default literal is cast to the field type rather than required to + // match exactly: an int default on a long field is accepted (int -> long). + iceberg::Schema widened({iceberg::SchemaField( + 1, "id", iceberg::int64(), false, /*doc=*/{}, + std::make_shared(iceberg::Literal::Int(34)))}); + EXPECT_THAT(widened.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + iceberg::IsOk()); + + // A default whose type cannot be cast to the field type is still rejected. + iceberg::Schema uncastable({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, + std::make_shared(iceberg::Literal::String("oops")))}); + EXPECT_THAT(uncastable.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); +} + TEST(SchemaTest, ReassignIdsPreservesDefaultValues) { // Reassigning field IDs rebuilds each SchemaField, so the rebuild must carry the // default values over to the field with the new ID. diff --git a/src/iceberg/util/temporal_util.cc b/src/iceberg/util/temporal_util.cc index e00ee7cf5..9e6a93cfd 100644 --- a/src/iceberg/util/temporal_util.cc +++ b/src/iceberg/util/temporal_util.cc @@ -444,6 +444,11 @@ Result TemporalUtils::ParseTimestampNsWithZone(std::string_view str) { /*units_per_micro=*/internal::kNanosPerMicro); } +Result TemporalUtils::IsUtcOffset(std::string_view str) { + ICEBERG_ASSIGN_OR_RAISE(auto timestamp_with_offset, ParseTimestampWithZoneSuffix(str)); + return timestamp_with_offset.second == 0; +} + #define DISPATCH_EXTRACT_YEAR(type_id) \ case type_id: \ return ExtractYearImpl(literal); diff --git a/src/iceberg/util/temporal_util.h b/src/iceberg/util/temporal_util.h index 2121f565d..cc7a16319 100644 --- a/src/iceberg/util/temporal_util.h +++ b/src/iceberg/util/temporal_util.h @@ -125,6 +125,18 @@ class ICEBERG_EXPORT TemporalUtils { /// \return The number of nanoseconds since epoch (UTC), or an error. static Result ParseTimestampNsWithZone(std::string_view str); + /// \brief Reports whether a timestamp-with-zone string uses a UTC offset. + /// + /// The ParseTimestamp*WithZone parsers accept any offset and silently normalize it + /// to UTC. The spec's JSON single-value form for `timestamptz` / `timestamptz_ns` + /// default values only permits UTC ("Z" or "+00:00"), so callers that must enforce + /// that rule check the offset here before parsing. + /// + /// \param str The timestamp-with-zone string to inspect. + /// \return true if the offset is UTC, false if it is a non-UTC offset, or an error + /// if the timezone suffix cannot be parsed. + static Result IsUtcOffset(std::string_view str); + /// \brief Extract a date or timestamp year, as years from 1970 static Result ExtractYear(const Literal& literal); From 2052e4fbc9e4c7e490968049d9758c5729f281df Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 22 Jun 2026 23:20:55 -0700 Subject: [PATCH 06/22] chore: drop accidentally committed build-rest-tests.sh --- build-rest-tests.sh | 35 ----------------------------------- 1 file changed, 35 deletions(-) delete mode 100755 build-rest-tests.sh diff --git a/build-rest-tests.sh b/build-rest-tests.sh deleted file mode 100755 index 40ca233f4..000000000 --- a/build-rest-tests.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash -# -# Build and run the REST catalog unit tests (target: rest_catalog_test, which -# includes rest_json_serde_test.cc and rest_file_io_test.cc). -# -# Fast path: uses Homebrew LLVM (C++23 + libc++) and Homebrew apache-arrow, so -# Arrow is found via find_package instead of being compiled from source. -# -# One-time install: -# brew install cmake ninja llvm apache-arrow -# -set -euo pipefail -cd "$(dirname "$0")" - -LLVM_PREFIX="$(brew --prefix llvm)" -export SDKROOT="${SDKROOT:-$(xcrun --show-sdk-path)}" # help LLVM clang find the macOS SDK - -# NOTE: RelWithDebInfo (not Debug) on purpose. The vendored nanoarrow applies -# `-Werror -Wpedantic` only in Debug ($<$:...>), and Homebrew's -# Clang 22 flags nanoarrow's use of `__COUNTER__` as a C2y extension, which then -# becomes a fatal error. Non-Debug configs drop those flags. -cmake -S . -B build -G Ninja \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_CXX_COMPILER="${LLVM_PREFIX}/bin/clang++" \ - -DCMAKE_C_COMPILER="${LLVM_PREFIX}/bin/clang" \ - -DCMAKE_PREFIX_PATH="$(brew --prefix apache-arrow);$(brew --prefix)" \ - -DICEBERG_BUILD_REST=ON \ - -DICEBERG_BUILD_TESTS=ON - # If the Homebrew Arrow version mismatches and fails to compile, force the - # vendored Arrow 24.0.0 build instead by adding: - # -DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER - -cmake --build build --target rest_catalog_test -j - -ctest --test-dir build -R rest_catalog_test --output-on-failure From 54d9636223532377e941f48c8181d1b9285a36f2 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 23 Jun 2026 11:44:24 -0700 Subject: [PATCH 07/22] test(temporal): add unit test for TemporalUtils::IsUtcOffset Directly cover the UTC/non-UTC/unparseable offset cases the timestamptz default-value enforcement relies on. --- src/iceberg/test/temporal_util_test.cc | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/iceberg/test/temporal_util_test.cc b/src/iceberg/test/temporal_util_test.cc index 0d4426b0b..791d02c56 100644 --- a/src/iceberg/test/temporal_util_test.cc +++ b/src/iceberg/test/temporal_util_test.cc @@ -53,6 +53,31 @@ TEST(TemporalUtilTest, ParseTimestampNsChecksInt64Bounds) { IsError(ErrorKind::kInvalidArgument)); } +TEST(TemporalUtilTest, IsUtcOffset) { + // UTC offsets: "Z", "+00:00" and "-00:00". + ICEBERG_UNWRAP_OR_FAIL(auto z, TemporalUtils::IsUtcOffset("2024-06-27T00:00:00Z")); + EXPECT_TRUE(z); + ICEBERG_UNWRAP_OR_FAIL(auto plus_zero, + TemporalUtils::IsUtcOffset("2024-06-27T00:00:00+00:00")); + EXPECT_TRUE(plus_zero); + ICEBERG_UNWRAP_OR_FAIL(auto minus_zero, + TemporalUtils::IsUtcOffset("2024-06-27T00:00:00-00:00")); + EXPECT_TRUE(minus_zero); + + // Non-UTC offsets. + ICEBERG_UNWRAP_OR_FAIL(auto plus_five, + TemporalUtils::IsUtcOffset("2024-06-27T05:00:00+05:00")); + EXPECT_FALSE(plus_five); + ICEBERG_UNWRAP_OR_FAIL(auto minus_eight, + TemporalUtils::IsUtcOffset("2024-06-27T00:00:00-08:00")); + EXPECT_FALSE(minus_eight); + + // A missing or unparseable timezone suffix is an error. + EXPECT_THAT(TemporalUtils::IsUtcOffset("2024-06-27T00:00:00"), + IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(TemporalUtils::IsUtcOffset(""), IsError(ErrorKind::kInvalidArgument)); +} + TEST(TemporalUtilTest, ParseTimestampNsRejectsMoreThanNineFractionalDigits) { EXPECT_THAT(TemporalUtils::ParseTimestampNs("2026-01-01T00:00:01.0000010011"), IsError(ErrorKind::kInvalidArgument)); From 925611ae59a4a9b1afdd4131a8b487b6556747ee Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 23 Jun 2026 13:57:26 -0700 Subject: [PATCH 08/22] ci: re-trigger CI The AMD64 Windows job failed in its cleanup step (rm -rf example/build: Device or resource busy) after the example compiled and linked successfully; re-run CI. From 0b4d4524363a69d008e1b499cd7e42c3c29693f6 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 26 Jun 2026 10:15:07 -0700 Subject: [PATCH 09/22] fix(schema): address review nits on default-value validation/serde - ValidateDefault: use internal::checked_pointer_cast and ICEBERG_ASSIGN_OR_RAISE so an uncastable default surfaces the original CastTo error instead of a generic message; keep the out-of-range (sentinel) case as an invalid-schema error. - json_serde: error out directly when a timestamptz default is not a string, drop the offset-specific wording from the UTC error message (it also accepts -00:00), and use auto for the parsed default JSON. - Update schema tests for the propagated cast error and the out-of-range case. --- src/iceberg/json_serde.cc | 11 +++++------ src/iceberg/schema_field.cc | 11 ++++++----- src/iceberg/test/schema_test.cc | 15 +++++++++------ 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 24fb591e1..6790e9861 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -578,15 +578,14 @@ Status ValidateTimestamptzDefaultIsUtc(const Type& type, const nlohmann::json& v return {}; } if (!value.is_string()) { - // Let LiteralFromJson report the type mismatch. - return {}; + return JsonParseError("Invalid timestamptz default {} for {}: expected a string", + SafeDumpJson(value), type.ToString()); } const auto str = value.get(); ICEBERG_ASSIGN_OR_RAISE(bool is_utc, TemporalUtils::IsUtcOffset(str)); if (!is_utc) { return JsonParseError( - "Invalid timestamptz default '{}' for {}: default values must use UTC " - "(offset 'Z' or '+00:00')", + "Invalid timestamptz default '{}' for {}: default values must use a UTC offset", str, type.ToString()); } return {}; @@ -601,9 +600,9 @@ Result> FieldFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE(auto name, GetJsonValue(json, kName)); ICEBERG_ASSIGN_OR_RAISE(auto required, GetJsonValue(json, kRequired)); ICEBERG_ASSIGN_OR_RAISE(auto doc, GetJsonValueOrDefault(json, kDoc)); - ICEBERG_ASSIGN_OR_RAISE(std::optional initial_default_json, + ICEBERG_ASSIGN_OR_RAISE(auto initial_default_json, GetJsonValueOptional(json, kInitialDefault)); - ICEBERG_ASSIGN_OR_RAISE(std::optional write_default_json, + ICEBERG_ASSIGN_OR_RAISE(auto write_default_json, GetJsonValueOptional(json, kWriteDefault)); std::shared_ptr initial_default; diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index d2a7a8e4f..144f75799 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -25,6 +25,7 @@ #include "iceberg/expression/literal.h" #include "iceberg/type.h" +#include "iceberg/util/checked_cast.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" @@ -107,11 +108,11 @@ Status ValidateDefault(const SchemaField& field, const Literal& value, // a string default on a date/timestamp/uuid field). Reject only defaults that cannot // be cast to the field type or fall outside its range (CastTo signals out-of-range as // an above-max/below-min sentinel). - auto field_type = std::static_pointer_cast(field.type()); - auto cast = value.CastTo(field_type); - if (!cast.has_value() || cast->IsAboveMax() || cast->IsBelowMin()) { - return InvalidSchema("{} of field {} has type {} that cannot be cast to {}", kind, - field.name(), *value.type(), *field.type()); + auto field_type = internal::checked_pointer_cast(field.type()); + ICEBERG_ASSIGN_OR_RAISE(auto cast, value.CastTo(field_type)); + if (cast.IsAboveMax() || cast.IsBelowMin()) { + return InvalidSchema("{} of field {} ({}) is out of range for {}", kind, field.name(), + *value.type(), *field.type()); } return {}; } diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index d07cf71a8..2db69a551 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -162,9 +162,11 @@ TEST(SchemaTest, ValidateRejectsMismatchedDefaultValue) { 1, "id", iceberg::int32(), false, /*doc=*/{}, /*initial_default=*/nullptr, std::make_shared(iceberg::Literal::String("oops")))}); + // The default literal is cast to the field type; an uncastable type surfaces the + // original cast error. auto status = schema.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion); - ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); - EXPECT_THAT(status, iceberg::HasErrorMessage("write-default")); + ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kNotSupported)); + EXPECT_THAT(status, iceberg::HasErrorMessage("Cast from String")); } TEST(SchemaTest, ValidateCastsDefaultToFieldType) { @@ -176,11 +178,12 @@ TEST(SchemaTest, ValidateCastsDefaultToFieldType) { EXPECT_THAT(widened.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), iceberg::IsOk()); - // A default whose type cannot be cast to the field type is still rejected. - iceberg::Schema uncastable({iceberg::SchemaField( + // A default outside the field type's range is rejected (CastTo returns an + // above-max/below-min sentinel). + iceberg::Schema out_of_range({iceberg::SchemaField( 1, "id", iceberg::int32(), false, /*doc=*/{}, - std::make_shared(iceberg::Literal::String("oops")))}); - EXPECT_THAT(uncastable.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + std::make_shared(iceberg::Literal::Long(5'000'000'000)))}); + EXPECT_THAT(out_of_range.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); } From b252b725e442b8271176542a3615b4239ac460e1 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 26 Jun 2026 11:40:29 -0700 Subject: [PATCH 10/22] refactor(serde): return Result from ToJson for schema/type/metadata serializers Column default values make single-value (Literal) serialization fallible, so the schema/type serializers must propagate errors instead of throwing. Change ToJson for SchemaField, Type, Schema, TableMetadata and TableUpdate (plus the REST CreateTableRequest/CommitTableRequest/LoadTableResult/CommitTableResponse, which embed them) to return Result, replacing the ICEBERG_ASSIGN_OR_THROW in SchemaField default serialization with ICEBERG_ASSIGN_OR_RAISE. - The shared ToJsonList template (also used by infallible serializers) is left unchanged; TableMetadata serializes its schema list with an explicit loop. - REST adds an ICEBERG_DECLARE_JSON_SERDE_FALLIBLE macro variant for the four models that embed a schema/metadata; the other REST models keep the bare-json ToJson. - Callers bottom out at existing Result boundaries (ToJsonString wrappers, rest_catalog.cc, TableMetadataUtil::Write). - Tests unwrap via ICEBERG_UNWRAP_OR_FAIL. --- src/iceberg/catalog/rest/json_serde.cc | 17 ++-- .../catalog/rest/json_serde_internal.h | 19 +++- src/iceberg/catalog/rest/rest_catalog.cc | 6 +- src/iceberg/json_serde.cc | 49 +++++----- src/iceberg/json_serde_internal.h | 10 +-- src/iceberg/table_metadata.cc | 2 +- src/iceberg/test/json_serde_test.cc | 89 ++++++++++++------- src/iceberg/test/metadata_serde_test.cc | 2 +- src/iceberg/test/schema_json_test.cc | 31 ++++--- 9 files changed, 143 insertions(+), 82 deletions(-) diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index ac80e9f08..49dbcdd4c 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -690,10 +690,10 @@ Result RenameTableRequestFromJson(const nlohmann::json& json } // LoadTableResult (used by CreateTableResponse, LoadTableResponse) -nlohmann::json ToJson(const LoadTableResult& result) { +Result ToJson(const LoadTableResult& result) { nlohmann::json json; SetOptionalStringField(json, kMetadataLocation, result.metadata_location); - json[kMetadata] = ToJson(*result.metadata); + ICEBERG_ASSIGN_OR_RAISE(json[kMetadata], ToJson(*result.metadata)); SetContainerField(json, kConfig, result.config); return json; } @@ -820,12 +820,12 @@ Result ListTablesResponseFromJson(const nlohmann::json& json return response; } -nlohmann::json ToJson(const CreateTableRequest& request) { +Result ToJson(const CreateTableRequest& request) { nlohmann::json json; json[kName] = request.name; SetOptionalStringField(json, kLocation, request.location); if (request.schema) { - json[kSchema] = ToJson(*request.schema); + ICEBERG_ASSIGN_OR_RAISE(json[kSchema], ToJson(*request.schema)); } if (request.partition_spec) { json[kPartitionSpec] = ToJson(*request.partition_spec); @@ -872,7 +872,7 @@ Result CreateTableRequestFromJson(const nlohmann::json& json } // CommitTableRequest serialization -nlohmann::json ToJson(const CommitTableRequest& request) { +Result ToJson(const CommitTableRequest& request) { nlohmann::json json; if (!request.identifier.name.empty()) { json[kIdentifier] = ToJson(request.identifier); @@ -886,7 +886,8 @@ nlohmann::json ToJson(const CommitTableRequest& request) { nlohmann::json updates_json = nlohmann::json::array(); for (const auto& update : request.updates) { - updates_json.push_back(ToJson(*update)); + ICEBERG_ASSIGN_OR_RAISE(auto update_json, ToJson(*update)); + updates_json.push_back(std::move(update_json)); } json[kUpdates] = std::move(updates_json); @@ -932,11 +933,11 @@ Result CommitTableRequestFromJson(const nlohmann::json& json } // CommitTableResponse serialization -nlohmann::json ToJson(const CommitTableResponse& response) { +Result ToJson(const CommitTableResponse& response) { nlohmann::json json; json[kMetadataLocation] = response.metadata_location; if (response.metadata) { - json[kMetadata] = ToJson(*response.metadata); + ICEBERG_ASSIGN_OR_RAISE(json[kMetadata], ToJson(*response.metadata)); } return json; } diff --git a/src/iceberg/catalog/rest/json_serde_internal.h b/src/iceberg/catalog/rest/json_serde_internal.h index 61d2eb6e0..ba05704c6 100644 --- a/src/iceberg/catalog/rest/json_serde_internal.h +++ b/src/iceberg/catalog/rest/json_serde_internal.h @@ -46,6 +46,16 @@ Result FromJson(const nlohmann::json& json); \ ICEBERG_REST_EXPORT nlohmann::json ToJson(const Model& model); +// Same as ICEBERG_DECLARE_JSON_SERDE, but ToJson returns Result because the model +// embeds a Schema/TableMetadata whose default-value serialization can fail. +#define ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(Model) \ + ICEBERG_REST_EXPORT Result Model##FromJson(const nlohmann::json& json); \ + \ + template <> \ + ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); \ + \ + ICEBERG_REST_EXPORT Result ToJson(const Model& model); + /// \note Don't forget to add `ICEBERG_DEFINE_FROM_JSON` to the end of /// `json_internal.cc` to define the `FromJson` function for the model. ICEBERG_DECLARE_JSON_SERDE(CatalogConfig) @@ -57,15 +67,16 @@ ICEBERG_DECLARE_JSON_SERDE(GetNamespaceResponse) ICEBERG_DECLARE_JSON_SERDE(UpdateNamespacePropertiesRequest) ICEBERG_DECLARE_JSON_SERDE(UpdateNamespacePropertiesResponse) ICEBERG_DECLARE_JSON_SERDE(ListTablesResponse) -ICEBERG_DECLARE_JSON_SERDE(LoadTableResult) +ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(LoadTableResult) ICEBERG_DECLARE_JSON_SERDE(RegisterTableRequest) ICEBERG_DECLARE_JSON_SERDE(RenameTableRequest) -ICEBERG_DECLARE_JSON_SERDE(CreateTableRequest) -ICEBERG_DECLARE_JSON_SERDE(CommitTableRequest) -ICEBERG_DECLARE_JSON_SERDE(CommitTableResponse) +ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(CreateTableRequest) +ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(CommitTableRequest) +ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(CommitTableResponse) ICEBERG_DECLARE_JSON_SERDE(OAuthTokenResponse) #undef ICEBERG_DECLARE_JSON_SERDE +#undef ICEBERG_DECLARE_JSON_SERDE_FALLIBLE ICEBERG_REST_EXPORT Result PlanTableScanResponseFromJson( const nlohmann::json& json, diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index 7c79b94d1..2f8035477 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -666,7 +666,8 @@ Result RestCatalog::CreateTableInternal( .properties = properties, }; - ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(ToJson(request))); + ICEBERG_ASSIGN_OR_RAISE(auto request_json, ToJson(request)); + ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(request_json)); ICEBERG_ASSIGN_OR_RAISE(const auto response, client_->Post(path, json_request, /*headers=*/{}, *TableErrorHandler::Instance(), session)); @@ -709,7 +710,8 @@ Result RestCatalog::UpdateTableInternal( request.updates.push_back(update->Clone()); } - ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(ToJson(request))); + ICEBERG_ASSIGN_OR_RAISE(auto request_json, ToJson(request)); + ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(request_json)); ICEBERG_ASSIGN_OR_RAISE(const auto response, client_->Post(path, json_request, /*headers=*/{}, *TableErrorHandler::Instance(), session)); diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 6790e9861..4deb45330 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -301,28 +301,26 @@ Result> SortOrderFromJson(const nlohmann::json& json) return SortOrder::Make(order_id, std::move(sort_fields)); } -nlohmann::json ToJson(const SchemaField& field) { +Result ToJson(const SchemaField& field) { nlohmann::json json; json[kId] = field.field_id(); json[kName] = field.name(); json[kRequired] = !field.optional(); - json[kType] = ToJson(*field.type()); + ICEBERG_ASSIGN_OR_RAISE(json[kType], ToJson(*field.type())); if (!field.doc().empty()) { json[kDoc] = field.doc(); } - // Defaults are validated to be primitive literals matching the field type, so - // single-value serialization cannot fail here. if (field.initial_default().has_value()) { - ICEBERG_ASSIGN_OR_THROW(json[kInitialDefault], + ICEBERG_ASSIGN_OR_RAISE(json[kInitialDefault], ToJson(field.initial_default()->get())); } if (field.write_default().has_value()) { - ICEBERG_ASSIGN_OR_THROW(json[kWriteDefault], ToJson(field.write_default()->get())); + ICEBERG_ASSIGN_OR_RAISE(json[kWriteDefault], ToJson(field.write_default()->get())); } return json; } -nlohmann::json ToJson(const Type& type) { +Result ToJson(const Type& type) { switch (type.type_id()) { case TypeId::kStruct: { const auto& struct_type = internal::checked_cast(type); @@ -330,7 +328,8 @@ nlohmann::json ToJson(const Type& type) { json[kType] = kStruct; nlohmann::json fields_json = nlohmann::json::array(); for (const auto& field : struct_type.fields()) { - fields_json.push_back(ToJson(field)); + ICEBERG_ASSIGN_OR_RAISE(auto field_json, ToJson(field)); + fields_json.push_back(std::move(field_json)); } json[kFields] = fields_json; return json; @@ -343,7 +342,7 @@ nlohmann::json ToJson(const Type& type) { const auto& element_field = list_type.fields().front(); json[kElementId] = element_field.field_id(); json[kElementRequired] = !element_field.optional(); - json[kElement] = ToJson(*element_field.type()); + ICEBERG_ASSIGN_OR_RAISE(json[kElement], ToJson(*element_field.type())); return json; } case TypeId::kMap: { @@ -353,12 +352,12 @@ nlohmann::json ToJson(const Type& type) { const auto& key_field = map_type.key(); json[kKeyId] = key_field.field_id(); - json[kKey] = ToJson(*key_field.type()); + ICEBERG_ASSIGN_OR_RAISE(json[kKey], ToJson(*key_field.type())); const auto& value_field = map_type.value(); json[kValueId] = value_field.field_id(); json[kValueRequired] = !value_field.optional(); - json[kValue] = ToJson(*value_field.type()); + ICEBERG_ASSIGN_OR_RAISE(json[kValue], ToJson(*value_field.type())); return json; } case TypeId::kBoolean: @@ -404,8 +403,9 @@ nlohmann::json ToJson(const Type& type) { std::unreachable(); } -nlohmann::json ToJson(const Schema& schema) { - nlohmann::json json = ToJson(internal::checked_cast(schema)); +Result ToJson(const Schema& schema) { + ICEBERG_ASSIGN_OR_RAISE(nlohmann::json json, + ToJson(internal::checked_cast(schema))); json[kSchemaId] = schema.schema_id(); if (!schema.IdentifierFieldIds().empty()) { json[kIdentifierFieldIds] = schema.IdentifierFieldIds(); @@ -414,7 +414,8 @@ nlohmann::json ToJson(const Schema& schema) { } Result ToJsonString(const Schema& schema) { - return ToJsonString(ToJson(schema)); + ICEBERG_ASSIGN_OR_RAISE(auto json, ToJson(schema)); + return ToJsonString(json); } nlohmann::json ToJson(const SnapshotRef& ref) { @@ -956,7 +957,7 @@ Result EncryptedKeyFromJson(const nlohmann::json& json) { }; } -nlohmann::json ToJson(const TableMetadata& table_metadata) { +Result ToJson(const TableMetadata& table_metadata) { nlohmann::json json; json[kFormatVersion] = table_metadata.format_version; @@ -974,7 +975,7 @@ nlohmann::json ToJson(const TableMetadata& table_metadata) { if (table_metadata.format_version == 1) { for (const auto& schema : table_metadata.schemas) { if (schema->schema_id() == table_metadata.current_schema_id) { - json[kSchema] = ToJson(*schema); + ICEBERG_ASSIGN_OR_RAISE(json[kSchema], ToJson(*schema)); break; } } @@ -982,7 +983,14 @@ nlohmann::json ToJson(const TableMetadata& table_metadata) { // write the current schema ID and schema list json[kCurrentSchemaId] = table_metadata.current_schema_id; - json[kSchemas] = ToJsonList(table_metadata.schemas); + // Schemas can carry fallible default-value serialization, so the shared ToJsonList + // helper (which assumes infallible ToJson) is not used here. + nlohmann::json schemas_json = nlohmann::json::array(); + for (const auto& schema : table_metadata.schemas) { + ICEBERG_ASSIGN_OR_RAISE(auto schema_json, ToJson(*schema)); + schemas_json.push_back(std::move(schema_json)); + } + json[kSchemas] = std::move(schemas_json); // for older readers, continue writing the default spec as "partition-spec" if (table_metadata.format_version == 1) { @@ -1032,7 +1040,8 @@ nlohmann::json ToJson(const TableMetadata& table_metadata) { } Result ToJsonString(const TableMetadata& table_metadata) { - return ToJsonString(ToJson(table_metadata)); + ICEBERG_ASSIGN_OR_RAISE(auto json, ToJson(table_metadata)); + return ToJsonString(json); } namespace { @@ -1435,7 +1444,7 @@ Result NamespaceFromJson(const nlohmann::json& json) { return ns; } -nlohmann::json ToJson(const TableUpdate& update) { +Result ToJson(const TableUpdate& update) { nlohmann::json json; switch (update.kind()) { case TableUpdate::Kind::kAssignUUID: { @@ -1454,7 +1463,7 @@ nlohmann::json ToJson(const TableUpdate& update) { const auto& u = internal::checked_cast(update); json[kAction] = kActionAddSchema; if (u.schema()) { - json[kSchema] = ToJson(*u.schema()); + ICEBERG_ASSIGN_OR_RAISE(json[kSchema], ToJson(*u.schema())); } else { json[kSchema] = nlohmann::json::value_t::null; } diff --git a/src/iceberg/json_serde_internal.h b/src/iceberg/json_serde_internal.h index 351b41086..1a30e8e8f 100644 --- a/src/iceberg/json_serde_internal.h +++ b/src/iceberg/json_serde_internal.h @@ -93,7 +93,7 @@ ICEBERG_EXPORT Result> SortOrderFromJson( /// /// \param schema The Iceberg schema to convert. /// \return The JSON representation of the schema. -ICEBERG_EXPORT nlohmann::json ToJson(const Schema& schema); +ICEBERG_EXPORT Result ToJson(const Schema& schema); /// \brief Convert an Iceberg Schema to JSON. /// @@ -111,7 +111,7 @@ ICEBERG_EXPORT Result> SchemaFromJson(const nlohmann::js /// /// \param type The Iceberg type to convert. /// \return The JSON representation of the type. -ICEBERG_EXPORT nlohmann::json ToJson(const Type& type); +ICEBERG_EXPORT Result ToJson(const Type& type); /// \brief Convert JSON to an Iceberg Type. /// @@ -123,7 +123,7 @@ ICEBERG_EXPORT Result> TypeFromJson(const nlohmann::json& /// /// \param field The Iceberg field to convert. /// \return The JSON representation of the field. -ICEBERG_EXPORT nlohmann::json ToJson(const SchemaField& field); +ICEBERG_EXPORT Result ToJson(const SchemaField& field); /// \brief Convert JSON to an Iceberg SchemaField. /// @@ -300,7 +300,7 @@ ICEBERG_EXPORT Result EncryptedKeyFromJson(const nlohmann::json& j /// /// \param table_metadata The `TableMetadata` object to be serialized. /// \return A JSON object representing the `TableMetadata`. -ICEBERG_EXPORT nlohmann::json ToJson(const TableMetadata& table_metadata); +ICEBERG_EXPORT Result ToJson(const TableMetadata& table_metadata); /// \brief Serializes a `TableMetadata` object to JSON. /// @@ -404,7 +404,7 @@ ICEBERG_EXPORT Result NamespaceFromJson(const nlohmann::json& json); /// /// \param update The `TableUpdate` object to be serialized. /// \return A JSON object representing the `TableUpdate`. -ICEBERG_EXPORT nlohmann::json ToJson(const TableUpdate& update); +ICEBERG_EXPORT Result ToJson(const TableUpdate& update); /// \brief Deserializes a JSON object into a `TableUpdate` object. /// diff --git a/src/iceberg/table_metadata.cc b/src/iceberg/table_metadata.cc index 53e3c231d..ef2f1bf38 100644 --- a/src/iceberg/table_metadata.cc +++ b/src/iceberg/table_metadata.cc @@ -489,7 +489,7 @@ Result TableMetadataUtil::Write(FileIO& io, const TableMetadata* ba Status TableMetadataUtil::Write(FileIO& io, const std::string& location, const TableMetadata& metadata) { - auto json = ToJson(metadata); + ICEBERG_ASSIGN_OR_RAISE(auto json, ToJson(metadata)); ICEBERG_ASSIGN_OR_RAISE(auto json_string, ToJsonString(json)); return io.WriteFile(location, json_string); } diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index 3951a806c..c97ed64a6 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -93,6 +93,19 @@ void TestJsonConversion(const T& obj, const nlohmann::json& expected_json) { EXPECT_EQ(obj, *obj_ex.value()) << "Deserialized object mismatch."; } +// ToJson(SchemaField) returns Result, so it cannot use the shared +// TestJsonConversion helper. Unwrap the serialized json before comparing and +// round-tripping. +void TestSchemaFieldJsonConversion(const SchemaField& field, + const nlohmann::json& expected_json) { + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(field)); + EXPECT_EQ(expected_json, json) << "JSON conversion mismatch."; + + auto obj_ex = FieldFromJson(expected_json); + EXPECT_TRUE(obj_ex.has_value()) << "Failed to deserialize JSON."; + EXPECT_EQ(field, *obj_ex.value()) << "Deserialized object mismatch."; +} + } // namespace // Pins the wire format produced by ToJson(SchemaField) / FieldFromJson for @@ -103,7 +116,7 @@ TEST(JsonInternalTest, SchemaFieldDefaultValues) { SchemaField with_both(/*field_id=*/1, "id", int32(), /*optional=*/false, /*doc=*/{}, std::make_shared(Literal::Int(42)), std::make_shared(Literal::Int(7))); - TestJsonConversion( + TestSchemaFieldJsonConversion( with_both, R"({"id":1,"name":"id","required":true,"type":"int","initial-default":42,"write-default":7})"_json); @@ -112,14 +125,14 @@ TEST(JsonInternalTest, SchemaFieldDefaultValues) { /*doc=*/{}, std::make_shared(Literal::String("n/a")), /*write_default=*/nullptr); - TestJsonConversion( + TestSchemaFieldJsonConversion( initial_only, R"({"id":2,"name":"name","required":false,"type":"string","initial-default":"n/a"})"_json); // No defaults; neither key may appear. SchemaField no_defaults(/*field_id=*/3, "plain", int32(), /*optional=*/false); - TestJsonConversion(no_defaults, - R"({"id":3,"name":"plain","required":true,"type":"int"})"_json); + TestSchemaFieldJsonConversion( + no_defaults, R"({"id":3,"name":"plain","required":true,"type":"int"})"_json); } // Round-trips a field carrying both defaults through ToJson -> FieldFromJson for @@ -152,7 +165,7 @@ TEST(JsonInternalTest, SchemaFieldDefaultValuesRoundTripAllTypes) { SchemaField field(field_id++, "f", type, /*optional=*/false, /*doc=*/{}, std::make_shared(literal), std::make_shared(literal)); - auto json = ToJson(field); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(field)); ICEBERG_UNWRAP_OR_FAIL(auto parsed, FieldFromJson(json)); EXPECT_EQ(field, *parsed) << "round-trip mismatch for type " << type->ToString() << ", json=" << json.dump(); @@ -415,7 +428,8 @@ TEST(JsonInternalTest, TableUpdateAssignUUID) { nlohmann::json expected = R"({"action":"assign-uuid","uuid":"550e8400-e29b-41d4-a716-446655440000"})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), update); @@ -426,7 +440,8 @@ TEST(JsonInternalTest, TableUpdateUpgradeFormatVersion) { nlohmann::json expected = R"({"action":"upgrade-format-version","format-version":2})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -440,7 +455,7 @@ TEST(JsonInternalTest, TableUpdateAddSchema) { /*schema_id=*/1); table::AddSchema update(schema, 2); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "add-schema"); EXPECT_EQ(json["last-column-id"], 2); EXPECT_TRUE(json.contains("schema")); @@ -457,9 +472,10 @@ TEST(JsonInternalTest, TableUpdateAddSchemaWithoutDeprecatedLastColumnId) { std::vector{SchemaField(1, "id", int64(), false), SchemaField(3, "name", string(), true)}, /*schema_id=*/1); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); nlohmann::json json = { {"action", "add-schema"}, - {"schema", ToJson(*schema)}, + {"schema", schema_json}, }; auto parsed = TableUpdateFromJson(json); @@ -473,7 +489,8 @@ TEST(JsonInternalTest, TableUpdateSetCurrentSchema) { table::SetCurrentSchema update(1); nlohmann::json expected = R"({"action":"set-current-schema","schema-id":1})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -487,7 +504,7 @@ TEST(JsonInternalTest, TableUpdateAddPartitionSpec) { PartitionSpec::Make(1, {PartitionField(3, 101, "region", identity_transform)})); table::AddPartitionSpec update(std::move(spec)); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "add-spec"); EXPECT_TRUE(json.contains("spec")); @@ -501,7 +518,8 @@ TEST(JsonInternalTest, TableUpdateSetDefaultPartitionSpec) { table::SetDefaultPartitionSpec update(2); nlohmann::json expected = R"({"action":"set-default-spec","spec-id":2})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ( @@ -514,7 +532,8 @@ TEST(JsonInternalTest, TableUpdateRemovePartitionSpecs) { nlohmann::json expected = R"({"action":"remove-partition-specs","spec-ids":[1,2,3]})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -524,7 +543,7 @@ TEST(JsonInternalTest, TableUpdateRemovePartitionSpecs) { TEST(JsonInternalTest, TableUpdateRemoveSchemas) { table::RemoveSchemas update({1, 2}); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "remove-schemas"); EXPECT_THAT(json["schema-ids"].get>(), testing::UnorderedElementsAre(1, 2)); @@ -540,7 +559,7 @@ TEST(JsonInternalTest, TableUpdateAddSortOrder) { ICEBERG_UNWRAP_OR_FAIL(auto sort_order, SortOrder::Make(1, {st})); table::AddSortOrder update(std::move(sort_order)); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "add-sort-order"); EXPECT_TRUE(json.contains("sort-order")); @@ -555,7 +574,8 @@ TEST(JsonInternalTest, TableUpdateSetDefaultSortOrder) { nlohmann::json expected = R"({"action":"set-default-sort-order","sort-order-id":1})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -573,7 +593,7 @@ TEST(JsonInternalTest, TableUpdateAddSnapshot) { .schema_id = 1}); table::AddSnapshot update(snapshot); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "add-snapshot"); EXPECT_TRUE(json.contains("snapshot")); @@ -588,7 +608,8 @@ TEST(JsonInternalTest, TableUpdateRemoveSnapshots) { nlohmann::json expected = R"({"action":"remove-snapshots","snapshot-ids":[111,222,333]})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -600,7 +621,8 @@ TEST(JsonInternalTest, TableUpdateRemoveSnapshotRef) { nlohmann::json expected = R"({"action":"remove-snapshot-ref","ref-name":"my-branch"})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -611,7 +633,7 @@ TEST(JsonInternalTest, TableUpdateSetSnapshotRefBranch) { table::SetSnapshotRef update("main", 123456789, SnapshotRefType::kBranch, 5, 86400000, 604800000); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "set-snapshot-ref"); EXPECT_EQ(json["ref-name"], "main"); EXPECT_EQ(json["snapshot-id"], 123456789); @@ -626,7 +648,7 @@ TEST(JsonInternalTest, TableUpdateSetSnapshotRefBranch) { TEST(JsonInternalTest, TableUpdateSetSnapshotRefTag) { table::SetSnapshotRef update("release-1.0", 987654321, SnapshotRefType::kTag); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "set-snapshot-ref"); EXPECT_EQ(json["type"], "tag"); @@ -639,7 +661,7 @@ TEST(JsonInternalTest, TableUpdateSetSnapshotRefTag) { TEST(JsonInternalTest, TableUpdateSetProperties) { table::SetProperties update({{"key1", "value1"}, {"key2", "value2"}}); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "set-properties"); EXPECT_TRUE(json.contains("updates")); @@ -668,7 +690,7 @@ TEST(JsonInternalTest, TableUpdateSetPropertiesMissingCanonicalField) { TEST(JsonInternalTest, TableUpdateRemoveProperties) { table::RemoveProperties update({"key1", "key2"}); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "remove-properties"); EXPECT_TRUE(json.contains("removals")); @@ -700,7 +722,8 @@ TEST(JsonInternalTest, TableUpdateSetLocation) { nlohmann::json expected = R"({"action":"set-location","location":"s3://bucket/warehouse/table"})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), update); @@ -736,7 +759,8 @@ TEST(JsonInternalTest, TableUpdateSetStatistics) { } })"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), update); @@ -747,7 +771,8 @@ TEST(JsonInternalTest, TableUpdateRemoveStatistics) { nlohmann::json expected = R"({"action":"remove-statistics","snapshot-id":123456789})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -771,7 +796,8 @@ TEST(JsonInternalTest, TableUpdateSetPartitionStatistics) { } })"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -783,7 +809,8 @@ TEST(JsonInternalTest, TableUpdateRemovePartitionStatistics) { nlohmann::json expected = R"({"action":"remove-partition-statistics","snapshot-id":123456789})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ( @@ -809,7 +836,8 @@ TEST(JsonInternalTest, TableUpdateAddEncryptionKey) { {"properties", {{"scope", "table"}}}}}, }; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -828,7 +856,8 @@ TEST(JsonInternalTest, TableUpdateRemoveEncryptionKey) { table::RemoveEncryptionKey update("key-1"); nlohmann::json expected = R"({"action":"remove-encryption-key","key-id":"key-1"})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), diff --git a/src/iceberg/test/metadata_serde_test.cc b/src/iceberg/test/metadata_serde_test.cc index 48f88f2bc..acda5e068 100644 --- a/src/iceberg/test/metadata_serde_test.cc +++ b/src/iceberg/test/metadata_serde_test.cc @@ -534,7 +534,7 @@ TEST(MetadataSerdeTest, EncryptionKeysRoundTrip) { EXPECT_EQ(metadata.value()->encryption_keys[0].encrypted_key_metadata, "secret-key-metadata"); - auto serialized = ToJson(*metadata.value()); + ICEBERG_UNWRAP_OR_FAIL(auto serialized, ToJson(*metadata.value())); ASSERT_TRUE(serialized.contains("encryption-keys")); EXPECT_EQ(serialized["encryption-keys"], metadata_json["encryption-keys"]); } diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index 71fc474f4..cf7bdb647 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -43,8 +43,8 @@ class TypeJsonTest : public ::testing::TestWithParam {}; TEST_P(TypeJsonTest, SingleTypeRoundTrip) { // To Json const auto& param = GetParam(); - auto json = ToJson(*param.type).dump(); - ASSERT_EQ(param.json, json); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(*param.type)); + ASSERT_EQ(param.json, json.dump()); // From Json auto type_result = TypeFromJson(nlohmann::json::parse(param.json)); @@ -134,8 +134,8 @@ TEST(SchemaJsonTest, RoundTrip) { ASSERT_EQ(field2.type()->type_id(), TypeId::kString); ASSERT_TRUE(field2.optional()); - auto dumped_json = ToJson(*schema).dump(); - ASSERT_EQ(dumped_json, json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); } TEST(SchemaJsonTest, FieldWithDefaultValuesRoundTrip) { @@ -156,7 +156,8 @@ TEST(SchemaJsonTest, FieldWithDefaultValuesRoundTrip) { ASSERT_EQ(field2.initial_default()->get(), Literal::String("n/a")); ASSERT_FALSE(field2.write_default().has_value()); - ASSERT_EQ(ToJson(*schema).dump(), json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); } TEST(SchemaJsonTest, FieldWithMismatchedDefaultValueFails) { @@ -179,7 +180,8 @@ TEST(SchemaJsonTest, NestedFieldWithDefaultValuesRoundTrip) { ASSERT_TRUE(nested.write_default().has_value()); ASSERT_EQ(nested.write_default()->get(), Literal::Int(21)); - ASSERT_EQ(ToJson(*schema).dump(), json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); } TEST(SchemaJsonTest, UnknownFieldRoundTrip) { @@ -194,7 +196,8 @@ TEST(SchemaJsonTest, UnknownFieldRoundTrip) { ASSERT_EQ(field.name(), "mystery"); ASSERT_EQ(field.type()->type_id(), TypeId::kUnknown); ASSERT_TRUE(field.optional()); - ASSERT_EQ(ToJson(*schema).dump(), json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); } TEST(SchemaJsonTest, NestedUnknownFieldsRoundTrip) { @@ -261,7 +264,8 @@ TEST(SchemaJsonTest, NestedUnknownFieldsRoundTrip) { ASSERT_EQ(properties->value().type()->type_id(), TypeId::kUnknown); ASSERT_TRUE(properties->value().optional()); - ASSERT_EQ(ToJson(*schema), parsed_json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json, parsed_json); } TEST(SchemaJsonTest, IdentifierFieldIds) { @@ -280,7 +284,8 @@ TEST(SchemaJsonTest, IdentifierFieldIds) { ASSERT_EQ(schema_with_identifers->schema_id(), 1); ASSERT_EQ(schema_with_identifers->IdentifierFieldIds().size(), 1); ASSERT_EQ(schema_with_identifers->IdentifierFieldIds()[0], 1); - ASSERT_EQ(ToJson(*schema_with_identifers), json_with_identifiers); + ICEBERG_UNWRAP_OR_FAIL(auto json_with_identifiers_out, ToJson(*schema_with_identifers)); + ASSERT_EQ(json_with_identifiers_out, json_with_identifiers); // Test schema without identifier-field-ids constexpr std::string_view json_without_identifiers_str = @@ -293,7 +298,9 @@ TEST(SchemaJsonTest, IdentifierFieldIds) { ICEBERG_UNWRAP_OR_FAIL(auto schema_without_identifiers, SchemaFromJson(json_without_identifiers)); ASSERT_TRUE(schema_without_identifiers->IdentifierFieldIds().empty()); - ASSERT_EQ(ToJson(*schema_without_identifiers), json_without_identifiers); + ICEBERG_UNWRAP_OR_FAIL(auto json_without_identifiers_out, + ToJson(*schema_without_identifiers)); + ASSERT_EQ(json_without_identifiers_out, json_without_identifiers); // Test schema with multiple identifier fields constexpr std::string_view json_multi_identifiers_str = @@ -309,7 +316,9 @@ TEST(SchemaJsonTest, IdentifierFieldIds) { ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds().size(), 2); ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds()[0], 1); ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds()[1], 2); - ASSERT_EQ(ToJson(*schema_multi_identifiers), json_multi_identifiers); + ICEBERG_UNWRAP_OR_FAIL(auto json_multi_identifiers_out, + ToJson(*schema_multi_identifiers)); + ASSERT_EQ(json_multi_identifiers_out, json_multi_identifiers); } } // namespace iceberg From 36a76ca87eab6db670133c65d5cabd4a59b6748b Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 26 Jun 2026 11:55:05 -0700 Subject: [PATCH 11/22] refactor(serde): avoid duplicate macro for fallible REST ToJson Extract the FromJson declarations into ICEBERG_DECLARE_FROM_JSON so the four schema/metadata-bearing models declare a Result-returning ToJson without duplicating the whole serde macro. --- .../catalog/rest/json_serde_internal.h | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/iceberg/catalog/rest/json_serde_internal.h b/src/iceberg/catalog/rest/json_serde_internal.h index ba05704c6..5b7c90fa0 100644 --- a/src/iceberg/catalog/rest/json_serde_internal.h +++ b/src/iceberg/catalog/rest/json_serde_internal.h @@ -38,23 +38,16 @@ namespace iceberg::rest { template Result FromJson(const nlohmann::json& json); -#define ICEBERG_DECLARE_JSON_SERDE(Model) \ +// Declares the two FromJson entry points for a model. +#define ICEBERG_DECLARE_FROM_JSON(Model) \ ICEBERG_REST_EXPORT Result Model##FromJson(const nlohmann::json& json); \ \ template <> \ - ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); \ - \ - ICEBERG_REST_EXPORT nlohmann::json ToJson(const Model& model); + ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); -// Same as ICEBERG_DECLARE_JSON_SERDE, but ToJson returns Result because the model -// embeds a Schema/TableMetadata whose default-value serialization can fail. -#define ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(Model) \ - ICEBERG_REST_EXPORT Result Model##FromJson(const nlohmann::json& json); \ - \ - template <> \ - ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); \ - \ - ICEBERG_REST_EXPORT Result ToJson(const Model& model); +#define ICEBERG_DECLARE_JSON_SERDE(Model) \ + ICEBERG_DECLARE_FROM_JSON(Model) \ + ICEBERG_REST_EXPORT nlohmann::json ToJson(const Model& model); /// \note Don't forget to add `ICEBERG_DEFINE_FROM_JSON` to the end of /// `json_internal.cc` to define the `FromJson` function for the model. @@ -67,16 +60,23 @@ ICEBERG_DECLARE_JSON_SERDE(GetNamespaceResponse) ICEBERG_DECLARE_JSON_SERDE(UpdateNamespacePropertiesRequest) ICEBERG_DECLARE_JSON_SERDE(UpdateNamespacePropertiesResponse) ICEBERG_DECLARE_JSON_SERDE(ListTablesResponse) -ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(LoadTableResult) ICEBERG_DECLARE_JSON_SERDE(RegisterTableRequest) ICEBERG_DECLARE_JSON_SERDE(RenameTableRequest) -ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(CreateTableRequest) -ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(CommitTableRequest) -ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(CommitTableResponse) ICEBERG_DECLARE_JSON_SERDE(OAuthTokenResponse) +// These models embed a Schema/TableMetadata whose default-value serialization can +// fail, so their ToJson returns Result; FromJson is declared like the others. +ICEBERG_DECLARE_FROM_JSON(LoadTableResult) +ICEBERG_REST_EXPORT Result ToJson(const LoadTableResult& model); +ICEBERG_DECLARE_FROM_JSON(CreateTableRequest) +ICEBERG_REST_EXPORT Result ToJson(const CreateTableRequest& model); +ICEBERG_DECLARE_FROM_JSON(CommitTableRequest) +ICEBERG_REST_EXPORT Result ToJson(const CommitTableRequest& model); +ICEBERG_DECLARE_FROM_JSON(CommitTableResponse) +ICEBERG_REST_EXPORT Result ToJson(const CommitTableResponse& model); + +#undef ICEBERG_DECLARE_FROM_JSON #undef ICEBERG_DECLARE_JSON_SERDE -#undef ICEBERG_DECLARE_JSON_SERDE_FALLIBLE ICEBERG_REST_EXPORT Result PlanTableScanResponseFromJson( const nlohmann::json& json, From 7f49ebc743250e2e3e3d96abd9ca92dec479e45b Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 26 Jun 2026 12:52:08 -0700 Subject: [PATCH 12/22] refactor(serde): declare fallible REST ToJson inline Keep ICEBERG_DECLARE_JSON_SERDE identical to upstream and declare the four schema/metadata-bearing models (LoadTableResult, CreateTableRequest, CommitTableRequest, CommitTableResponse) explicitly, since only their ToJson return type (Result) differs. --- .../catalog/rest/json_serde_internal.h | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/src/iceberg/catalog/rest/json_serde_internal.h b/src/iceberg/catalog/rest/json_serde_internal.h index 5b7c90fa0..efe81a4cf 100644 --- a/src/iceberg/catalog/rest/json_serde_internal.h +++ b/src/iceberg/catalog/rest/json_serde_internal.h @@ -38,15 +38,12 @@ namespace iceberg::rest { template Result FromJson(const nlohmann::json& json); -// Declares the two FromJson entry points for a model. -#define ICEBERG_DECLARE_FROM_JSON(Model) \ +#define ICEBERG_DECLARE_JSON_SERDE(Model) \ ICEBERG_REST_EXPORT Result Model##FromJson(const nlohmann::json& json); \ \ template <> \ - ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); - -#define ICEBERG_DECLARE_JSON_SERDE(Model) \ - ICEBERG_DECLARE_FROM_JSON(Model) \ + ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); \ + \ ICEBERG_REST_EXPORT nlohmann::json ToJson(const Model& model); /// \note Don't forget to add `ICEBERG_DEFINE_FROM_JSON` to the end of @@ -64,19 +61,33 @@ ICEBERG_DECLARE_JSON_SERDE(RegisterTableRequest) ICEBERG_DECLARE_JSON_SERDE(RenameTableRequest) ICEBERG_DECLARE_JSON_SERDE(OAuthTokenResponse) -// These models embed a Schema/TableMetadata whose default-value serialization can -// fail, so their ToJson returns Result; FromJson is declared like the others. -ICEBERG_DECLARE_FROM_JSON(LoadTableResult) +#undef ICEBERG_DECLARE_JSON_SERDE + +// These models embed a Schema/TableMetadata whose default-value serialization can fail, +// so their ToJson returns Result. FromJson is declared like the macro-based models above. +ICEBERG_REST_EXPORT Result LoadTableResultFromJson( + const nlohmann::json& json); +template <> +ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); ICEBERG_REST_EXPORT Result ToJson(const LoadTableResult& model); -ICEBERG_DECLARE_FROM_JSON(CreateTableRequest) + +ICEBERG_REST_EXPORT Result CreateTableRequestFromJson( + const nlohmann::json& json); +template <> +ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); ICEBERG_REST_EXPORT Result ToJson(const CreateTableRequest& model); -ICEBERG_DECLARE_FROM_JSON(CommitTableRequest) + +ICEBERG_REST_EXPORT Result CommitTableRequestFromJson( + const nlohmann::json& json); +template <> +ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); ICEBERG_REST_EXPORT Result ToJson(const CommitTableRequest& model); -ICEBERG_DECLARE_FROM_JSON(CommitTableResponse) -ICEBERG_REST_EXPORT Result ToJson(const CommitTableResponse& model); -#undef ICEBERG_DECLARE_FROM_JSON -#undef ICEBERG_DECLARE_JSON_SERDE +ICEBERG_REST_EXPORT Result CommitTableResponseFromJson( + const nlohmann::json& json); +template <> +ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); +ICEBERG_REST_EXPORT Result ToJson(const CommitTableResponse& model); ICEBERG_REST_EXPORT Result PlanTableScanResponseFromJson( const nlohmann::json& json, From 26b1f1bf739bde117bad8fef54fb376b2e304be4 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 27 Jun 2026 10:33:56 -0700 Subject: [PATCH 13/22] docs: clarify IsUtcOffset accepts Z/+00:00/-00:00; trim default-value comment Addresses review feedback: document that any zero-offset spelling (Z, +00:00, -00:00) is accepted by TemporalUtils::IsUtcOffset, and shorten the verbose comment on the default-value members. --- src/iceberg/schema_field.h | 4 +--- src/iceberg/util/temporal_util.h | 12 ++++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index e90425ca7..34592ce8f 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -134,9 +134,7 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { std::shared_ptr type_; bool optional_; std::string doc_; - // Default values are owned by this field and never mutated after being set; copies - // of the field share the same payload (reference-counted) instead of deep-copying, - // like `type_` above. Sharing is unobservable because the payload is immutable. + // Immutable default values, shared (not deep-copied) across field copies, like `type_`. std::shared_ptr initial_default_; std::shared_ptr write_default_; }; diff --git a/src/iceberg/util/temporal_util.h b/src/iceberg/util/temporal_util.h index cc7a16319..acb1f0e73 100644 --- a/src/iceberg/util/temporal_util.h +++ b/src/iceberg/util/temporal_util.h @@ -125,15 +125,19 @@ class ICEBERG_EXPORT TemporalUtils { /// \return The number of nanoseconds since epoch (UTC), or an error. static Result ParseTimestampNsWithZone(std::string_view str); - /// \brief Reports whether a timestamp-with-zone string uses a UTC offset. + /// \brief Reports whether a timestamp-with-zone string carries a zero (UTC) offset. /// /// The ParseTimestamp*WithZone parsers accept any offset and silently normalize it /// to UTC. The spec's JSON single-value form for `timestamptz` / `timestamptz_ns` - /// default values only permits UTC ("Z" or "+00:00"), so callers that must enforce - /// that rule check the offset here before parsing. + /// default values is meant to be UTC, so callers that must enforce that rule check + /// the offset here before parsing. + /// + /// Any zero-offset spelling is treated as UTC: "Z", "+00:00" and "-00:00" all denote + /// a zero offset and are accepted, even though the spec writes the canonical form as + /// "+00:00". Only a genuinely non-zero offset (e.g. "+05:00") is rejected. /// /// \param str The timestamp-with-zone string to inspect. - /// \return true if the offset is UTC, false if it is a non-UTC offset, or an error + /// \return true if the offset is zero, false if it is a non-zero offset, or an error /// if the timezone suffix cannot be parsed. static Result IsUtcOffset(std::string_view str); From 462d2d92d10f8fe901ee07352b928a4fce2d8880 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 27 Jun 2026 15:04:39 -0700 Subject: [PATCH 14/22] fix(schema): normalize default value to the field type at construction A cross-type default (e.g. an int literal on a long field, accepted to match Java's cast behavior) was stored un-normalized, so projection materialized a wrong-typed fill value and a ToJson/FieldFromJson round-trip compared unequal (Int(34) vs Long(34)). Cast the default to the field type when constructing the field; values that already match are untouched and uncastable ones are left for Validate() to reject. Fixes the projection and round-trip-equality paths. --- src/iceberg/schema_field.cc | 29 ++++++++++++++++++++++++++-- src/iceberg/test/schema_json_test.cc | 16 +++++++++++++++ src/iceberg/test/schema_test.cc | 5 +++++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index 144f75799..d854d75cb 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -31,6 +31,31 @@ namespace iceberg { +namespace { + +// Normalizes a default value to the field type. A cross-type default (e.g. an int +// literal on a long field) is accepted, so it is cast to the field type up front; +// otherwise projection, JSON round-trip and equality would observe a literal whose type +// differs from the field. A value that already matches the field type is returned as-is +// (no needless copy), and a value that cannot be cast is left unchanged so Validate() +// can report it. +std::shared_ptr NormalizeDefault(std::shared_ptr value, + const std::shared_ptr& field_type) { + if (value == nullptr || field_type == nullptr || !field_type->is_primitive()) { + return value; + } + if (*value->type() == *field_type) { + return value; + } + auto cast = value->CastTo(internal::checked_pointer_cast(field_type)); + if (!cast.has_value() || cast->IsAboveMax() || cast->IsBelowMin()) { + return value; + } + return std::make_shared(std::move(cast).value()); +} + +} // namespace + SchemaField::SchemaField(int32_t field_id, std::string_view name, std::shared_ptr type, bool optional, std::string_view doc, std::shared_ptr initial_default, @@ -40,8 +65,8 @@ SchemaField::SchemaField(int32_t field_id, std::string_view name, type_(std::move(type)), optional_(optional), doc_(doc), - initial_default_(std::move(initial_default)), - write_default_(std::move(write_default)) {} + initial_default_(NormalizeDefault(std::move(initial_default), type_)), + write_default_(NormalizeDefault(std::move(write_default), type_)) {} SchemaField SchemaField::MakeOptional(int32_t field_id, std::string_view name, std::shared_ptr type, std::string_view doc) { diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index f8b6114c1..70fd21c7c 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -241,6 +241,22 @@ TEST(SchemaJsonTest, NestedFieldWithDefaultValuesRoundTrip) { ASSERT_EQ(schema_json.dump(), json); } +TEST(SchemaJsonTest, CrossTypeDefaultNormalizedRoundTrip) { + // A programmatically-constructed cross-type default (an int literal on a long field) is + // normalized to the field type at construction, so it survives a ToJson/SchemaFromJson + // round-trip as an equal schema (an un-normalized int literal would compare unequal to + // the re-parsed long literal). + Schema original({SchemaField(1, "id", int64(), /*optional=*/false, /*doc=*/{}, + std::make_shared(Literal::Int(34)))}); + const auto& field = original.fields()[0]; + ASSERT_TRUE(field.initial_default().has_value()); + EXPECT_EQ(field.initial_default()->get(), Literal::Long(34)); + + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(original)); + ICEBERG_UNWRAP_OR_FAIL(auto reparsed, SchemaFromJson(json)); + EXPECT_EQ(original, *reparsed); +} + TEST(SchemaJsonTest, UnknownFieldRoundTrip) { constexpr std::string_view json = R"({"fields":[{"id":1,"name":"mystery","required":false,"type":"unknown"}],"schema-id":1,"type":"struct"})"; diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index b78798499..e03d1a29c 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -177,6 +177,11 @@ TEST(SchemaTest, ValidateCastsDefaultToFieldType) { std::make_shared(iceberg::Literal::Int(34)))}); EXPECT_THAT(widened.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), iceberg::IsOk()); + // The stored default is normalized to the field type (long), not kept as the narrower + // int literal, so projection/serialization/equality all observe a long. + const auto& widened_field = widened.fields()[0]; + ASSERT_TRUE(widened_field.initial_default().has_value()); + EXPECT_EQ(widened_field.initial_default()->get(), iceberg::Literal::Long(34)); // A default outside the field type's range is rejected (CastTo returns an // above-max/below-min sentinel). From c58050f96e0e563b67c1e0f40c4e06b53de7b7c8 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 27 Jun 2026 15:54:05 -0700 Subject: [PATCH 15/22] refactor(schema): normalize default values via a fallible SchemaField::Make factory Move default-value normalization out of the constructor (which cannot report errors) into a fallible factory, per review feedback. SchemaField::Make casts a cross-type default to the field type and propagates the cast error / rejects out-of-range values instead of swallowing them; it is a no-op when the default already matches the field type. The constructor stores defaults verbatim, and ValidateDefault requires a stored default to match the field type exactly (Make is the cast-accepting path). This keeps the cross-type-accept behavior while ensuring stored defaults are always field-typed, so projection and JSON round-trip equality are correct. --- src/iceberg/schema_field.cc | 57 +++++++++++++++++----------- src/iceberg/schema_field.h | 11 ++++++ src/iceberg/test/schema_json_test.cc | 15 ++++---- src/iceberg/test/schema_test.cc | 49 ++++++++++++------------ 4 files changed, 78 insertions(+), 54 deletions(-) diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index d854d75cb..51d8181f8 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -33,25 +33,27 @@ namespace iceberg { namespace { -// Normalizes a default value to the field type. A cross-type default (e.g. an int -// literal on a long field) is accepted, so it is cast to the field type up front; -// otherwise projection, JSON round-trip and equality would observe a literal whose type -// differs from the field. A value that already matches the field type is returned as-is -// (no needless copy), and a value that cannot be cast is left unchanged so Validate() -// can report it. -std::shared_ptr NormalizeDefault(std::shared_ptr value, - const std::shared_ptr& field_type) { +// Normalizes a default value to the field type so the stored value always matches the +// field. A cross-type default (e.g. an int literal on a long field) is cast to the field +// type; a value that already matches the field type, or a null, is returned unchanged. +// The cast error is propagated and an out-of-range value is rejected rather than +// swallowed. +Result> NormalizeDefault( + std::shared_ptr value, const std::shared_ptr& field_type) { if (value == nullptr || field_type == nullptr || !field_type->is_primitive()) { return value; } if (*value->type() == *field_type) { return value; } - auto cast = value->CastTo(internal::checked_pointer_cast(field_type)); - if (!cast.has_value() || cast->IsAboveMax() || cast->IsBelowMin()) { - return value; + ICEBERG_ASSIGN_OR_RAISE( + auto cast, + value->CastTo(internal::checked_pointer_cast(field_type))); + if (cast.IsAboveMax() || cast.IsBelowMin()) { + return InvalidSchema("default value of type {} is out of range for {}", + *value->type(), *field_type); } - return std::make_shared(std::move(cast).value()); + return std::make_shared(std::move(cast)); } } // namespace @@ -65,8 +67,21 @@ SchemaField::SchemaField(int32_t field_id, std::string_view name, type_(std::move(type)), optional_(optional), doc_(doc), - initial_default_(NormalizeDefault(std::move(initial_default), type_)), - write_default_(NormalizeDefault(std::move(write_default), type_)) {} + initial_default_(std::move(initial_default)), + write_default_(std::move(write_default)) {} + +Result SchemaField::Make(int32_t field_id, std::string_view name, + std::shared_ptr type, bool optional, + std::string_view doc, + std::shared_ptr initial_default, + std::shared_ptr write_default) { + ICEBERG_ASSIGN_OR_RAISE(initial_default, + NormalizeDefault(std::move(initial_default), type)); + ICEBERG_ASSIGN_OR_RAISE(write_default, + NormalizeDefault(std::move(write_default), type)); + return SchemaField(field_id, name, std::move(type), optional, doc, + std::move(initial_default), std::move(write_default)); +} SchemaField SchemaField::MakeOptional(int32_t field_id, std::string_view name, std::shared_ptr type, std::string_view doc) { @@ -128,15 +143,11 @@ Status ValidateDefault(const SchemaField& field, const Literal& value, "Invalid {} value for {}: default values are only supported for primitive types", kind, field.name()); } - // Match Java (Types.NestedField), which casts the default literal to the field type - // instead of requiring an exact type match (e.g. an int default on a long field, or - // a string default on a date/timestamp/uuid field). Reject only defaults that cannot - // be cast to the field type or fall outside its range (CastTo signals out-of-range as - // an above-max/below-min sentinel). - auto field_type = internal::checked_pointer_cast(field.type()); - ICEBERG_ASSIGN_OR_RAISE(auto cast, value.CastTo(field_type)); - if (cast.IsAboveMax() || cast.IsBelowMin()) { - return InvalidSchema("{} of field {} ({}) is out of range for {}", kind, field.name(), + // A default is normalized to the field type by SchemaField::Make (which casts a + // cross-type default and reports any cast error), so a stored default whose type does + // not match the field type is invalid here. + if (*value.type() != *field.type()) { + return InvalidSchema("{} of field {} has type {} but expected {}", kind, field.name(), *value.type(), *field.type()); } return {}; diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index 34592ce8f..f14659569 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -64,6 +64,17 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { static SchemaField MakeRequired(int32_t field_id, std::string_view name, std::shared_ptr type, std::string_view doc = {}); + /// \brief Construct a field, normalizing the default values to the field type. + /// + /// Unlike the constructor (which stores the defaults verbatim), this casts a default + /// whose literal type differs from the field type to the field type — e.g. an int + /// default on a long field — matching the spec/Java cast behavior. Returns an error if + /// a default cannot be cast to the field type or falls outside its range. + static Result Make( + int32_t field_id, std::string_view name, std::shared_ptr type, bool optional, + std::string_view doc = {}, std::shared_ptr initial_default = nullptr, + std::shared_ptr write_default = nullptr); + /// \brief Get the field ID. [[nodiscard]] int32_t field_id() const; diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index 70fd21c7c..070b4d349 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -242,16 +242,17 @@ TEST(SchemaJsonTest, NestedFieldWithDefaultValuesRoundTrip) { } TEST(SchemaJsonTest, CrossTypeDefaultNormalizedRoundTrip) { - // A programmatically-constructed cross-type default (an int literal on a long field) is - // normalized to the field type at construction, so it survives a ToJson/SchemaFromJson - // round-trip as an equal schema (an un-normalized int literal would compare unequal to - // the re-parsed long literal). - Schema original({SchemaField(1, "id", int64(), /*optional=*/false, /*doc=*/{}, - std::make_shared(Literal::Int(34)))}); - const auto& field = original.fields()[0]; + // A cross-type default (an int literal on a long field) built via SchemaField::Make is + // normalized to the field type, so it survives a ToJson/SchemaFromJson round-trip as an + // equal schema (an un-normalized int literal would compare unequal to the re-parsed + // long literal). + ICEBERG_UNWRAP_OR_FAIL( + auto field, SchemaField::Make(1, "id", int64(), /*optional=*/false, /*doc=*/{}, + std::make_shared(Literal::Int(34)))); ASSERT_TRUE(field.initial_default().has_value()); EXPECT_EQ(field.initial_default()->get(), Literal::Long(34)); + Schema original({std::move(field)}); ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(original)); ICEBERG_UNWRAP_OR_FAIL(auto reparsed, SchemaFromJson(json)); EXPECT_EQ(original, *reparsed); diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index e03d1a29c..bcc98999b 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -158,38 +158,39 @@ TEST(SchemaTest, ValidateDoesNotVersionGateWriteDefault) { } TEST(SchemaTest, ValidateRejectsMismatchedDefaultValue) { + // The constructor stores defaults verbatim (normalization happens in + // SchemaField::Make), so a stored default whose type differs from the field type is + // rejected by Validate. iceberg::Schema schema({iceberg::SchemaField( 1, "id", iceberg::int32(), false, /*doc=*/{}, /*initial_default=*/nullptr, std::make_shared(iceberg::Literal::String("oops")))}); - // The default literal is cast to the field type; an uncastable type surfaces the - // original cast error. auto status = schema.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion); - ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kNotSupported)); - EXPECT_THAT(status, iceberg::HasErrorMessage("Cast from String")); + ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); + EXPECT_THAT(status, iceberg::HasErrorMessage("write-default")); } -TEST(SchemaTest, ValidateCastsDefaultToFieldType) { - // Matching Java, the default literal is cast to the field type rather than required to - // match exactly: an int default on a long field is accepted (int -> long). - iceberg::Schema widened({iceberg::SchemaField( - 1, "id", iceberg::int64(), false, /*doc=*/{}, - std::make_shared(iceberg::Literal::Int(34)))}); - EXPECT_THAT(widened.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), - iceberg::IsOk()); - // The stored default is normalized to the field type (long), not kept as the narrower - // int literal, so projection/serialization/equality all observe a long. - const auto& widened_field = widened.fields()[0]; - ASSERT_TRUE(widened_field.initial_default().has_value()); - EXPECT_EQ(widened_field.initial_default()->get(), iceberg::Literal::Long(34)); - - // A default outside the field type's range is rejected (CastTo returns an - // above-max/below-min sentinel). - iceberg::Schema out_of_range({iceberg::SchemaField( - 1, "id", iceberg::int32(), false, /*doc=*/{}, - std::make_shared(iceberg::Literal::Long(5'000'000'000)))}); - EXPECT_THAT(out_of_range.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), +TEST(SchemaTest, MakeNormalizesDefaultToFieldType) { + // Make casts a cross-type default to the field type (int -> long) and stores the + // normalized value, so projection/serialization/equality all observe a long. + ICEBERG_UNWRAP_OR_FAIL( + auto field, iceberg::SchemaField::Make(1, "id", iceberg::int64(), false, /*doc=*/{}, + std::make_shared( + iceberg::Literal::Int(34)))); + ASSERT_TRUE(field.initial_default().has_value()); + EXPECT_EQ(field.initial_default()->get(), iceberg::Literal::Long(34)); + + // A default outside the field type's range is rejected by Make. + EXPECT_THAT(iceberg::SchemaField::Make(1, "id", iceberg::int32(), false, /*doc=*/{}, + std::make_shared( + iceberg::Literal::Long(5'000'000'000))), iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); + + // A default whose type cannot be cast to the field type surfaces the cast error. + EXPECT_THAT(iceberg::SchemaField::Make(1, "id", iceberg::int32(), false, /*doc=*/{}, + std::make_shared( + iceberg::Literal::String("oops"))), + iceberg::IsError(iceberg::ErrorKind::kNotSupported)); } TEST(SchemaTest, ReassignIdsPreservesDefaultValues) { From 41c2bd1bef0d0fcab77bcf20bb9b8e227360e929 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 27 Jun 2026 17:02:45 -0700 Subject: [PATCH 16/22] refactor(schema): consolidate default getters and drop [[nodiscard]] Per review: replace the four default-value getters (optional readers + _ptr sharers) with a single shared_ptr-returning getter per default, and remove the [[nodiscard]] attributes from SchemaField's getters. --- src/iceberg/json_serde.cc | 9 +++--- src/iceberg/schema.cc | 10 +++---- src/iceberg/schema_field.cc | 19 ++---------- src/iceberg/schema_field.h | 44 ++++++++-------------------- src/iceberg/schema_util.cc | 4 +-- src/iceberg/test/schema_json_test.cc | 26 ++++++++-------- src/iceberg/test/schema_test.cc | 12 ++++---- 7 files changed, 44 insertions(+), 80 deletions(-) diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 549173964..614712f30 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -327,12 +327,11 @@ Result ToJson(const SchemaField& field) { if (!field.doc().empty()) { json[kDoc] = field.doc(); } - if (field.initial_default().has_value()) { - ICEBERG_ASSIGN_OR_RAISE(json[kInitialDefault], - ToJson(field.initial_default()->get())); + if (field.initial_default() != nullptr) { + ICEBERG_ASSIGN_OR_RAISE(json[kInitialDefault], ToJson(*field.initial_default())); } - if (field.write_default().has_value()) { - ICEBERG_ASSIGN_OR_RAISE(json[kWriteDefault], ToJson(field.write_default()->get())); + if (field.write_default() != nullptr) { + ICEBERG_ASSIGN_OR_RAISE(json[kWriteDefault], ToJson(*field.write_default())); } return json; } diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index d6347251c..24df69103 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -123,8 +123,8 @@ SchemaField ReassignField(const SchemaField& field, int32_t new_id, ReassignTypeIds(field.type(), get_id, ids_to_reassigned, ids_to_original), field.optional(), std::string(field.doc()), - field.initial_default_ptr(), - field.write_default_ptr()}; + field.initial_default(), + field.write_default()}; } std::vector ReassignIds(std::vector fields, @@ -457,15 +457,15 @@ Status Schema::Validate(int32_t format_version) const { // data files are read (rows written before the column existed materialize this // value), so it requires the v3 reader contract. A write-default only affects // values written going forward and does not reinterpret existing data. - if (field.initial_default().has_value() && + if (field.initial_default() != nullptr && format_version < TableMetadata::kMinFormatVersionDefaultValues) { return InvalidSchema( "Invalid initial default for {}: non-null default ({}) is not supported " "until v{}", - field.name(), field.initial_default()->get(), + field.name(), *field.initial_default(), TableMetadata::kMinFormatVersionDefaultValues); } - if (field.initial_default().has_value() || field.write_default().has_value()) { + if (field.initial_default() != nullptr || field.write_default() != nullptr) { ICEBERG_RETURN_UNEXPECTED(field.Validate()); } } diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index 51d8181f8..30996a309 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -103,26 +103,11 @@ bool SchemaField::optional() const { return optional_; } std::string_view SchemaField::doc() const { return doc_; } -std::optional> SchemaField::initial_default() - const { - if (initial_default_ == nullptr) { - return std::nullopt; - } - return std::cref(*initial_default_); -} - -std::optional> SchemaField::write_default() const { - if (write_default_ == nullptr) { - return std::nullopt; - } - return std::cref(*write_default_); -} - -const std::shared_ptr& SchemaField::initial_default_ptr() const { +const std::shared_ptr& SchemaField::initial_default() const { return initial_default_; } -const std::shared_ptr& SchemaField::write_default_ptr() const { +const std::shared_ptr& SchemaField::write_default() const { return write_default_; } diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index f14659569..c0f68b0b7 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -24,9 +24,7 @@ /// type (e.g. a struct). #include -#include #include -#include #include #include @@ -76,47 +74,29 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { std::shared_ptr write_default = nullptr); /// \brief Get the field ID. - [[nodiscard]] int32_t field_id() const; + int32_t field_id() const; /// \brief Get the field name. - [[nodiscard]] std::string_view name() const; + std::string_view name() const; /// \brief Get the field type. - [[nodiscard]] const std::shared_ptr& type() const; + const std::shared_ptr& type() const; /// \brief Get whether the field is optional. - [[nodiscard]] bool optional() const; + bool optional() const; /// \brief Get the field documentation. std::string_view doc() const; - /// \brief Get the default value for this field used when reading rows written - /// before the field existed (v3 `initial-default`). Empty if absent. - /// - /// The returned reference is a non-owning view into a value owned by this field; - /// it remains valid for the lifetime of this SchemaField. - [[nodiscard]] std::optional> initial_default() - const; - - /// \brief Get the default value for this field used when a writer does not - /// supply a value (v3 `write-default`). Empty if absent. - /// - /// The returned reference is a non-owning view into a value owned by this field; - /// it remains valid for the lifetime of this SchemaField. - [[nodiscard]] std::optional> write_default() - const; - - /// \brief Get the shared owning pointer to the `initial-default` value, or null if - /// absent. Prefer initial_default() for reading; this exists so a rebuilt field can - /// share the (immutable) value rather than copy it. - [[nodiscard]] const std::shared_ptr& initial_default_ptr() const; + /// \brief Get the owning pointer to the default value for this field used when reading + /// rows written before the field existed (v3 `initial-default`), or null if absent. + const std::shared_ptr& initial_default() const; - /// \brief Get the shared owning pointer to the `write-default` value, or null if - /// absent. Prefer write_default() for reading; this exists so a rebuilt field can - /// share the (immutable) value rather than copy it. - [[nodiscard]] const std::shared_ptr& write_default_ptr() const; + /// \brief Get the owning pointer to the default value for this field used when a writer + /// does not supply a value (v3 `write-default`), or null if absent. + const std::shared_ptr& write_default() const; - [[nodiscard]] std::string ToString() const override; + std::string ToString() const override; Status Validate() const; @@ -138,7 +118,7 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { private: /// \brief Compare two fields for equality. - [[nodiscard]] bool Equals(const SchemaField& other) const; + bool Equals(const SchemaField& other) const; int32_t field_id_; std::string name_; diff --git a/src/iceberg/schema_util.cc b/src/iceberg/schema_util.cc index 5ff828258..b9f32346a 100644 --- a/src/iceberg/schema_util.cc +++ b/src/iceberg/schema_util.cc @@ -172,10 +172,10 @@ Result ProjectNested(const Type& expected_type, const Type& sou iter->second.local_index, prune_source)); } else if (MetadataColumns::IsMetadataColumn(field_id)) { child_projection.kind = FieldProjection::Kind::kMetadata; - } else if (expected_field.initial_default().has_value()) { + } else if (expected_field.initial_default() != nullptr) { // Rows written before the field existed assume its `initial-default` value. child_projection.kind = FieldProjection::Kind::kDefault; - child_projection.from = expected_field.initial_default()->get(); + child_projection.from = *expected_field.initial_default(); } else if (expected_field.optional()) { child_projection.kind = FieldProjection::Kind::kNull; } else { diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index 070b4d349..dde161568 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -203,15 +203,15 @@ TEST(SchemaJsonTest, FieldWithDefaultValuesRoundTrip) { ASSERT_EQ(schema->fields().size(), 2); const auto& field1 = schema->fields()[0]; - ASSERT_TRUE(field1.initial_default().has_value()); - ASSERT_EQ(field1.initial_default()->get(), Literal::Int(42)); - ASSERT_TRUE(field1.write_default().has_value()); - ASSERT_EQ(field1.write_default()->get(), Literal::Int(7)); + ASSERT_NE(field1.initial_default(), nullptr); + ASSERT_EQ(*field1.initial_default(), Literal::Int(42)); + ASSERT_NE(field1.write_default(), nullptr); + ASSERT_EQ(*field1.write_default(), Literal::Int(7)); const auto& field2 = schema->fields()[1]; - ASSERT_TRUE(field2.initial_default().has_value()); - ASSERT_EQ(field2.initial_default()->get(), Literal::String("n/a")); - ASSERT_FALSE(field2.write_default().has_value()); + ASSERT_NE(field2.initial_default(), nullptr); + ASSERT_EQ(*field2.initial_default(), Literal::String("n/a")); + ASSERT_EQ(field2.write_default(), nullptr); ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); ASSERT_EQ(schema_json.dump(), json); @@ -232,10 +232,10 @@ TEST(SchemaJsonTest, NestedFieldWithDefaultValuesRoundTrip) { ICEBERG_UNWRAP_OR_FAIL(auto schema, SchemaFromJson(nlohmann::json::parse(json))); const auto& person = schema->fields()[0]; const auto& nested = dynamic_cast(*person.type()).fields()[0]; - ASSERT_TRUE(nested.initial_default().has_value()); - ASSERT_EQ(nested.initial_default()->get(), Literal::Int(18)); - ASSERT_TRUE(nested.write_default().has_value()); - ASSERT_EQ(nested.write_default()->get(), Literal::Int(21)); + ASSERT_NE(nested.initial_default(), nullptr); + ASSERT_EQ(*nested.initial_default(), Literal::Int(18)); + ASSERT_NE(nested.write_default(), nullptr); + ASSERT_EQ(*nested.write_default(), Literal::Int(21)); ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); ASSERT_EQ(schema_json.dump(), json); @@ -249,8 +249,8 @@ TEST(SchemaJsonTest, CrossTypeDefaultNormalizedRoundTrip) { ICEBERG_UNWRAP_OR_FAIL( auto field, SchemaField::Make(1, "id", int64(), /*optional=*/false, /*doc=*/{}, std::make_shared(Literal::Int(34)))); - ASSERT_TRUE(field.initial_default().has_value()); - EXPECT_EQ(field.initial_default()->get(), Literal::Long(34)); + ASSERT_NE(field.initial_default(), nullptr); + EXPECT_EQ(*field.initial_default(), Literal::Long(34)); Schema original({std::move(field)}); ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(original)); diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index bcc98999b..ebdb8c26c 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -177,8 +177,8 @@ TEST(SchemaTest, MakeNormalizesDefaultToFieldType) { auto field, iceberg::SchemaField::Make(1, "id", iceberg::int64(), false, /*doc=*/{}, std::make_shared( iceberg::Literal::Int(34)))); - ASSERT_TRUE(field.initial_default().has_value()); - EXPECT_EQ(field.initial_default()->get(), iceberg::Literal::Long(34)); + ASSERT_NE(field.initial_default(), nullptr); + EXPECT_EQ(*field.initial_default(), iceberg::Literal::Long(34)); // A default outside the field type's range is rejected by Make. EXPECT_THAT(iceberg::SchemaField::Make(1, "id", iceberg::int32(), false, /*doc=*/{}, @@ -209,10 +209,10 @@ TEST(SchemaTest, ReassignIdsPreservesDefaultValues) { ASSERT_EQ(schema.fields().size(), 1); const iceberg::SchemaField& field = schema.fields()[0]; EXPECT_EQ(field.field_id(), 1001); - ASSERT_TRUE(field.initial_default().has_value()); - EXPECT_EQ(field.initial_default()->get(), iceberg::Literal::Int(42)); - ASSERT_TRUE(field.write_default().has_value()); - EXPECT_EQ(field.write_default()->get(), iceberg::Literal::Int(7)); + ASSERT_NE(field.initial_default(), nullptr); + EXPECT_EQ(*field.initial_default(), iceberg::Literal::Int(42)); + ASSERT_NE(field.write_default(), nullptr); + EXPECT_EQ(*field.write_default(), iceberg::Literal::Int(7)); } TEST(SchemaTest, ValidateRejectsInvalidUnknownFields) { From bda2d781c6d4a6e8fd64ff85a6e2e778ee794838 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 27 Jun 2026 17:07:05 -0700 Subject: [PATCH 17/22] fix(schema): model a null default value as absence Per review, a present-null default (Literal::Null) is no longer rejected: it is modeled as the absence of a default (matching Java), and dropped at construction so it is never stored. This also makes two such fields compare equal with no change to Literal::operator<=> (no null Literal is ever stored or compared), so the global comparison semantics used by expression predicates are left untouched. --- src/iceberg/schema_field.cc | 22 +++++++++++++++++----- src/iceberg/test/schema_test.cc | 17 +++++++++++++++++ 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index 30996a309..62590e0aa 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -43,7 +43,8 @@ Result> NormalizeDefault( if (value == nullptr || field_type == nullptr || !field_type->is_primitive()) { return value; } - if (*value->type() == *field_type) { + if (value->IsNull() || *value->type() == *field_type) { + // A null default is modeled as absence (dropped at construction), so do not cast it. return value; } ICEBERG_ASSIGN_OR_RAISE( @@ -56,6 +57,15 @@ Result> NormalizeDefault( return std::make_shared(std::move(cast)); } +// A null default value is modeled as the absence of a default (matching Java), so it is +// not stored. +std::shared_ptr DropNullDefault(std::shared_ptr value) { + if (value != nullptr && value->IsNull()) { + return nullptr; + } + return value; +} + } // namespace SchemaField::SchemaField(int32_t field_id, std::string_view name, @@ -67,8 +77,8 @@ SchemaField::SchemaField(int32_t field_id, std::string_view name, type_(std::move(type)), optional_(optional), doc_(doc), - initial_default_(std::move(initial_default)), - write_default_(std::move(write_default)) {} + initial_default_(DropNullDefault(std::move(initial_default))), + write_default_(DropNullDefault(std::move(write_default))) {} Result SchemaField::Make(int32_t field_id, std::string_view name, std::shared_ptr type, bool optional, @@ -115,8 +125,10 @@ namespace { Status ValidateDefault(const SchemaField& field, const Literal& value, std::string_view kind) { - if (value.IsNull() || value.IsAboveMax() || value.IsBelowMin()) { - return InvalidSchema("Invalid {} value for {}: must be a non-null value", kind, + // A null default is modeled as absence and dropped at construction, so it never reaches + // here; only the out-of-range cast sentinels need rejecting. + if (value.IsAboveMax() || value.IsBelowMin()) { + return InvalidSchema("Invalid {} value for {}: value is out of range", kind, field.name()); } // Defaults are only supported on primitive fields. The spec also permits JSON diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index ebdb8c26c..6f244b800 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -193,6 +193,23 @@ TEST(SchemaTest, MakeNormalizesDefaultToFieldType) { iceberg::IsError(iceberg::ErrorKind::kNotSupported)); } +TEST(SchemaTest, NullDefaultModeledAsAbsence) { + // A present-null default is modeled as the absence of a default (matching Java): it is + // dropped at construction, so the field has no stored default and compares equal to a + // field with no default, and it validates cleanly. + iceberg::SchemaField with_null( + 1, "id", iceberg::int32(), /*optional=*/true, /*doc=*/{}, + std::make_shared(iceberg::Literal::Null(iceberg::int32()))); + EXPECT_EQ(with_null.initial_default(), nullptr); + + iceberg::SchemaField no_default(1, "id", iceberg::int32(), /*optional=*/true); + EXPECT_EQ(with_null, no_default); + + iceberg::Schema schema({with_null}); + EXPECT_THAT(schema.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + iceberg::IsOk()); +} + TEST(SchemaTest, ReassignIdsPreservesDefaultValues) { // Reassigning field IDs rebuilds each SchemaField, so the rebuild must carry the // default values over to the field with the new ID. From 38bbdb01748e722e1f6efd51b4cac6303643face Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 28 Jun 2026 13:42:19 -0700 Subject: [PATCH 18/22] =?UTF-8?q?fix(schema):=20review=20follow-ups=20?= =?UTF-8?q?=E2=80=94=20lenient=20float=20default=20parse,=20must-be-null?= =?UTF-8?q?=20types,=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - LiteralFromJson accepts an integer JSON token for float/double defaults (Java writes a float default as a bare integer), so Java-produced v3 metadata parses in C++. - ValidateDefault rejects a non-null default on unknown/variant/geometry/geography fields (the spec requires those to default to null). - Tests: full TableMetadata-level v3 default round-trip; non-primitive + must-be-null default rejection; SchemaField equality distinguishes default values; present-null write-default dropped to absence; integer-encoded float default parses. --- src/iceberg/expression/json_serde.cc | 6 +++-- src/iceberg/schema_field.cc | 26 ++++++++++++++---- src/iceberg/test/metadata_serde_test.cc | 20 ++++++++++++++ src/iceberg/test/schema_json_test.cc | 11 ++++++++ src/iceberg/test/schema_test.cc | 35 +++++++++++++++++++++++++ 5 files changed, 91 insertions(+), 7 deletions(-) diff --git a/src/iceberg/expression/json_serde.cc b/src/iceberg/expression/json_serde.cc index df8aba88f..708a2654c 100644 --- a/src/iceberg/expression/json_serde.cc +++ b/src/iceberg/expression/json_serde.cc @@ -342,13 +342,15 @@ Result LiteralFromJson(const nlohmann::json& json, const Type* type) { return Literal::Long(json.get()); case TypeId::kFloat: - if (!json.is_number_float()) [[unlikely]] { + // Accept an integer JSON token too (e.g. `1`), matching Java's lenient numeric + // parsing; otherwise a float default written as a bare integer fails to read. + if (!json.is_number()) [[unlikely]] { return JsonParseError("Cannot parse {} as a float value", SafeDumpJson(json)); } return Literal::Float(json.get()); case TypeId::kDouble: - if (!json.is_number_float()) [[unlikely]] { + if (!json.is_number()) [[unlikely]] { return JsonParseError("Cannot parse {} as a double value", SafeDumpJson(json)); } return Literal::Double(json.get()); diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index 62590e0aa..f15850725 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -131,11 +131,27 @@ Status ValidateDefault(const SchemaField& field, const Literal& value, return InvalidSchema("Invalid {} value for {}: value is out of range", kind, field.name()); } - // Defaults are only supported on primitive fields. The spec also permits JSON - // single-value defaults for struct/list/map (e.g. an empty struct `{}` whose - // sub-field defaults live in field metadata); that matches the current Java model's - // gap and is left as a follow-up. - if (field.type() == nullptr || !field.type()->is_primitive()) { + if (field.type() == nullptr) { + return InvalidSchema("Invalid {} value for {}: field has no type", kind, + field.name()); + } + // The spec requires unknown/variant/geometry/geography columns to default to null, so a + // non-null default on them is invalid (a null default was already dropped as absence). + switch (field.type()->type_id()) { + case TypeId::kUnknown: + case TypeId::kVariant: + case TypeId::kGeometry: + case TypeId::kGeography: + return InvalidSchema("Invalid {} value for {}: {} fields must default to null", + kind, field.name(), *field.type()); + default: + break; + } + // Defaults are otherwise only supported on primitive fields. The spec also permits JSON + // single-value defaults for struct/list/map (e.g. an empty struct `{}` whose sub-field + // defaults live in field metadata); that matches the current Java model's gap and is + // left as a follow-up. + if (!field.type()->is_primitive()) { return InvalidSchema( "Invalid {} value for {}: default values are only supported for primitive types", kind, field.name()); diff --git a/src/iceberg/test/metadata_serde_test.cc b/src/iceberg/test/metadata_serde_test.cc index e214f1606..10529d128 100644 --- a/src/iceberg/test/metadata_serde_test.cc +++ b/src/iceberg/test/metadata_serde_test.cc @@ -23,6 +23,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/partition_field.h" #include "iceberg/partition_spec.h" @@ -445,6 +446,25 @@ TEST(MetadataSerdeTest, DeserializePartitionStatisticsFiles) { ASSERT_EQ(*metadata, expected); } +TEST(MetadataSerdeTest, V3DefaultValuesRoundTrip) { + // Full TableMetadata path: a v3 schema field's initial/write defaults parse correctly + // (the v3 gate in Schema::Validate is satisfied) and survive a ToJson/FromJson round + // trip. The fixture's field is `x: long` with initial-default 1 and write-default 1. + ICEBERG_UNWRAP_OR_FAIL( + auto metadata, ReadTableMetadataFromResource("TableMetadataV3ValidMinimal.json")); + auto schema_result = metadata->Schema(); + ASSERT_TRUE(schema_result.has_value()); + const auto& field = schema_result.value()->fields()[0]; + ASSERT_NE(field.initial_default(), nullptr); + EXPECT_EQ(*field.initial_default(), Literal::Long(1)); + ASSERT_NE(field.write_default(), nullptr); + EXPECT_EQ(*field.write_default(), Literal::Long(1)); + + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(*metadata)); + ICEBERG_UNWRAP_OR_FAIL(auto reparsed, TableMetadataFromJson(json)); + EXPECT_EQ(*reparsed, *metadata); +} + TEST(MetadataSerdeTest, DeserializeUnsupportedVersion) { ReadTableMetadataExpectError("TableMetadataUnsupportedVersion.json", "Cannot read unsupported version"); diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index dde161568..132e0fa64 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -258,6 +258,17 @@ TEST(SchemaJsonTest, CrossTypeDefaultNormalizedRoundTrip) { EXPECT_EQ(original, *reparsed); } +TEST(SchemaJsonTest, FloatDefaultAcceptsIntegerEncoding) { + // Java writes a float/double default as a bare integer token (e.g. 1, not 1.0); it must + // parse, matching Java's lenient numeric reading. + constexpr std::string_view json = + R"({"fields":[{"id":1,"initial-default":1,"name":"f","required":true,"type":"float"}],"schema-id":1,"type":"struct"})"; + ICEBERG_UNWRAP_OR_FAIL(auto schema, SchemaFromJson(nlohmann::json::parse(json))); + const auto& field = schema->fields()[0]; + ASSERT_NE(field.initial_default(), nullptr); + EXPECT_EQ(*field.initial_default(), Literal::Float(1.0F)); +} + TEST(SchemaJsonTest, UnknownFieldRoundTrip) { constexpr std::string_view json = R"({"fields":[{"id":1,"name":"mystery","required":false,"type":"unknown"}],"schema-id":1,"type":"struct"})"; diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index 6f244b800..eb2b82ee6 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -199,8 +199,10 @@ TEST(SchemaTest, NullDefaultModeledAsAbsence) { // field with no default, and it validates cleanly. iceberg::SchemaField with_null( 1, "id", iceberg::int32(), /*optional=*/true, /*doc=*/{}, + std::make_shared(iceberg::Literal::Null(iceberg::int32())), std::make_shared(iceberg::Literal::Null(iceberg::int32()))); EXPECT_EQ(with_null.initial_default(), nullptr); + EXPECT_EQ(with_null.write_default(), nullptr); iceberg::SchemaField no_default(1, "id", iceberg::int32(), /*optional=*/true); EXPECT_EQ(with_null, no_default); @@ -210,6 +212,39 @@ TEST(SchemaTest, NullDefaultModeledAsAbsence) { iceberg::IsOk()); } +TEST(SchemaTest, EqualsDistinguishesDefaultValues) { + auto field = [](std::shared_ptr d) { + return iceberg::SchemaField(1, "id", iceberg::int32(), /*optional=*/true, /*doc=*/{}, + std::move(d)); + }; + // Differ only in default value -> unequal; default vs no-default -> unequal. + EXPECT_NE(field(std::make_shared(iceberg::Literal::Int(1))), + field(std::make_shared(iceberg::Literal::Int(2)))); + EXPECT_NE(field(std::make_shared(iceberg::Literal::Int(1))), + field(nullptr)); +} + +TEST(SchemaTest, ValidateRejectsDefaultOnNonPrimitiveAndMustBeNullTypes) { + // A struct (non-primitive) field with a non-null default is rejected. + iceberg::Schema struct_default({iceberg::SchemaField( + 1, "s", + MakeStructType(iceberg::SchemaField(2, "x", iceberg::int32(), /*optional=*/true)), + /*optional=*/true, /*doc=*/{}, + std::make_shared(iceberg::Literal::Int(1)))}); + EXPECT_THAT( + struct_default.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); + + // unknown/geometry/geography must default to null: a non-null default is rejected. + iceberg::Schema geo_default({iceberg::SchemaField( + 1, "g", iceberg::geometry(), /*optional=*/true, /*doc=*/{}, + std::make_shared(iceberg::Literal::Int(1)))}); + auto status = + geo_default.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion); + ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); + EXPECT_THAT(status, iceberg::HasErrorMessage("must default to null")); +} + TEST(SchemaTest, ReassignIdsPreservesDefaultValues) { // Reassigning field IDs rebuilds each SchemaField, so the rebuild must carry the // default values over to the field with the new ID. From 6fe4640125a5eb6a70ef669133d5872348970848 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 28 Jun 2026 13:50:40 -0700 Subject: [PATCH 19/22] revert: leave expression/json_serde.cc untouched (out of PR scope) Revert the float/double LiteralFromJson leniency change and its test: that is the pre-existing shared literal parser, not code this PR introduces. The integer-encoded float-default interop gap (Java writes float defaults as bare ints) is a separate follow-up in a PR that owns the expression serde. --- src/iceberg/expression/json_serde.cc | 6 ++---- src/iceberg/test/schema_json_test.cc | 11 ----------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/src/iceberg/expression/json_serde.cc b/src/iceberg/expression/json_serde.cc index 708a2654c..df8aba88f 100644 --- a/src/iceberg/expression/json_serde.cc +++ b/src/iceberg/expression/json_serde.cc @@ -342,15 +342,13 @@ Result LiteralFromJson(const nlohmann::json& json, const Type* type) { return Literal::Long(json.get()); case TypeId::kFloat: - // Accept an integer JSON token too (e.g. `1`), matching Java's lenient numeric - // parsing; otherwise a float default written as a bare integer fails to read. - if (!json.is_number()) [[unlikely]] { + if (!json.is_number_float()) [[unlikely]] { return JsonParseError("Cannot parse {} as a float value", SafeDumpJson(json)); } return Literal::Float(json.get()); case TypeId::kDouble: - if (!json.is_number()) [[unlikely]] { + if (!json.is_number_float()) [[unlikely]] { return JsonParseError("Cannot parse {} as a double value", SafeDumpJson(json)); } return Literal::Double(json.get()); diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index 132e0fa64..dde161568 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -258,17 +258,6 @@ TEST(SchemaJsonTest, CrossTypeDefaultNormalizedRoundTrip) { EXPECT_EQ(original, *reparsed); } -TEST(SchemaJsonTest, FloatDefaultAcceptsIntegerEncoding) { - // Java writes a float/double default as a bare integer token (e.g. 1, not 1.0); it must - // parse, matching Java's lenient numeric reading. - constexpr std::string_view json = - R"({"fields":[{"id":1,"initial-default":1,"name":"f","required":true,"type":"float"}],"schema-id":1,"type":"struct"})"; - ICEBERG_UNWRAP_OR_FAIL(auto schema, SchemaFromJson(nlohmann::json::parse(json))); - const auto& field = schema->fields()[0]; - ASSERT_NE(field.initial_default(), nullptr); - EXPECT_EQ(*field.initial_default(), Literal::Float(1.0F)); -} - TEST(SchemaJsonTest, UnknownFieldRoundTrip) { constexpr std::string_view json = R"({"fields":[{"id":1,"name":"mystery","required":false,"type":"unknown"}],"schema-id":1,"type":"struct"})"; From 2efca238b7eea298cefa94f0d715d6ee895b531c Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 28 Jun 2026 14:54:45 -0700 Subject: [PATCH 20/22] revert: keep [[nodiscard]] on pre-existing SchemaField methods Per scope feedback, only drop [[nodiscard]] from this PR's own additions (the default getters). Restore it on the pre-existing field_id/name/type/optional/ToString/Equals; the broader [[nodiscard]] cleanup belongs in a separate PR. --- src/iceberg/schema_field.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index c0f68b0b7..6f779d298 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -74,16 +74,16 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { std::shared_ptr write_default = nullptr); /// \brief Get the field ID. - int32_t field_id() const; + [[nodiscard]] int32_t field_id() const; /// \brief Get the field name. - std::string_view name() const; + [[nodiscard]] std::string_view name() const; /// \brief Get the field type. - const std::shared_ptr& type() const; + [[nodiscard]] const std::shared_ptr& type() const; /// \brief Get whether the field is optional. - bool optional() const; + [[nodiscard]] bool optional() const; /// \brief Get the field documentation. std::string_view doc() const; @@ -96,7 +96,7 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { /// does not supply a value (v3 `write-default`), or null if absent. const std::shared_ptr& write_default() const; - std::string ToString() const override; + [[nodiscard]] std::string ToString() const override; Status Validate() const; @@ -118,7 +118,7 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { private: /// \brief Compare two fields for equality. - bool Equals(const SchemaField& other) const; + [[nodiscard]] bool Equals(const SchemaField& other) const; int32_t field_id_; std::string name_; From 70e029baa0672fb408d78376775a46f7db3f1ef0 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 28 Jun 2026 19:55:42 -0700 Subject: [PATCH 21/22] feat(schema): clarify error for types that cannot have a default value The unknown/variant/geometry/geography branch reported "{type} fields must default to null", which is ambiguous for the unknown type (reads as "unrecognized fields") and awkwardly interpolates a parametrized type (e.g. geometry(srid:...)) into a categorical sentence. Reword to "type {} cannot have a default value" and align with the sibling "Invalid {} value for {}: ..." messages. --- src/iceberg/schema_field.cc | 2 +- src/iceberg/test/schema_test.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index f15850725..ce5a1b629 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -142,7 +142,7 @@ Status ValidateDefault(const SchemaField& field, const Literal& value, case TypeId::kVariant: case TypeId::kGeometry: case TypeId::kGeography: - return InvalidSchema("Invalid {} value for {}: {} fields must default to null", + return InvalidSchema("Invalid {} value for {}: type {} cannot have a default value", kind, field.name(), *field.type()); default: break; diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index eb2b82ee6..c9a21caf6 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -242,7 +242,7 @@ TEST(SchemaTest, ValidateRejectsDefaultOnNonPrimitiveAndMustBeNullTypes) { auto status = geo_default.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion); ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); - EXPECT_THAT(status, iceberg::HasErrorMessage("must default to null")); + EXPECT_THAT(status, iceberg::HasErrorMessage("cannot have a default value")); } TEST(SchemaTest, ReassignIdsPreservesDefaultValues) { From d0229534a74e48d5c6c158804f2ac24d3d0a1a71 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 28 Jun 2026 20:11:13 -0700 Subject: [PATCH 22/22] refactor(schema): drop unused default-value cast machinery SchemaField::Make and the file-local NormalizeDefault cast a cross-type default (e.g. an int literal on a long field) to the field type, but they have no production callers: defaults arrive already at the field type (FieldFromJson parses with the field type), and ValidateDefault's exact type match already rejects any cross-type default. The cast was therefore dead in production and, as wgtmac noted in review, allocated a new Literal even in the same-type case. Remove Make and NormalizeDefault (and their tests). The constructor stores defaults verbatim and Validate rejects type mismatches. The cast belongs in the follow-up that adds a real caller (e.g. schema evolution), where Java performs it in the NestedField constructor (castDefault). --- src/iceberg/schema_field.cc | 43 ++-------------------------- src/iceberg/schema_field.h | 11 ------- src/iceberg/test/schema_json_test.cc | 17 ----------- src/iceberg/test/schema_test.cc | 26 +---------------- 4 files changed, 3 insertions(+), 94 deletions(-) diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index ce5a1b629..6c8d10d97 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -25,7 +25,6 @@ #include "iceberg/expression/literal.h" #include "iceberg/type.h" -#include "iceberg/util/checked_cast.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" @@ -33,30 +32,6 @@ namespace iceberg { namespace { -// Normalizes a default value to the field type so the stored value always matches the -// field. A cross-type default (e.g. an int literal on a long field) is cast to the field -// type; a value that already matches the field type, or a null, is returned unchanged. -// The cast error is propagated and an out-of-range value is rejected rather than -// swallowed. -Result> NormalizeDefault( - std::shared_ptr value, const std::shared_ptr& field_type) { - if (value == nullptr || field_type == nullptr || !field_type->is_primitive()) { - return value; - } - if (value->IsNull() || *value->type() == *field_type) { - // A null default is modeled as absence (dropped at construction), so do not cast it. - return value; - } - ICEBERG_ASSIGN_OR_RAISE( - auto cast, - value->CastTo(internal::checked_pointer_cast(field_type))); - if (cast.IsAboveMax() || cast.IsBelowMin()) { - return InvalidSchema("default value of type {} is out of range for {}", - *value->type(), *field_type); - } - return std::make_shared(std::move(cast)); -} - // A null default value is modeled as the absence of a default (matching Java), so it is // not stored. std::shared_ptr DropNullDefault(std::shared_ptr value) { @@ -80,19 +55,6 @@ SchemaField::SchemaField(int32_t field_id, std::string_view name, initial_default_(DropNullDefault(std::move(initial_default))), write_default_(DropNullDefault(std::move(write_default))) {} -Result SchemaField::Make(int32_t field_id, std::string_view name, - std::shared_ptr type, bool optional, - std::string_view doc, - std::shared_ptr initial_default, - std::shared_ptr write_default) { - ICEBERG_ASSIGN_OR_RAISE(initial_default, - NormalizeDefault(std::move(initial_default), type)); - ICEBERG_ASSIGN_OR_RAISE(write_default, - NormalizeDefault(std::move(write_default), type)); - return SchemaField(field_id, name, std::move(type), optional, doc, - std::move(initial_default), std::move(write_default)); -} - SchemaField SchemaField::MakeOptional(int32_t field_id, std::string_view name, std::shared_ptr type, std::string_view doc) { return {field_id, name, std::move(type), true, doc}; @@ -156,9 +118,8 @@ Status ValidateDefault(const SchemaField& field, const Literal& value, "Invalid {} value for {}: default values are only supported for primitive types", kind, field.name()); } - // A default is normalized to the field type by SchemaField::Make (which casts a - // cross-type default and reports any cast error), so a stored default whose type does - // not match the field type is invalid here. + // Defaults are stored verbatim (no implicit cast), so a default whose literal type does + // not match the field type is invalid. if (*value.type() != *field.type()) { return InvalidSchema("{} of field {} has type {} but expected {}", kind, field.name(), *value.type(), *field.type()); diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index 6f779d298..8066a6406 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -62,17 +62,6 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { static SchemaField MakeRequired(int32_t field_id, std::string_view name, std::shared_ptr type, std::string_view doc = {}); - /// \brief Construct a field, normalizing the default values to the field type. - /// - /// Unlike the constructor (which stores the defaults verbatim), this casts a default - /// whose literal type differs from the field type to the field type — e.g. an int - /// default on a long field — matching the spec/Java cast behavior. Returns an error if - /// a default cannot be cast to the field type or falls outside its range. - static Result Make( - int32_t field_id, std::string_view name, std::shared_ptr type, bool optional, - std::string_view doc = {}, std::shared_ptr initial_default = nullptr, - std::shared_ptr write_default = nullptr); - /// \brief Get the field ID. [[nodiscard]] int32_t field_id() const; diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index dde161568..c946550cc 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -241,23 +241,6 @@ TEST(SchemaJsonTest, NestedFieldWithDefaultValuesRoundTrip) { ASSERT_EQ(schema_json.dump(), json); } -TEST(SchemaJsonTest, CrossTypeDefaultNormalizedRoundTrip) { - // A cross-type default (an int literal on a long field) built via SchemaField::Make is - // normalized to the field type, so it survives a ToJson/SchemaFromJson round-trip as an - // equal schema (an un-normalized int literal would compare unequal to the re-parsed - // long literal). - ICEBERG_UNWRAP_OR_FAIL( - auto field, SchemaField::Make(1, "id", int64(), /*optional=*/false, /*doc=*/{}, - std::make_shared(Literal::Int(34)))); - ASSERT_NE(field.initial_default(), nullptr); - EXPECT_EQ(*field.initial_default(), Literal::Long(34)); - - Schema original({std::move(field)}); - ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(original)); - ICEBERG_UNWRAP_OR_FAIL(auto reparsed, SchemaFromJson(json)); - EXPECT_EQ(original, *reparsed); -} - TEST(SchemaJsonTest, UnknownFieldRoundTrip) { constexpr std::string_view json = R"({"fields":[{"id":1,"name":"mystery","required":false,"type":"unknown"}],"schema-id":1,"type":"struct"})"; diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index c9a21caf6..90cfdb13c 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -158,8 +158,7 @@ TEST(SchemaTest, ValidateDoesNotVersionGateWriteDefault) { } TEST(SchemaTest, ValidateRejectsMismatchedDefaultValue) { - // The constructor stores defaults verbatim (normalization happens in - // SchemaField::Make), so a stored default whose type differs from the field type is + // Defaults are stored verbatim, so a default whose type differs from the field type is // rejected by Validate. iceberg::Schema schema({iceberg::SchemaField( 1, "id", iceberg::int32(), false, /*doc=*/{}, /*initial_default=*/nullptr, @@ -170,29 +169,6 @@ TEST(SchemaTest, ValidateRejectsMismatchedDefaultValue) { EXPECT_THAT(status, iceberg::HasErrorMessage("write-default")); } -TEST(SchemaTest, MakeNormalizesDefaultToFieldType) { - // Make casts a cross-type default to the field type (int -> long) and stores the - // normalized value, so projection/serialization/equality all observe a long. - ICEBERG_UNWRAP_OR_FAIL( - auto field, iceberg::SchemaField::Make(1, "id", iceberg::int64(), false, /*doc=*/{}, - std::make_shared( - iceberg::Literal::Int(34)))); - ASSERT_NE(field.initial_default(), nullptr); - EXPECT_EQ(*field.initial_default(), iceberg::Literal::Long(34)); - - // A default outside the field type's range is rejected by Make. - EXPECT_THAT(iceberg::SchemaField::Make(1, "id", iceberg::int32(), false, /*doc=*/{}, - std::make_shared( - iceberg::Literal::Long(5'000'000'000))), - iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); - - // A default whose type cannot be cast to the field type surfaces the cast error. - EXPECT_THAT(iceberg::SchemaField::Make(1, "id", iceberg::int32(), false, /*doc=*/{}, - std::make_shared( - iceberg::Literal::String("oops"))), - iceberg::IsError(iceberg::ErrorKind::kNotSupported)); -} - TEST(SchemaTest, NullDefaultModeledAsAbsence) { // A present-null default is modeled as the absence of a default (matching Java): it is // dropped at construction, so the field has no stored default and compares equal to a