diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index c9b320ffe..614712f30 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -27,6 +27,8 @@ #include #include "iceberg/constants.h" +#include "iceberg/expression/json_serde_internal.h" +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/name_mapping.h" #include "iceberg/partition_field.h" @@ -49,6 +51,7 @@ #include "iceberg/util/json_util_internal.h" #include "iceberg/util/macros.h" #include "iceberg/util/string_util.h" +#include "iceberg/util/temporal_util.h" #include "iceberg/util/timepoint.h" namespace iceberg { @@ -324,6 +327,12 @@ Result ToJson(const SchemaField& field) { if (!field.doc().empty()) { json[kDoc] = field.doc(); } + if (field.initial_default() != nullptr) { + ICEBERG_ASSIGN_OR_RAISE(json[kInitialDefault], ToJson(*field.initial_default())); + } + if (field.write_default() != nullptr) { + ICEBERG_ASSIGN_OR_RAISE(json[kWriteDefault], ToJson(*field.write_default())); + } return json; } @@ -337,7 +346,6 @@ Result ToJson(const Type& type) { for (const auto& field : struct_type.fields()) { ICEBERG_ASSIGN_OR_RAISE(auto field_json, ToJson(field)); fields_json.push_back(std::move(field_json)); - // TODO(gangwu): add default values } json[kFields] = fields_json; return json; @@ -628,6 +636,34 @@ Result> TypeFromJson(const nlohmann::json& json) { } } +namespace { + +// The spec's JSON single-value form for `timestamptz` / `timestamptz_ns` default +// values requires a UTC offset. The shared timestamp parser accepts any offset and +// silently normalizes to UTC, which would let C++ accept default metadata that Java +// rejects and then rewrite the offset on serialization. Enforce UTC for these +// defaults at parse time, where the original offset is still visible. 
+Status ValidateTimestamptzDefaultIsUtc(const Type& type, const nlohmann::json& value) { + const auto type_id = type.type_id(); + if (type_id != TypeId::kTimestampTz && type_id != TypeId::kTimestampTzNs) { + return {}; + } + if (!value.is_string()) { + return JsonParseError("Invalid timestamptz default {} for {}: expected a string", + SafeDumpJson(value), type.ToString()); + } + const auto str = value.get(); + ICEBERG_ASSIGN_OR_RAISE(bool is_utc, TemporalUtils::IsUtcOffset(str)); + if (!is_utc) { + return JsonParseError( + "Invalid timestamptz default '{}' for {}: default values must use a UTC offset", + str, type.ToString()); + } + return {}; +} + +} // namespace + Result> FieldFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE( auto type, GetJsonValue(json, kType).and_then(TypeFromJson)); @@ -635,9 +671,31 @@ Result> FieldFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE(auto name, GetJsonValue(json, kName)); ICEBERG_ASSIGN_OR_RAISE(auto required, GetJsonValue(json, kRequired)); ICEBERG_ASSIGN_OR_RAISE(auto doc, GetJsonValueOrDefault(json, kDoc)); + ICEBERG_ASSIGN_OR_RAISE(auto initial_default_json, + GetJsonValueOptional(json, kInitialDefault)); + ICEBERG_ASSIGN_OR_RAISE(auto write_default_json, + GetJsonValueOptional(json, kWriteDefault)); + + std::shared_ptr initial_default; + if (initial_default_json.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + ValidateTimestamptzDefaultIsUtc(*type, *initial_default_json)); + ICEBERG_ASSIGN_OR_RAISE(Literal literal, + LiteralFromJson(*initial_default_json, type.get())); + initial_default = std::make_shared(std::move(literal)); + } + std::shared_ptr write_default; + if (write_default_json.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + ValidateTimestamptzDefaultIsUtc(*type, *write_default_json)); + ICEBERG_ASSIGN_OR_RAISE(Literal literal, + LiteralFromJson(*write_default_json, type.get())); + write_default = std::make_shared(std::move(literal)); + } return std::make_unique(field_id, std::move(name), 
std::move(type), - !required, doc); + !required, doc, std::move(initial_default), + std::move(write_default)); } Result> SchemaFromJson(const nlohmann::json& json) { diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index fcac43c78..24df69103 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -116,9 +116,15 @@ std::shared_ptr ReassignTypeIds(const std::shared_ptr& type, SchemaField ReassignField(const SchemaField& field, int32_t new_id, const Schema::GetId& get_id, Schema::IdMap& ids_to_reassigned, Schema::IdMap& ids_to_original) { - return {new_id, std::string(field.name()), + // Reassigning IDs only rewrites the field ID and nested type IDs; share the field's + // (immutable) default values rather than copying them. + return {new_id, + std::string(field.name()), ReassignTypeIds(field.type(), get_id, ids_to_reassigned, ids_to_original), - field.optional(), std::string(field.doc())}; + field.optional(), + std::string(field.doc()), + field.initial_default(), + field.write_default()}; } std::vector ReassignIds(std::vector fields, @@ -447,7 +453,21 @@ Status Schema::Validate(int32_t format_version) const { } } - // TODO(GuoTao.yu): Check default values when they are supported + // Only the initial-default is gated on format version: it changes how existing + // data files are read (rows written before the column existed materialize this + // value), so it requires the v3 reader contract. A write-default only affects + // values written going forward and does not reinterpret existing data. 
+ if (field.initial_default() != nullptr && + format_version < TableMetadata::kMinFormatVersionDefaultValues) { + return InvalidSchema( + "Invalid initial default for {}: non-null default ({}) is not supported " + "until v{}", + field.name(), *field.initial_default(), + TableMetadata::kMinFormatVersionDefaultValues); + } + if (field.initial_default() != nullptr || field.write_default() != nullptr) { + ICEBERG_RETURN_UNEXPECTED(field.Validate()); + } } return {}; diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index 206915ec2..6c8d10d97 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -21,19 +21,39 @@ #include #include +#include +#include "iceberg/expression/literal.h" #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep +#include "iceberg/util/macros.h" namespace iceberg { +namespace { + +// A null default value is modeled as the absence of a default (matching Java), so it is +// not stored. +std::shared_ptr DropNullDefault(std::shared_ptr value) { + if (value != nullptr && value->IsNull()) { + return nullptr; + } + return value; +} + +} // namespace + SchemaField::SchemaField(int32_t field_id, std::string_view name, - std::shared_ptr type, bool optional, std::string_view doc) + std::shared_ptr type, bool optional, std::string_view doc, + std::shared_ptr initial_default, + std::shared_ptr write_default) : field_id_(field_id), name_(name), type_(std::move(type)), optional_(optional), - doc_(doc) {} + doc_(doc), + initial_default_(DropNullDefault(std::move(initial_default))), + write_default_(DropNullDefault(std::move(write_default))) {} SchemaField SchemaField::MakeOptional(int32_t field_id, std::string_view name, std::shared_ptr type, std::string_view doc) { @@ -55,6 +75,60 @@ bool SchemaField::optional() const { return optional_; } std::string_view SchemaField::doc() const { return doc_; } +const std::shared_ptr& SchemaField::initial_default() const { + return initial_default_; +} + 
+const std::shared_ptr& SchemaField::write_default() const { + return write_default_; +} + +namespace { + +Status ValidateDefault(const SchemaField& field, const Literal& value, + std::string_view kind) { + // A null default is modeled as absence and dropped at construction, so it never reaches + // here; only the out-of-range cast sentinels need rejecting. + if (value.IsAboveMax() || value.IsBelowMin()) { + return InvalidSchema("Invalid {} value for {}: value is out of range", kind, + field.name()); + } + if (field.type() == nullptr) { + return InvalidSchema("Invalid {} value for {}: field has no type", kind, + field.name()); + } + // The spec requires unknown/variant/geometry/geography columns to default to null, so a + // non-null default on them is invalid (a null default was already dropped as absence). + switch (field.type()->type_id()) { + case TypeId::kUnknown: + case TypeId::kVariant: + case TypeId::kGeometry: + case TypeId::kGeography: + return InvalidSchema("Invalid {} value for {}: type {} cannot have a default value", + kind, field.name(), *field.type()); + default: + break; + } + // Defaults are otherwise only supported on primitive fields. The spec also permits JSON + // single-value defaults for struct/list/map (e.g. an empty struct `{}` whose sub-field + // defaults live in field metadata); that matches the current Java model's gap and is + // left as a follow-up. + if (!field.type()->is_primitive()) { + return InvalidSchema( + "Invalid {} value for {}: default values are only supported for primitive types", + kind, field.name()); + } + // Defaults are stored verbatim (no implicit cast), so a default whose literal type does + // not match the field type is invalid. 
+ if (*value.type() != *field.type()) { + return InvalidSchema("{} of field {} has type {} but expected {}", kind, field.name(), + *value.type(), *field.type()); + } + return {}; +} + +} // namespace + Status SchemaField::Validate() const { if (name_.empty()) [[unlikely]] { return InvalidSchema("SchemaField cannot have empty name"); @@ -62,6 +136,13 @@ Status SchemaField::Validate() const { if (type_ == nullptr) [[unlikely]] { return InvalidSchema("SchemaField cannot have null type"); } + if (initial_default_ != nullptr) { + ICEBERG_RETURN_UNEXPECTED( + ValidateDefault(*this, *initial_default_, "initial-default")); + } + if (write_default_ != nullptr) { + ICEBERG_RETURN_UNEXPECTED(ValidateDefault(*this, *write_default_, "write-default")); + } return {}; } @@ -72,9 +153,23 @@ std::string SchemaField::ToString() const { return result; } +namespace { + +bool DefaultEquals(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) { + if (lhs == nullptr || rhs == nullptr) { + return lhs == rhs; + } + return *lhs == *rhs; +} + +} // namespace + bool SchemaField::Equals(const SchemaField& other) const { return field_id_ == other.field_id_ && name_ == other.name_ && *type_ == *other.type_ && - optional_ == other.optional_; + optional_ == other.optional_ && + DefaultEquals(initial_default_, other.initial_default_) && + DefaultEquals(write_default_, other.write_default_); } } // namespace iceberg diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index fd20226a5..8066a6406 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -46,8 +46,14 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { /// \param[in] type The field type. /// \param[in] optional Whether values of this field are required or nullable. /// \param[in] doc Optional documentation string for the field. + /// \param[in] initial_default The v3 `initial-default` value, or null if absent. The + /// field shares ownership of the (immutable) value. 
+ /// \param[in] write_default The v3 `write-default` value, or null if absent. The field + /// shares ownership of the (immutable) value. SchemaField(int32_t field_id, std::string_view name, std::shared_ptr type, - bool optional, std::string_view doc = {}); + bool optional, std::string_view doc = {}, + std::shared_ptr initial_default = nullptr, + std::shared_ptr write_default = nullptr); /// \brief Construct an optional (nullable) field. static SchemaField MakeOptional(int32_t field_id, std::string_view name, @@ -71,6 +77,14 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { /// \brief Get the field documentation. std::string_view doc() const; + /// \brief Get the owning pointer to the default value for this field used when reading + /// rows written before the field existed (v3 `initial-default`), or null if absent. + const std::shared_ptr& initial_default() const; + + /// \brief Get the owning pointer to the default value for this field used when a writer + /// does not supply a value (v3 `write-default`), or null if absent. + const std::shared_ptr& write_default() const; + [[nodiscard]] std::string ToString() const override; Status Validate() const; @@ -100,6 +114,9 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { std::shared_ptr type_; bool optional_; std::string doc_; + // Immutable default values, shared (not deep-copied) across field copies, like `type_`. 
+ std::shared_ptr initial_default_; + std::shared_ptr write_default_; }; } // namespace iceberg diff --git a/src/iceberg/schema_util.cc b/src/iceberg/schema_util.cc index 4ff678fc6..b9f32346a 100644 --- a/src/iceberg/schema_util.cc +++ b/src/iceberg/schema_util.cc @@ -172,10 +172,14 @@ Result ProjectNested(const Type& expected_type, const Type& sou iter->second.local_index, prune_source)); } else if (MetadataColumns::IsMetadataColumn(field_id)) { child_projection.kind = FieldProjection::Kind::kMetadata; + } else if (expected_field.initial_default() != nullptr) { + // Rows written before the field existed assume its `initial-default` value. + child_projection.kind = FieldProjection::Kind::kDefault; + child_projection.from = *expected_field.initial_default(); } else if (expected_field.optional()) { child_projection.kind = FieldProjection::Kind::kNull; } else { - // TODO(gangwu): support default value for v3 and constant value + // TODO(gangwu): support constant value return InvalidSchema("Missing required field: {}", expected_field.ToString()); } result.children.emplace_back(std::move(child_projection)); diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index 562471608..c97ed64a6 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -23,10 +23,12 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/name_mapping.h" #include "iceberg/partition_spec.h" #include "iceberg/schema.h" +#include "iceberg/schema_field.h" #include "iceberg/snapshot.h" #include "iceberg/sort_field.h" #include "iceberg/sort_order.h" @@ -35,10 +37,12 @@ #include "iceberg/table_update.h" #include "iceberg/test/matchers.h" #include "iceberg/transform.h" +#include "iceberg/type.h" #include "iceberg/util/base64.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" // IWYU pragma: keep #include "iceberg/util/timepoint.h" 
+#include "iceberg/util/uuid.h" namespace iceberg { @@ -72,6 +76,11 @@ Result> FromJsonHelper(const nlohmann::json& json) return NameMappingFromJson(json); } +template <> +Result> FromJsonHelper(const nlohmann::json& json) { + return FieldFromJson(json); +} + // Helper function to reduce duplication in testing template void TestJsonConversion(const T& obj, const nlohmann::json& expected_json) { @@ -84,8 +93,102 @@ void TestJsonConversion(const T& obj, const nlohmann::json& expected_json) { EXPECT_EQ(obj, *obj_ex.value()) << "Deserialized object mismatch."; } +// ToJson(SchemaField) returns Result, so it cannot use the shared +// TestJsonConversion helper. Unwrap the serialized json before comparing and +// round-tripping. +void TestSchemaFieldJsonConversion(const SchemaField& field, + const nlohmann::json& expected_json) { + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(field)); + EXPECT_EQ(expected_json, json) << "JSON conversion mismatch."; + + auto obj_ex = FieldFromJson(expected_json); + EXPECT_TRUE(obj_ex.has_value()) << "Failed to deserialize JSON."; + EXPECT_EQ(field, *obj_ex.value()) << "Deserialized object mismatch."; +} + } // namespace +// Pins the wire format produced by ToJson(SchemaField) / FieldFromJson for +// `initial-default` and `write-default`, including the absence of the keys when a +// field carries no defaults. +TEST(JsonInternalTest, SchemaFieldDefaultValues) { + // Both defaults present. + SchemaField with_both(/*field_id=*/1, "id", int32(), /*optional=*/false, /*doc=*/{}, + std::make_shared(Literal::Int(42)), + std::make_shared(Literal::Int(7))); + TestSchemaFieldJsonConversion( + with_both, + R"({"id":1,"name":"id","required":true,"type":"int","initial-default":42,"write-default":7})"_json); + + // Only an initial-default; write-default must not appear in the JSON. 
+ SchemaField initial_only(/*field_id=*/2, "name", string(), /*optional=*/true, + /*doc=*/{}, + std::make_shared(Literal::String("n/a")), + /*write_default=*/nullptr); + TestSchemaFieldJsonConversion( + initial_only, + R"({"id":2,"name":"name","required":false,"type":"string","initial-default":"n/a"})"_json); + + // No defaults; neither key may appear. + SchemaField no_defaults(/*field_id=*/3, "plain", int32(), /*optional=*/false); + TestSchemaFieldJsonConversion( + no_defaults, R"({"id":3,"name":"plain","required":true,"type":"int"})"_json); +} + +// Round-trips a field carrying both defaults through ToJson -> FieldFromJson for +// every primitive type, exercising the per-type single-value serialization the +// default path reuses (date/timestamp/decimal/uuid/binary have non-trivial wire +// encodings). +TEST(JsonInternalTest, SchemaFieldDefaultValuesRoundTripAllTypes) { + ICEBERG_UNWRAP_OR_FAIL(auto uuid_value, + Uuid::FromString("f79c3e09-677c-4bbd-a479-3f349cb785e7")); + std::vector, Literal>> cases; + cases.emplace_back(boolean(), Literal::Boolean(true)); + cases.emplace_back(int32(), Literal::Int(-7)); + cases.emplace_back(int64(), Literal::Long(1234567890123LL)); + cases.emplace_back(float32(), Literal::Float(1.5f)); + cases.emplace_back(float64(), Literal::Double(2.5)); + cases.emplace_back(date(), Literal::Date(19738)); + cases.emplace_back(time(), Literal::Time(43200000000LL)); + cases.emplace_back(timestamp(), Literal::Timestamp(1719446400000000LL)); + cases.emplace_back(timestamp_tz(), Literal::TimestampTz(1719446400000000LL)); + cases.emplace_back(timestamp_ns(), Literal::TimestampNs(1719446400000000123LL)); + cases.emplace_back(timestamptz_ns(), Literal::TimestampTzNs(1719446400000000123LL)); + cases.emplace_back(string(), Literal::String("hello")); + cases.emplace_back(decimal(9, 2), Literal::Decimal(12345, 9, 2)); + cases.emplace_back(fixed(3), Literal::Fixed({0x01, 0x02, 0x03})); + cases.emplace_back(binary(), Literal::Binary({0xDE, 0xAD, 0xBE, 
0xEF})); + cases.emplace_back(uuid(), Literal::UUID(uuid_value)); + + int32_t field_id = 1; + for (const auto& [type, literal] : cases) { + SchemaField field(field_id++, "f", type, /*optional=*/false, /*doc=*/{}, + std::make_shared(literal), + std::make_shared(literal)); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(field)); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, FieldFromJson(json)); + EXPECT_EQ(field, *parsed) << "round-trip mismatch for type " << type->ToString() + << ", json=" << json.dump(); + } +} + +// The spec only permits UTC offsets for timestamptz / timestamptz_ns default values. +// A non-UTC offset (which the shared parser would silently normalize) must be rejected, +// while the UTC form is accepted. +TEST(JsonInternalTest, SchemaFieldRejectsNonUtcTimestamptzDefault) { + auto non_utc = nlohmann::json::parse( + R"({"id":1,"name":"ts","required":true,"type":"timestamptz","initial-default":"2024-06-27T05:00:00+05:00"})"); + EXPECT_FALSE(FieldFromJson(non_utc).has_value()); + + auto non_utc_ns = nlohmann::json::parse( + R"({"id":1,"name":"ts","required":true,"type":"timestamptz_ns","write-default":"2024-06-27T05:00:00-08:00"})"); + EXPECT_FALSE(FieldFromJson(non_utc_ns).has_value()); + + auto utc = nlohmann::json::parse( + R"({"id":1,"name":"ts","required":true,"type":"timestamptz","initial-default":"2024-06-27T00:00:00+00:00"})"); + EXPECT_TRUE(FieldFromJson(utc).has_value()); +} + TEST(JsonInternalTest, SortField) { auto identity_transform = Transform::Identity(); diff --git a/src/iceberg/test/metadata_serde_test.cc b/src/iceberg/test/metadata_serde_test.cc index e214f1606..10529d128 100644 --- a/src/iceberg/test/metadata_serde_test.cc +++ b/src/iceberg/test/metadata_serde_test.cc @@ -23,6 +23,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/partition_field.h" #include "iceberg/partition_spec.h" @@ -445,6 +446,25 @@ TEST(MetadataSerdeTest, DeserializePartitionStatisticsFiles) { 
ASSERT_EQ(*metadata, expected); } +TEST(MetadataSerdeTest, V3DefaultValuesRoundTrip) { + // Full TableMetadata path: a v3 schema field's initial/write defaults parse correctly + // (the v3 gate in Schema::Validate is satisfied) and survive a ToJson/FromJson round + // trip. The fixture's field is `x: long` with initial-default 1 and write-default 1. + ICEBERG_UNWRAP_OR_FAIL( + auto metadata, ReadTableMetadataFromResource("TableMetadataV3ValidMinimal.json")); + auto schema_result = metadata->Schema(); + ASSERT_TRUE(schema_result.has_value()); + const auto& field = schema_result.value()->fields()[0]; + ASSERT_NE(field.initial_default(), nullptr); + EXPECT_EQ(*field.initial_default(), Literal::Long(1)); + ASSERT_NE(field.write_default(), nullptr); + EXPECT_EQ(*field.write_default(), Literal::Long(1)); + + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(*metadata)); + ICEBERG_UNWRAP_OR_FAIL(auto reparsed, TableMetadataFromJson(json)); + EXPECT_EQ(*reparsed, *metadata); +} + TEST(MetadataSerdeTest, DeserializeUnsupportedVersion) { ReadTableMetadataExpectError("TableMetadataUnsupportedVersion.json", "Cannot read unsupported version"); diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index 944687e7d..c946550cc 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -24,6 +24,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/schema.h" #include "iceberg/schema_field.h" @@ -194,6 +195,52 @@ TEST(SchemaJsonTest, RoundTrip) { ASSERT_EQ(schema_json.dump(), json); } +TEST(SchemaJsonTest, FieldWithDefaultValuesRoundTrip) { + constexpr std::string_view json = + R"({"fields":[{"id":1,"initial-default":42,"name":"id","required":true,"type":"int","write-default":7},{"id":2,"initial-default":"n/a","name":"name","required":false,"type":"string"}],"schema-id":1,"type":"struct"})"; + + ICEBERG_UNWRAP_OR_FAIL(auto schema, 
SchemaFromJson(nlohmann::json::parse(json))); + ASSERT_EQ(schema->fields().size(), 2); + + const auto& field1 = schema->fields()[0]; + ASSERT_NE(field1.initial_default(), nullptr); + ASSERT_EQ(*field1.initial_default(), Literal::Int(42)); + ASSERT_NE(field1.write_default(), nullptr); + ASSERT_EQ(*field1.write_default(), Literal::Int(7)); + + const auto& field2 = schema->fields()[1]; + ASSERT_NE(field2.initial_default(), nullptr); + ASSERT_EQ(*field2.initial_default(), Literal::String("n/a")); + ASSERT_EQ(field2.write_default(), nullptr); + + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); +} + +TEST(SchemaJsonTest, FieldWithMismatchedDefaultValueFails) { + constexpr std::string_view json = + R"({"fields":[{"id":1,"initial-default":"oops","name":"id","required":true,"type":"int"}],"schema-id":1,"type":"struct"})"; + + auto result = SchemaFromJson(nlohmann::json::parse(json)); + ASSERT_FALSE(result.has_value()); +} + +TEST(SchemaJsonTest, NestedFieldWithDefaultValuesRoundTrip) { + constexpr std::string_view json = + R"({"fields":[{"id":1,"name":"person","required":true,"type":{"fields":[{"id":2,"initial-default":18,"name":"age","required":true,"type":"int","write-default":21}],"type":"struct"}}],"schema-id":1,"type":"struct"})"; + + ICEBERG_UNWRAP_OR_FAIL(auto schema, SchemaFromJson(nlohmann::json::parse(json))); + const auto& person = schema->fields()[0]; + const auto& nested = dynamic_cast(*person.type()).fields()[0]; + ASSERT_NE(nested.initial_default(), nullptr); + ASSERT_EQ(*nested.initial_default(), Literal::Int(18)); + ASSERT_NE(nested.write_default(), nullptr); + ASSERT_EQ(*nested.write_default(), Literal::Int(21)); + + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); +} + TEST(SchemaJsonTest, UnknownFieldRoundTrip) { constexpr std::string_view json = R"({"fields":[{"id":1,"name":"mystery","required":false,"type":"unknown"}],"schema-id":1,"type":"struct"})"; @@ 
-274,7 +321,8 @@ TEST(SchemaJsonTest, NestedUnknownFieldsRoundTrip) { ASSERT_EQ(properties->value().type()->type_id(), TypeId::kUnknown); ASSERT_TRUE(properties->value().optional()); - ASSERT_EQ(ToJson(*schema), parsed_json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json, parsed_json); } TEST(SchemaJsonTest, IdentifierFieldIds) { @@ -293,7 +341,8 @@ TEST(SchemaJsonTest, IdentifierFieldIds) { ASSERT_EQ(schema_with_identifers->schema_id(), 1); ASSERT_EQ(schema_with_identifers->IdentifierFieldIds().size(), 1); ASSERT_EQ(schema_with_identifers->IdentifierFieldIds()[0], 1); - ASSERT_EQ(ToJson(*schema_with_identifers), json_with_identifiers); + ICEBERG_UNWRAP_OR_FAIL(auto json_with_identifiers_out, ToJson(*schema_with_identifers)); + ASSERT_EQ(json_with_identifiers_out, json_with_identifiers); // Test schema without identifier-field-ids constexpr std::string_view json_without_identifiers_str = @@ -306,7 +355,9 @@ TEST(SchemaJsonTest, IdentifierFieldIds) { ICEBERG_UNWRAP_OR_FAIL(auto schema_without_identifiers, SchemaFromJson(json_without_identifiers)); ASSERT_TRUE(schema_without_identifiers->IdentifierFieldIds().empty()); - ASSERT_EQ(ToJson(*schema_without_identifiers), json_without_identifiers); + ICEBERG_UNWRAP_OR_FAIL(auto json_without_identifiers_out, + ToJson(*schema_without_identifiers)); + ASSERT_EQ(json_without_identifiers_out, json_without_identifiers); // Test schema with multiple identifier fields constexpr std::string_view json_multi_identifiers_str = @@ -322,7 +373,9 @@ TEST(SchemaJsonTest, IdentifierFieldIds) { ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds().size(), 2); ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds()[0], 1); ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds()[1], 2); - ASSERT_EQ(ToJson(*schema_multi_identifiers), json_multi_identifiers); + ICEBERG_UNWRAP_OR_FAIL(auto json_multi_identifiers_out, + ToJson(*schema_multi_identifiers)); + ASSERT_EQ(json_multi_identifiers_out, 
json_multi_identifiers); } } // namespace iceberg diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index db99eb02b..90cfdb13c 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -25,6 +25,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/result.h" #include "iceberg/schema_field.h" #include "iceberg/table_metadata.h" @@ -133,6 +134,115 @@ TEST(SchemaTest, ValidateRejectsV3TypesBeforeFormatV3) { iceberg::IsOk()); } +TEST(SchemaTest, ValidateRejectsInitialDefaultBeforeFormatV3) { + iceberg::Schema schema({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, + std::make_shared(iceberg::Literal::Int(42)))}); + + auto status = schema.Validate(2); + ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); + EXPECT_THAT(status, iceberg::HasErrorMessage("is not supported until v3")); + + EXPECT_THAT(schema.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + iceberg::IsOk()); +} + +TEST(SchemaTest, ValidateDoesNotVersionGateWriteDefault) { + // A write-default does not reinterpret existing data, so it is not gated on + // format version: a write-default alone is accepted below v3. + iceberg::Schema schema({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, /*initial_default=*/nullptr, + std::make_shared(iceberg::Literal::Int(7)))}); + + EXPECT_THAT(schema.Validate(2), iceberg::IsOk()); +} + +TEST(SchemaTest, ValidateRejectsMismatchedDefaultValue) { + // Defaults are stored verbatim, so a default whose type differs from the field type is + // rejected by Validate. 
+ iceberg::Schema schema({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, /*initial_default=*/nullptr, + std::make_shared(iceberg::Literal::String("oops")))}); + + auto status = schema.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion); + ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); + EXPECT_THAT(status, iceberg::HasErrorMessage("write-default")); +} + +TEST(SchemaTest, NullDefaultModeledAsAbsence) { + // A present-null default is modeled as the absence of a default (matching Java): it is + // dropped at construction, so the field has no stored default and compares equal to a + // field with no default, and it validates cleanly. + iceberg::SchemaField with_null( + 1, "id", iceberg::int32(), /*optional=*/true, /*doc=*/{}, + std::make_shared(iceberg::Literal::Null(iceberg::int32())), + std::make_shared(iceberg::Literal::Null(iceberg::int32()))); + EXPECT_EQ(with_null.initial_default(), nullptr); + EXPECT_EQ(with_null.write_default(), nullptr); + + iceberg::SchemaField no_default(1, "id", iceberg::int32(), /*optional=*/true); + EXPECT_EQ(with_null, no_default); + + iceberg::Schema schema({with_null}); + EXPECT_THAT(schema.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + iceberg::IsOk()); +} + +TEST(SchemaTest, EqualsDistinguishesDefaultValues) { + auto field = [](std::shared_ptr d) { + return iceberg::SchemaField(1, "id", iceberg::int32(), /*optional=*/true, /*doc=*/{}, + std::move(d)); + }; + // Differ only in default value -> unequal; default vs no-default -> unequal. + EXPECT_NE(field(std::make_shared(iceberg::Literal::Int(1))), + field(std::make_shared(iceberg::Literal::Int(2)))); + EXPECT_NE(field(std::make_shared(iceberg::Literal::Int(1))), + field(nullptr)); +} + +TEST(SchemaTest, ValidateRejectsDefaultOnNonPrimitiveAndMustBeNullTypes) { + // A struct (non-primitive) field with a non-null default is rejected. 
+  iceberg::Schema struct_default({iceberg::SchemaField(
+      1, "s",
+      MakeStructType(iceberg::SchemaField(2, "x", iceberg::int32(), /*optional=*/true)),
+      /*optional=*/true, /*doc=*/{},
+      std::make_shared<iceberg::Literal>(iceberg::Literal::Int(1)))});
+  EXPECT_THAT(
+      struct_default.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion),
+      iceberg::IsError(iceberg::ErrorKind::kInvalidSchema));
+
+  // unknown/geometry/geography must default to null: a non-null default is rejected.
+  iceberg::Schema geo_default({iceberg::SchemaField(
+      1, "g", iceberg::geometry(), /*optional=*/true, /*doc=*/{},
+      std::make_shared<iceberg::Literal>(iceberg::Literal::Int(1)))});
+  auto status =
+      geo_default.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion);
+  ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kInvalidSchema));
+  EXPECT_THAT(status, iceberg::HasErrorMessage("cannot have a default value"));
+}
+
+TEST(SchemaTest, ReassignIdsPreservesDefaultValues) {
+  // Reassigning field IDs rebuilds each SchemaField, so the rebuild must carry the
+  // default values over to the field with the new ID.
+  std::vector<iceberg::SchemaField> fields;
+  fields.push_back(iceberg::SchemaField(
+      1, "id", iceberg::int32(), false, /*doc=*/{},
+      std::make_shared<iceberg::Literal>(iceberg::Literal::Int(42)),
+      std::make_shared<iceberg::Literal>(iceberg::Literal::Int(7))));
+  auto reassign_id = [](int32_t old_id) { return old_id + 1000; };
+
+  iceberg::Schema schema(std::move(fields), iceberg::Schema::kInitialSchemaId,
+                         reassign_id);
+
+  ASSERT_EQ(schema.fields().size(), 1);
+  const iceberg::SchemaField& field = schema.fields()[0];
+  EXPECT_EQ(field.field_id(), 1001);
+  ASSERT_NE(field.initial_default(), nullptr);
+  EXPECT_EQ(*field.initial_default(), iceberg::Literal::Int(42));
+  ASSERT_NE(field.write_default(), nullptr);
+  EXPECT_EQ(*field.write_default(), iceberg::Literal::Int(7));
+}
+
 TEST(SchemaTest, ValidateRejectsInvalidUnknownFields) {
   iceberg::Schema required_unknown_schema(
       {iceberg::SchemaField(1, "mystery", iceberg::unknown(), false)});
diff --git a/src/iceberg/test/schema_util_test.cc b/src/iceberg/test/schema_util_test.cc
index ee075006f..9a3eff887 100644
--- a/src/iceberg/test/schema_util_test.cc
+++ b/src/iceberg/test/schema_util_test.cc
@@ -24,6 +24,7 @@
 #include
 #include
 
+#include "iceberg/expression/literal.h"
 #include "iceberg/metadata_columns.h"
 #include "iceberg/schema.h"
 #include "iceberg/schema_field.h"
@@ -179,6 +180,60 @@ TEST(SchemaUtilTest, ProjectMissingRequiredField) {
   ASSERT_THAT(projection_result, HasErrorMessage("Missing required field"));
 }
 
+TEST(SchemaUtilTest, ProjectMissingRequiredFieldWithInitialDefault) {
+  Schema source_schema = CreateFlatSchema();
+  Schema expected_schema({
+      SchemaField::MakeRequired(/*field_id=*/1, "id", iceberg::int64()),
+      SchemaField(/*field_id=*/10, "extra", iceberg::int32(), /*optional=*/false,
+                  /*doc=*/{}, std::make_shared<Literal>(Literal::Int(42))),
+  });
+
+  auto projection_result =
+      Project(expected_schema, source_schema, /*prune_source=*/false);
+  ASSERT_THAT(projection_result, IsOk());
+
+  const auto& projection = *projection_result;
+  ASSERT_EQ(projection.fields.size(), 2);
+  AssertProjectedField(projection.fields[0], 0);
+  ASSERT_EQ(projection.fields[1].kind, FieldProjection::Kind::kDefault);
+  ASSERT_EQ(std::get<Literal>(projection.fields[1].from), Literal::Int(42));
+}
+
+TEST(SchemaUtilTest, ProjectMissingOptionalFieldWithInitialDefault) {
+  // An optional field with an initial-default reads the default, not null.
+  Schema source_schema = CreateFlatSchema();
+  Schema expected_schema({
+      SchemaField::MakeRequired(/*field_id=*/1, "id", iceberg::int64()),
+      SchemaField(/*field_id=*/10, "extra", iceberg::string(), /*optional=*/true,
+                  /*doc=*/{}, std::make_shared<Literal>(Literal::String("n/a"))),
+  });
+
+  auto projection_result =
+      Project(expected_schema, source_schema, /*prune_source=*/false);
+  ASSERT_THAT(projection_result, IsOk());
+
+  const auto& projection = *projection_result;
+  ASSERT_EQ(projection.fields.size(), 2);
+  AssertProjectedField(projection.fields[0], 0);
+  ASSERT_EQ(projection.fields[1].kind, FieldProjection::Kind::kDefault);
+  ASSERT_EQ(std::get<Literal>(projection.fields[1].from), Literal::String("n/a"));
+}
+
+TEST(SchemaUtilTest, ProjectPresentFieldIgnoresInitialDefault) {
+  // initial-default only applies when the field is missing from the data file.
+  Schema source_schema = CreateFlatSchema();
+  Schema expected_schema({
+      SchemaField(/*field_id=*/1, "id", iceberg::int64(), /*optional=*/false,
+                  /*doc=*/{}, std::make_shared<Literal>(Literal::Long(-1))),
+  });
+
+  auto projection_result =
+      Project(expected_schema, source_schema, /*prune_source=*/false);
+  ASSERT_THAT(projection_result, IsOk());
+  ASSERT_EQ(projection_result->fields.size(), 1);
+  AssertProjectedField(projection_result->fields[0], 0);
+}
+
 TEST(SchemaUtilTest, ProjectMetadataColumn) {
   Schema source_schema = CreateFlatSchema();
   Schema expected_schema({
diff --git a/src/iceberg/test/temporal_util_test.cc b/src/iceberg/test/temporal_util_test.cc
index 0d4426b0b..791d02c56 100644
--- a/src/iceberg/test/temporal_util_test.cc
+++ b/src/iceberg/test/temporal_util_test.cc
@@ -53,6 +53,31 @@ TEST(TemporalUtilTest, ParseTimestampNsChecksInt64Bounds) {
               IsError(ErrorKind::kInvalidArgument));
 }
 
+TEST(TemporalUtilTest, IsUtcOffset) {
+  // UTC offsets: "Z", "+00:00" and "-00:00".
+  ICEBERG_UNWRAP_OR_FAIL(auto z, TemporalUtils::IsUtcOffset("2024-06-27T00:00:00Z"));
+  EXPECT_TRUE(z);
+  ICEBERG_UNWRAP_OR_FAIL(auto plus_zero,
+                         TemporalUtils::IsUtcOffset("2024-06-27T00:00:00+00:00"));
+  EXPECT_TRUE(plus_zero);
+  ICEBERG_UNWRAP_OR_FAIL(auto minus_zero,
+                         TemporalUtils::IsUtcOffset("2024-06-27T00:00:00-00:00"));
+  EXPECT_TRUE(minus_zero);
+
+  // Non-UTC offsets.
+  ICEBERG_UNWRAP_OR_FAIL(auto plus_five,
+                         TemporalUtils::IsUtcOffset("2024-06-27T05:00:00+05:00"));
+  EXPECT_FALSE(plus_five);
+  ICEBERG_UNWRAP_OR_FAIL(auto minus_eight,
+                         TemporalUtils::IsUtcOffset("2024-06-27T00:00:00-08:00"));
+  EXPECT_FALSE(minus_eight);
+
+  // A missing or unparseable timezone suffix is an error.
+  EXPECT_THAT(TemporalUtils::IsUtcOffset("2024-06-27T00:00:00"),
+              IsError(ErrorKind::kInvalidArgument));
+  EXPECT_THAT(TemporalUtils::IsUtcOffset(""), IsError(ErrorKind::kInvalidArgument));
+}
+
 TEST(TemporalUtilTest, ParseTimestampNsRejectsMoreThanNineFractionalDigits) {
   EXPECT_THAT(TemporalUtils::ParseTimestampNs("2026-01-01T00:00:01.0000010011"),
               IsError(ErrorKind::kInvalidArgument));
diff --git a/src/iceberg/util/temporal_util.cc b/src/iceberg/util/temporal_util.cc
index e00ee7cf5..9e6a93cfd 100644
--- a/src/iceberg/util/temporal_util.cc
+++ b/src/iceberg/util/temporal_util.cc
@@ -444,6 +444,11 @@ Result TemporalUtils::ParseTimestampNsWithZone(std::string_view str) {
       /*units_per_micro=*/internal::kNanosPerMicro);
 }
 
+Result<bool> TemporalUtils::IsUtcOffset(std::string_view str) {
+  ICEBERG_ASSIGN_OR_RAISE(auto timestamp_with_offset, ParseTimestampWithZoneSuffix(str));
+  return timestamp_with_offset.second == 0;
+}
+
 #define DISPATCH_EXTRACT_YEAR(type_id) \
   case type_id:                        \
     return ExtractYearImpl(literal);
diff --git a/src/iceberg/util/temporal_util.h b/src/iceberg/util/temporal_util.h
index 2121f565d..acb1f0e73 100644
--- a/src/iceberg/util/temporal_util.h
+++ b/src/iceberg/util/temporal_util.h
@@ -125,6 +125,22 @@ class ICEBERG_EXPORT TemporalUtils {
   /// \return The number of nanoseconds since epoch (UTC), or an error.
   static Result ParseTimestampNsWithZone(std::string_view str);
 
+  /// \brief Reports whether a timestamp-with-zone string carries a zero (UTC) offset.
+  ///
+  /// The ParseTimestamp*WithZone parsers accept any offset and silently normalize it
+  /// to UTC. The spec's JSON single-value form for `timestamptz` / `timestamptz_ns`
+  /// default values is meant to be UTC, so callers that must enforce that rule check
+  /// the offset here before parsing.
+  ///
+  /// Any zero-offset spelling is treated as UTC: "Z", "+00:00" and "-00:00" all denote
+  /// a zero offset and are accepted, even though the spec writes the canonical form as
+  /// "+00:00". Only a genuinely non-zero offset (e.g. "+05:00") is rejected.
+  ///
+  /// \param str The timestamp-with-zone string to inspect.
+  /// \return true if the offset is zero, false if it is a non-zero offset, or an error
+  ///         if the timezone suffix cannot be parsed.
+  static Result<bool> IsUtcOffset(std::string_view str);
+
   /// \brief Extract a date or timestamp year, as years from 1970
   static Result ExtractYear(const Literal& literal);