From e2b7c9905ed324666e09e60424901bbf507eb4ca Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Mon, 15 Jun 2026 18:24:17 +0000 Subject: [PATCH 01/12] Fix REPLACE buffer overflow for large output strings Gandiva's REPLACE hardcoded a 65535-byte output cap, throwing "Buffer overflow for output string" whenever the result exceeded 64 KB. Size the output buffer to the exact result instead, by counting non-overlapping matches of from_str: text_len + num_matches * (to_str_len - from_str_len). Removes the arbitrary cap; the internal replace_with_max_len variant and its bounds checks are unchanged. Gandiva variable-length output uses int32 offsets, so a single output string cannot exceed INT_MAX (2 GB). Guard that boundary explicitly with a clear error message instead of letting the int32 size cast wrap silently (which could otherwise lead to under-allocation). Co-Authored-By: Claude Opus 4.8 (1M context) --- cpp/src/gandiva/precompiled/string_ops.cc | 28 +++++++++++++-- .../gandiva/precompiled/string_ops_test.cc | 36 +++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 9a2d8c8415c..6523eb01247 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1914,9 +1914,33 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text, gdv_int32 text_len, const char* from_str, gdv_int32 from_str_len, const char* to_str, gdv_int32 to_str_len, gdv_int32* out_len) { + // Count non-overlapping matches to size the output buffer exactly, so large + // results are not capped by an arbitrary limit. + gdv_int64 num_matches = 0; + if (from_str_len > 0 && from_str_len <= text_len) { + for (gdv_int32 i = 0; i <= text_len - from_str_len;) { + if (memcmp(text + i, from_str, from_str_len) == 0) { + num_matches++; + i += from_str_len; + } else { + i++; + } + } + } + gdv_int64 max_length = + static_cast(text_len) + num_matches * (to_str_len - from_str_len); + // Gandiva variable-length output uses int32 offsets, so a single output string + // cannot exceed INT_MAX bytes. Report this explicitly instead of letting the + // cast below wrap silently. + if (max_length > INT_MAX) { + gdv_fn_context_set_error_msg(context, + "REPLACE: output string exceeds maximum size of 2GB"); + *out_len = 0; + return ""; + } return replace_with_max_len_utf8_utf8_utf8(context, text, text_len, from_str, - from_str_len, to_str, to_str_len, 65535, - out_len); + from_str_len, to_str, to_str_len, + static_cast(max_length), out_len); } // Returns the quoted string (Includes escape character for any single quotes) diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index a31683c65ac..b499da56689 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -1971,6 +1971,42 @@ TEST(TestStringOps, TestReplace) { EXPECT_EQ(std::string(out_str, out_len), "TestString"); EXPECT_FALSE(ctx.has_error()); + // Large output (>64 KB) must not overflow: buffer is sized to the exact result. + std::string large_in(35000, 'X'); + std::string large_expected(70000, '\0'); + for (int i = 0; i < 35000; ++i) { + large_expected[2 * i] = 'X'; + large_expected[2 * i + 1] = 'Y'; + } + out_str = replace_utf8_utf8_utf8(ctx_ptr, large_in.data(), + static_cast(large_in.size()), "X", 1, "XY", 2, + &out_len); + EXPECT_EQ(out_len, 70000); + EXPECT_EQ(std::string(out_str, out_len), large_expected); + EXPECT_FALSE(ctx.has_error()); + + // Large shrinking output ("XX" -> "X") on a >64 KB input. + std::string large_shrink_in(70000, 'X'); + std::string large_shrink_expected(35000, 'X'); + out_str = replace_utf8_utf8_utf8(ctx_ptr, large_shrink_in.data(), + static_cast(large_shrink_in.size()), "XX", 2, + "X", 1, &out_len); + EXPECT_EQ(out_len, 35000); + EXPECT_EQ(std::string(out_str, out_len), large_shrink_expected); + EXPECT_FALSE(ctx.has_error()); + + // Output that would exceed INT_MAX (2GB) is reported cleanly rather than + // silently wrapping the int32 size. 50000 matches each expanding to 50000 + // bytes implies max_length = 2.5e9; the guard fires before any large alloc. + std::string huge_in(50000, 'X'); + std::string huge_to(50000, 'Z'); + replace_utf8_utf8_utf8(ctx_ptr, huge_in.data(), static_cast(huge_in.size()), + "X", 1, huge_to.data(), static_cast(huge_to.size()), + &out_len); + EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("exceeds maximum size")); + EXPECT_EQ(out_len, 0); + ctx.Reset(); + replace_with_max_len_utf8_utf8_utf8(ctx_ptr, "Hell", 4, "ell", 3, "ollow", 5, 5, &out_len); EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer overflow for output string")); From 6d6bf39a14ff5c58aa28a7af472f39397063ce41 Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Thu, 18 Jun 2026 17:18:33 +0000 Subject: [PATCH 02/12] More tests --- cpp/src/gandiva/precompiled/string_ops.cc | 32 ++++++++++++------- .../gandiva/precompiled/string_ops_test.cc | 20 ++++++++++++ 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 6523eb01247..089eeaea0d5 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1914,21 +1914,29 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text, gdv_int32 text_len, const char* from_str, gdv_int32 from_str_len, const char* to_str, gdv_int32 to_str_len, gdv_int32* out_len) { - // Count non-overlapping matches to size the output buffer exactly, so large - // results are not capped by an arbitrary limit. - gdv_int64 num_matches = 0; - if (from_str_len > 0 && from_str_len <= text_len) { - for (gdv_int32 i = 0; i <= text_len - from_str_len;) { - if (memcmp(text + i, from_str, from_str_len) == 0) { - num_matches++; - i += from_str_len; - } else { - i++; + // Size the output buffer to the exact result, so large results are not capped + // by an arbitrary limit. When the replacement is no longer than the matched + // text, the result can only shrink or stay the same, so text_len is a safe + // bound and we can skip scanning. Otherwise count non-overlapping matches to + // get the exact expanded size. + gdv_int64 max_length; + if (to_str_len <= from_str_len) { + max_length = text_len; + } else { + gdv_int64 num_matches = 0; + if (from_str_len > 0 && from_str_len <= text_len) { + for (gdv_int32 i = 0; i <= text_len - from_str_len;) { + if (memcmp(text + i, from_str, from_str_len) == 0) { + num_matches++; + i += from_str_len; + } else { + i++; + } } } + max_length = + static_cast(text_len) + num_matches * (to_str_len - from_str_len); } - gdv_int64 max_length = - static_cast(text_len) + num_matches * (to_str_len - from_str_len); // Gandiva variable-length output uses int32 offsets, so a single output string // cannot exceed INT_MAX bytes. Report this explicitly instead of letting the // cast below wrap silently. diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index b499da56689..9b7d9c6d9d1 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -1995,6 +1995,26 @@ TEST(TestStringOps, TestReplace) { EXPECT_EQ(std::string(out_str, out_len), large_shrink_expected); EXPECT_FALSE(ctx.has_error()); + // Edge case: result size of exactly 0 (every byte of text is removed). Takes + // the no-scan shrink path (to_str_len <= from_str_len). + out_str = replace_utf8_utf8_utf8(ctx_ptr, "aaaa", 4, "a", 1, "", 0, &out_len); + EXPECT_EQ(out_len, 0); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_FALSE(ctx.has_error()); + + // Edge case: result size one past the INT_MAX boundary. 65536 single-char + // matches each expanding to 32768 bytes gives max_length = 65536 * 32768 = + // 2^31 = INT_MAX + 1, so it is reported cleanly (guard fires before any alloc). + std::string boundary_in(65536, 'a'); + std::string boundary_to(32768, 'b'); + replace_utf8_utf8_utf8(ctx_ptr, boundary_in.data(), + static_cast(boundary_in.size()), "a", 1, + boundary_to.data(), static_cast(boundary_to.size()), + &out_len); + EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("exceeds maximum size")); + EXPECT_EQ(out_len, 0); + ctx.Reset(); + // Output that would exceed INT_MAX (2GB) is reported cleanly rather than // silently wrapping the int32 size. 50000 matches each expanding to 50000 // bytes implies max_length = 2.5e9; the guard fires before any large alloc. From 60653bf350dbf47c228cfaf5438ed2c59f139684 Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Thu, 18 Jun 2026 18:26:31 +0000 Subject: [PATCH 03/12] TODO: benchmark --- cpp/src/gandiva/precompiled/CMakeLists.txt | 16 ++ .../precompiled/string_ops_benchmark.cc | 156 ++++++++++++++++++ 2 files changed, 172 insertions(+) create mode 100644 cpp/src/gandiva/precompiled/string_ops_benchmark.cc diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index e1427e25fb6..4c862f848a9 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -84,3 +84,19 @@ add_gandiva_test(precompiled-test GANDIVA_UNIT_TEST=1 ARROW_STATIC GANDIVA_STATIC) + +# microbenchmark (built as a gandiva test so it links the precompiled sources +# directly; run on demand via the gandiva-precompiled-benchmark binary) +add_gandiva_test(precompiled-benchmark + SOURCES + ../context_helper.cc + string_ops_benchmark.cc + string_ops.cc + EXTRA_INCLUDES + ${CMAKE_SOURCE_DIR}/src + EXTRA_LINK_LIBS + Boost::headers + DEFINITIONS + GANDIVA_UNIT_TEST=1 + ARROW_STATIC + GANDIVA_STATIC) diff --git a/cpp/src/gandiva/precompiled/string_ops_benchmark.cc b/cpp/src/gandiva/precompiled/string_ops_benchmark.cc new file mode 100644 index 00000000000..0316b298822 --- /dev/null +++ b/cpp/src/gandiva/precompiled/string_ops_benchmark.cc @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Microbenchmark comparing the current REPLACE implementation against the +// pre-change one, to measure the cost of the O(n) match-counting scan that the +// fix added to size the output buffer exactly. +// +// new = replace_utf8_utf8_utf8 (counting scan + exact allocation) +// old = replace_with_max_len_utf8_utf8_utf8(..., capacity, ...) +// The old wrapper was literally a call to this with a fixed cap of +// 65535; given a buffer large enough to hold the result it is the +// pre-change algorithm with no counting scan, so new-vs-old isolates +// the scan overhead. (The shipped old cap of 65535 additionally errored +// out for any result above ~64 KB -- the bug this change fixes.) +// +// Reported time includes one ExecutionContext::Reset() per call in both arms +// (it cancels out in the delta), mirroring per-call arena reuse. + +#include +#include +#include +#include +#include +#include + +#include + +#include "gandiva/execution_context.h" +#include "gandiva/precompiled/types.h" + +namespace gandiva { +namespace { + +// Builds a `len`-byte string whose `match` byte occurs once every `stride` +// bytes (the rest filled with `filler`), giving len/stride non-overlapping +// matches. +std::string MakeText(int64_t len, int stride, char match, char filler) { + std::string s(static_cast(len), filler); + for (int64_t i = 0; i < len; i += stride) { + s[static_cast(i)] = match; + } + return s; +} + +double TimeNsPerCall(const std::function& fn, int iters) { + auto t0 = std::chrono::steady_clock::now(); + for (int i = 0; i < iters; ++i) { + fn(); + } + auto t1 = std::chrono::steady_clock::now(); + return std::chrono::duration(t1 - t0).count() / iters; +} + +struct Case { + std::string name; + int64_t text_len; + int stride; // a match every `stride` bytes + std::string from; + std::string to; + int iters; +}; + +} // namespace + +TEST(StringOpsBenchmark, Replace) { + ExecutionContext ctx; + int64_t ctx_ptr = reinterpret_cast(&ctx); + + const int64_t kSmall = 256; + const int64_t kMedium = 64 * 1024; + const int64_t kLarge = 4 * 1024 * 1024; + + std::vector cases = { + // Expansion cases (to longer than from) -- these take the O(n) scan path. + {"small/dense expand a->ab", kSmall, 1, "a", "ab", 200000}, + {"small/sparse expand a->ab", kSmall, 64, "a", "ab", 200000}, + {"medium/dense expand a->ab", kMedium, 1, "a", "ab", 4000}, + {"medium/sparse expand a->ab", kMedium, 64, "a", "ab", 4000}, + {"large/dense expand a->ab", kLarge, 1, "a", "ab", 80}, + {"large/sparse expand a->ab", kLarge, 64, "a", "ab", 80}, + // Shrink case (to no longer than from) -- new skips the scan entirely. + {"large/dense shrink ab->a", kLarge, 2, "ab", "a", 80}, + }; + + printf("\n%-28s %10s %9s %12s %12s %9s\n", "case", "text_len", "matches", "new ns/call", + "old ns/call", "delta"); + printf("%s\n", std::string(92, '-').c_str()); + + for (const auto& c : cases) { + std::string text = MakeText(c.text_len, c.stride, c.from[0], 'x'); + const char* tp = text.data(); + auto tlen = static_cast(text.size()); + const char* fp = c.from.data(); + auto flen = static_cast(c.from.size()); + const char* op = c.to.data(); + auto olen = static_cast(c.to.size()); + + // Count matches and compute the exact output size to give the "old" arm a + // buffer large enough to complete (precomputed -- not part of the timing). + int64_t matches = 0; + if (flen > 0 && flen <= tlen) { + for (int32_t i = 0; i <= tlen - flen;) { + if (memcmp(tp + i, fp, flen) == 0) { + ++matches; + i += flen; + } else { + ++i; + } + } + } + auto out_capacity = static_cast(tlen + matches * (olen - flen)); + + auto run_new = [&]() { + int32_t out_len = 0; + replace_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, &out_len); + ctx.Reset(); + }; + auto run_old = [&]() { + int32_t out_len = 0; + replace_with_max_len_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, + out_capacity, &out_len); + ctx.Reset(); + }; + + // Warm up (and verify both produce the same length / no error). + run_new(); + ASSERT_FALSE(ctx.has_error()); + run_old(); + ASSERT_FALSE(ctx.has_error()); + + double new_ns = TimeNsPerCall(run_new, c.iters); + double old_ns = TimeNsPerCall(run_old, c.iters); + double delta = old_ns > 0 ? (new_ns - old_ns) / old_ns * 100.0 : 0.0; + + printf("%-28s %10lld %9lld %12.1f %12.1f %+8.1f%%\n", c.name.c_str(), + static_cast(c.text_len), static_cast(matches), new_ns, + old_ns, delta); + } + printf("\n"); +} + +} // namespace gandiva From 3878dc484c7a8851f30b568c952ce97ff5dae8be Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Mon, 22 Jun 2026 18:43:38 +0000 Subject: [PATCH 04/12] New approach to avoid full scan in some cases --- cpp/src/gandiva/precompiled/string_ops.cc | 28 ++++++++++++------- .../precompiled/string_ops_benchmark.cc | 4 +++ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 089eeaea0d5..6d52022a839 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1914,17 +1914,26 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text, gdv_int32 text_len, const char* from_str, gdv_int32 from_str_len, const char* to_str, gdv_int32 to_str_len, gdv_int32* out_len) { - // Size the output buffer to the exact result, so large results are not capped - // by an arbitrary limit. When the replacement is no longer than the matched - // text, the result can only shrink or stay the same, so text_len is a safe - // bound and we can skip scanning. Otherwise count non-overlapping matches to - // get the exact expanded size. + // Size the output buffer so large results are not capped by an arbitrary + // limit, while avoiding a second pass over the input in the common case. + // - No replacement possible, or the result can only shrink/stay equal: + // text_len is a safe exact-or-upper bound, no scan. + // - Small expansion (replacement at most ~2x the match): use an O(1) upper + // bound that assumes every position matches. This over-allocates by at + // most ~text_len bytes but skips the match-counting scan entirely. + // - Large expansion: that upper bound could be many times the input for + // sparse matches, so count non-overlapping matches for the exact size. gdv_int64 max_length; - if (to_str_len <= from_str_len) { + if (from_str_len <= 0 || from_str_len > text_len || to_str_len <= from_str_len) { max_length = text_len; } else { - gdv_int64 num_matches = 0; - if (from_str_len > 0 && from_str_len <= text_len) { + gdv_int32 delta = to_str_len - from_str_len; // > 0 + gdv_int64 upper_bound = static_cast(text_len) + + (static_cast(text_len) / from_str_len) * delta; + if (delta <= from_str_len && upper_bound <= INT_MAX) { + max_length = upper_bound; + } else { + gdv_int64 num_matches = 0; for (gdv_int32 i = 0; i <= text_len - from_str_len;) { if (memcmp(text + i, from_str, from_str_len) == 0) { num_matches++; @@ -1933,9 +1942,8 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text, i++; } } + max_length = static_cast(text_len) + num_matches * delta; } - max_length = - static_cast(text_len) + num_matches * (to_str_len - from_str_len); } // Gandiva variable-length output uses int32 offsets, so a single output string // cannot exceed INT_MAX bytes. Report this explicitly instead of letting the diff --git a/cpp/src/gandiva/precompiled/string_ops_benchmark.cc b/cpp/src/gandiva/precompiled/string_ops_benchmark.cc index 0316b298822..7270a7c1a89 100644 --- a/cpp/src/gandiva/precompiled/string_ops_benchmark.cc +++ b/cpp/src/gandiva/precompiled/string_ops_benchmark.cc @@ -92,6 +92,10 @@ TEST(StringOpsBenchmark, Replace) { {"medium/sparse expand a->ab", kMedium, 64, "a", "ab", 4000}, {"large/dense expand a->ab", kLarge, 1, "a", "ab", 80}, {"large/sparse expand a->ab", kLarge, 64, "a", "ab", 80}, + // Big-expansion cases (to_len - from_len > from_len) -- these fall back to + // the exact match-counting scan, so new should still show the scan delta. + {"large/dense bigexp a->abcd", kLarge, 1, "a", "abcd", 80}, + {"large/sparse bigexp a->abcd", kLarge, 64, "a", "abcd", 80}, // Shrink case (to no longer than from) -- new skips the scan entirely. {"large/dense shrink ab->a", kLarge, 2, "ab", "a", 80}, }; From 2c4741e1cf4480eac4524a8f500fa781b039e646 Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Mon, 22 Jun 2026 18:52:28 +0000 Subject: [PATCH 05/12] addresssed pr feedback --- cpp/src/gandiva/precompiled/string_ops.cc | 14 ++++++++++++-- cpp/src/gandiva/precompiled/string_ops_test.cc | 8 ++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 6d52022a839..247622681d2 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1863,7 +1863,11 @@ const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* t for (; text_index <= text_len - from_str_len;) { if (memcmp(text + text_index, from_str, from_str_len) == 0) { - if (out_index + text_index - last_match_index + to_str_len > max_length) { + // Subtract first: out_index and text_index can both approach INT_MAX now + // that the wrapper may pass a large max_length, so out_index + text_index + // could overflow gdv_int32 before the subtraction. (text_index - + // last_match_index) is a bounded non-negative span, keeping the sum legal. + if (out_index + (text_index - last_match_index) + to_str_len > max_length) { gdv_fn_context_set_error_msg(context, "Buffer overflow for output string"); *out_len = 0; return ""; @@ -1898,7 +1902,7 @@ const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* t return text; } - if (out_index + text_len - last_match_index > max_length) { + if (out_index + (text_len - last_match_index) > max_length) { gdv_fn_context_set_error_msg(context, "Buffer overflow for output string"); *out_len = 0; return ""; @@ -1942,6 +1946,12 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text, i++; } } + // No matches: the result is the input unchanged; return it without calling + // the helper (which would otherwise scan the text a second time). + if (num_matches == 0) { + *out_len = text_len; + return text; + } max_length = static_cast(text_len) + num_matches * delta; } } diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 9b7d9c6d9d1..1759023110b 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -1971,6 +1971,14 @@ TEST(TestStringOps, TestReplace) { EXPECT_EQ(std::string(out_str, out_len), "TestString"); EXPECT_FALSE(ctx.has_error()); + // No match on the large-expansion (counting) path: from "z" to "zzz" expands + // by more than from_len, so this exercises the count branch's zero-match + // early return. + out_str = + replace_utf8_utf8_utf8(ctx_ptr, "TestString", 10, "z", 1, "zzz", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString"); + EXPECT_FALSE(ctx.has_error()); + // Large output (>64 KB) must not overflow: buffer is sized to the exact result. std::string large_in(35000, 'X'); std::string large_expected(70000, '\0'); From 6470eb0062a218f4e52fcf579212fb8dfb738c2e Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Mon, 22 Jun 2026 18:53:39 +0000 Subject: [PATCH 06/12] lint --- cpp/src/gandiva/precompiled/string_ops_test.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 1759023110b..daa7b0be29d 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -1974,8 +1974,7 @@ TEST(TestStringOps, TestReplace) { // No match on the large-expansion (counting) path: from "z" to "zzz" expands // by more than from_len, so this exercises the count branch's zero-match // early return. - out_str = - replace_utf8_utf8_utf8(ctx_ptr, "TestString", 10, "z", 1, "zzz", 3, &out_len); + out_str = replace_utf8_utf8_utf8(ctx_ptr, "TestString", 10, "z", 1, "zzz", 3, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestString"); EXPECT_FALSE(ctx.has_error()); @@ -2015,10 +2014,9 @@ TEST(TestStringOps, TestReplace) { // 2^31 = INT_MAX + 1, so it is reported cleanly (guard fires before any alloc). std::string boundary_in(65536, 'a'); std::string boundary_to(32768, 'b'); - replace_utf8_utf8_utf8(ctx_ptr, boundary_in.data(), - static_cast(boundary_in.size()), "a", 1, - boundary_to.data(), static_cast(boundary_to.size()), - &out_len); + replace_utf8_utf8_utf8( + ctx_ptr, boundary_in.data(), static_cast(boundary_in.size()), "a", 1, + boundary_to.data(), static_cast(boundary_to.size()), &out_len); EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("exceeds maximum size")); EXPECT_EQ(out_len, 0); ctx.Reset(); From 652ccad502d5c5496059d31a7dad62396e7c822a Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Tue, 23 Jun 2026 17:16:51 +0000 Subject: [PATCH 07/12] use int64 for counters to prevent overflow --- cpp/src/gandiva/precompiled/string_ops.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 15920abba98..151cb0a54cc 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1896,11 +1896,14 @@ const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* t for (; text_index <= text_len - from_str_len;) { if (memcmp(text + text_index, from_str, from_str_len) == 0) { - // Subtract first: out_index and text_index can both approach INT_MAX now - // that the wrapper may pass a large max_length, so out_index + text_index - // could overflow gdv_int32 before the subtraction. (text_index - - // last_match_index) is a bounded non-negative span, keeping the sum legal. - if (out_index + (text_index - last_match_index) + to_str_len > max_length) { + // Compute the prospective length in gdv_int64: now that the wrapper may + // pass a max_length near INT_MAX, out_index can approach INT_MAX and a + // 32-bit sum would overflow before this guard runs -- precisely the case + // the guard exists to catch. (text_index - last_match_index) is a bounded + // non-negative span. + gdv_int64 prospective_len = static_cast(out_index) + + (text_index - last_match_index) + to_str_len; + if (prospective_len > max_length) { gdv_fn_context_set_error_msg(context, "REPLACE: Buffer overflow for output string"); *out_len = 0; @@ -1936,7 +1939,8 @@ const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* t return text; } - if (out_index + (text_len - last_match_index) > max_length) { + gdv_int64 final_len = static_cast(out_index) + (text_len - last_match_index); + if (final_len > max_length) { gdv_fn_context_set_error_msg(context, "REPLACE: Buffer overflow for output string"); *out_len = 0; return ""; From e9c172d2a12064d7771e77ff45cc23eb529fc88a Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Wed, 24 Jun 2026 17:40:19 +0000 Subject: [PATCH 08/12] fix lint --- cpp/src/gandiva/precompiled/string_ops_benchmark.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops_benchmark.cc b/cpp/src/gandiva/precompiled/string_ops_benchmark.cc index 7270a7c1a89..9c37ca1239f 100644 --- a/cpp/src/gandiva/precompiled/string_ops_benchmark.cc +++ b/cpp/src/gandiva/precompiled/string_ops_benchmark.cc @@ -31,6 +31,7 @@ // (it cancels out in the delta), mirroring per-call arena reuse. #include +#include #include #include #include @@ -150,9 +151,8 @@ TEST(StringOpsBenchmark, Replace) { double old_ns = TimeNsPerCall(run_old, c.iters); double delta = old_ns > 0 ? (new_ns - old_ns) / old_ns * 100.0 : 0.0; - printf("%-28s %10lld %9lld %12.1f %12.1f %+8.1f%%\n", c.name.c_str(), - static_cast(c.text_len), static_cast(matches), new_ns, - old_ns, delta); + printf("%-28s %10" PRId64 " %9" PRId64 " %12.1f %12.1f %+8.1f%%\n", c.name.c_str(), + c.text_len, matches, new_ns, old_ns, delta); } printf("\n"); } From 97850bb54b313dc18f7195a45cf1f60b30f1892a Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Wed, 24 Jun 2026 18:32:14 +0000 Subject: [PATCH 09/12] Fix benchmark test and remove from standard test suite. --- cpp/src/gandiva/precompiled/CMakeLists.txt | 40 ++++++++++++------- .../precompiled/string_ops_benchmark.cc | 17 +++++--- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index 4c862f848a9..0bb59a58542 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -85,18 +85,28 @@ add_gandiva_test(precompiled-test ARROW_STATIC GANDIVA_STATIC) -# microbenchmark (built as a gandiva test so it links the precompiled sources -# directly; run on demand via the gandiva-precompiled-benchmark binary) -add_gandiva_test(precompiled-benchmark - SOURCES - ../context_helper.cc - string_ops_benchmark.cc - string_ops.cc - EXTRA_INCLUDES - ${CMAKE_SOURCE_DIR}/src - EXTRA_LINK_LIBS - Boost::headers - DEFINITIONS - GANDIVA_UNIT_TEST=1 - ARROW_STATIC - GANDIVA_STATIC) +# REPLACE microbenchmark. Built on demand only -- it is a plain executable, NOT +# registered as a ctest test, so it never runs in check-in/ctest runs. Build and +# run it explicitly via the gandiva-precompiled-benchmark binary. It compiles the +# precompiled sources locally (with GANDIVA_UNIT_TEST=1) so the functions are +# directly callable, mirroring how the precompiled-test target links. +if(ARROW_BUILD_TESTS) + if(ARROW_TEST_LINKAGE STREQUAL "static") + set(GANDIVA_BENCHMARK_LINK_LIBS ${GANDIVA_STATIC_TEST_LINK_LIBS}) + else() + set(GANDIVA_BENCHMARK_LINK_LIBS ${GANDIVA_SHARED_TEST_LINK_LIBS}) + endif() + add_executable(gandiva-precompiled-benchmark string_ops_benchmark.cc string_ops.cc + ../context_helper.cc) + target_link_libraries(gandiva-precompiled-benchmark + PRIVATE ${ARROW_GTEST_GTEST_HEADERS} + ${GANDIVA_BENCHMARK_LINK_LIBS} Boost::headers) + target_include_directories(gandiva-precompiled-benchmark SYSTEM + PUBLIC ${CMAKE_SOURCE_DIR}/src) + target_compile_definitions(gandiva-precompiled-benchmark + PRIVATE GANDIVA_UNIT_TEST=1 ARROW_STATIC GANDIVA_STATIC) + if(ARROW_TEST_LINKAGE STREQUAL "static") + # libLLVM*.a needs the executable to export the ORC EH-frame wrappers. + set_target_properties(gandiva-precompiled-benchmark PROPERTIES ENABLE_EXPORTS TRUE) + endif() +endif() diff --git a/cpp/src/gandiva/precompiled/string_ops_benchmark.cc b/cpp/src/gandiva/precompiled/string_ops_benchmark.cc index 9c37ca1239f..bd5f368d53e 100644 --- a/cpp/src/gandiva/precompiled/string_ops_benchmark.cc +++ b/cpp/src/gandiva/precompiled/string_ops_benchmark.cc @@ -141,11 +141,18 @@ TEST(StringOpsBenchmark, Replace) { ctx.Reset(); }; - // Warm up (and verify both produce the same length / no error). - run_new(); - ASSERT_FALSE(ctx.has_error()); - run_old(); - ASSERT_FALSE(ctx.has_error()); + // Warm up and verify correctness. Check has_error()/out_len BEFORE Reset(), + // since Reset() clears the error and the timed lambdas reset internally. + int32_t new_len = 0; + replace_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, &new_len); + ASSERT_FALSE(ctx.has_error()) << c.name << ": " << ctx.get_error(); + ctx.Reset(); + int32_t old_len = 0; + replace_with_max_len_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, + out_capacity, &old_len); + ASSERT_FALSE(ctx.has_error()) << c.name << ": " << ctx.get_error(); + ctx.Reset(); + ASSERT_EQ(new_len, old_len) << c.name << ": new/old output lengths differ"; double new_ns = TimeNsPerCall(run_new, c.iters); double old_ns = TimeNsPerCall(run_old, c.iters); From 562dcf9c7378873e9922ae10483f74d005809d23 Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Wed, 24 Jun 2026 18:40:05 +0000 Subject: [PATCH 10/12] Fix lambda resets in benchmark --- .../precompiled/string_ops_benchmark.cc | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops_benchmark.cc b/cpp/src/gandiva/precompiled/string_ops_benchmark.cc index bd5f368d53e..5a5c97c6a8b 100644 --- a/cpp/src/gandiva/precompiled/string_ops_benchmark.cc +++ b/cpp/src/gandiva/precompiled/string_ops_benchmark.cc @@ -129,29 +129,27 @@ TEST(StringOpsBenchmark, Replace) { } auto out_capacity = static_cast(tlen + matches * (olen - flen)); + // Reset at the start of each lambda (not the end) so the arena is recycled + // every iteration while any error set by the call persists for inspection + // afterwards. Reset is in both timed paths, so it cancels in the delta. + int32_t new_len = 0; + int32_t old_len = 0; auto run_new = [&]() { - int32_t out_len = 0; - replace_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, &out_len); ctx.Reset(); + replace_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, &new_len); }; auto run_old = [&]() { - int32_t out_len = 0; - replace_with_max_len_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, - out_capacity, &out_len); ctx.Reset(); + replace_with_max_len_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, + out_capacity, &old_len); }; - // Warm up and verify correctness. Check has_error()/out_len BEFORE Reset(), - // since Reset() clears the error and the timed lambdas reset internally. - int32_t new_len = 0; - replace_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, &new_len); + // Warm up and verify correctness: no error from either, and both agree on + // the output length. + run_new(); ASSERT_FALSE(ctx.has_error()) << c.name << ": " << ctx.get_error(); - ctx.Reset(); - int32_t old_len = 0; - replace_with_max_len_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, - out_capacity, &old_len); + run_old(); ASSERT_FALSE(ctx.has_error()) << c.name << ": " << ctx.get_error(); - ctx.Reset(); ASSERT_EQ(new_len, old_len) << c.name << ": new/old output lengths differ"; double new_ns = TimeNsPerCall(run_new, c.iters); From 89d4b940a6f9d9207c8aaa9d8ce2b44269f64f9e Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Thu, 25 Jun 2026 19:42:46 +0000 Subject: [PATCH 11/12] Move string ops benchmark to fit existing framework. --- cpp/src/gandiva/precompiled/CMakeLists.txt | 26 --- .../precompiled/string_ops_benchmark.cc | 165 ------------------ cpp/src/gandiva/tests/CMakeLists.txt | 19 ++ cpp/src/gandiva/tests/string_ops_benchmark.cc | 164 +++++++++++++++++ 4 files changed, 183 insertions(+), 191 deletions(-) delete mode 100644 cpp/src/gandiva/precompiled/string_ops_benchmark.cc create mode 100644 cpp/src/gandiva/tests/string_ops_benchmark.cc diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index 0bb59a58542..e1427e25fb6 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -84,29 +84,3 @@ add_gandiva_test(precompiled-test GANDIVA_UNIT_TEST=1 ARROW_STATIC GANDIVA_STATIC) - -# REPLACE microbenchmark. Built on demand only -- it is a plain executable, NOT -# registered as a ctest test, so it never runs in check-in/ctest runs. Build and -# run it explicitly via the gandiva-precompiled-benchmark binary. It compiles the -# precompiled sources locally (with GANDIVA_UNIT_TEST=1) so the functions are -# directly callable, mirroring how the precompiled-test target links. -if(ARROW_BUILD_TESTS) - if(ARROW_TEST_LINKAGE STREQUAL "static") - set(GANDIVA_BENCHMARK_LINK_LIBS ${GANDIVA_STATIC_TEST_LINK_LIBS}) - else() - set(GANDIVA_BENCHMARK_LINK_LIBS ${GANDIVA_SHARED_TEST_LINK_LIBS}) - endif() - add_executable(gandiva-precompiled-benchmark string_ops_benchmark.cc string_ops.cc - ../context_helper.cc) - target_link_libraries(gandiva-precompiled-benchmark - PRIVATE ${ARROW_GTEST_GTEST_HEADERS} - ${GANDIVA_BENCHMARK_LINK_LIBS} Boost::headers) - target_include_directories(gandiva-precompiled-benchmark SYSTEM - PUBLIC ${CMAKE_SOURCE_DIR}/src) - target_compile_definitions(gandiva-precompiled-benchmark - PRIVATE GANDIVA_UNIT_TEST=1 ARROW_STATIC GANDIVA_STATIC) - if(ARROW_TEST_LINKAGE STREQUAL "static") - # libLLVM*.a needs the executable to export the ORC EH-frame wrappers. - set_target_properties(gandiva-precompiled-benchmark PROPERTIES ENABLE_EXPORTS TRUE) - endif() -endif() diff --git a/cpp/src/gandiva/precompiled/string_ops_benchmark.cc b/cpp/src/gandiva/precompiled/string_ops_benchmark.cc deleted file mode 100644 index 5a5c97c6a8b..00000000000 --- a/cpp/src/gandiva/precompiled/string_ops_benchmark.cc +++ /dev/null @@ -1,165 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Microbenchmark comparing the current REPLACE implementation against the -// pre-change one, to measure the cost of the O(n) match-counting scan that the -// fix added to size the output buffer exactly. -// -// new = replace_utf8_utf8_utf8 (counting scan + exact allocation) -// old = replace_with_max_len_utf8_utf8_utf8(..., capacity, ...) -// The old wrapper was literally a call to this with a fixed cap of -// 65535; given a buffer large enough to hold the result it is the -// pre-change algorithm with no counting scan, so new-vs-old isolates -// the scan overhead. (The shipped old cap of 65535 additionally errored -// out for any result above ~64 KB -- the bug this change fixes.) -// -// Reported time includes one ExecutionContext::Reset() per call in both arms -// (it cancels out in the delta), mirroring per-call arena reuse. - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "gandiva/execution_context.h" -#include "gandiva/precompiled/types.h" - -namespace gandiva { -namespace { - -// Builds a `len`-byte string whose `match` byte occurs once every `stride` -// bytes (the rest filled with `filler`), giving len/stride non-overlapping -// matches. -std::string MakeText(int64_t len, int stride, char match, char filler) { - std::string s(static_cast(len), filler); - for (int64_t i = 0; i < len; i += stride) { - s[static_cast(i)] = match; - } - return s; -} - -double TimeNsPerCall(const std::function& fn, int iters) { - auto t0 = std::chrono::steady_clock::now(); - for (int i = 0; i < iters; ++i) { - fn(); - } - auto t1 = std::chrono::steady_clock::now(); - return std::chrono::duration(t1 - t0).count() / iters; -} - -struct Case { - std::string name; - int64_t text_len; - int stride; // a match every `stride` bytes - std::string from; - std::string to; - int iters; -}; - -} // namespace - -TEST(StringOpsBenchmark, Replace) { - ExecutionContext ctx; - int64_t ctx_ptr = reinterpret_cast(&ctx); - - const int64_t kSmall = 256; - const int64_t kMedium = 64 * 1024; - const int64_t kLarge = 4 * 1024 * 1024; - - std::vector cases = { - // Expansion cases (to longer than from) -- these take the O(n) scan path. - {"small/dense expand a->ab", kSmall, 1, "a", "ab", 200000}, - {"small/sparse expand a->ab", kSmall, 64, "a", "ab", 200000}, - {"medium/dense expand a->ab", kMedium, 1, "a", "ab", 4000}, - {"medium/sparse expand a->ab", kMedium, 64, "a", "ab", 4000}, - {"large/dense expand a->ab", kLarge, 1, "a", "ab", 80}, - {"large/sparse expand a->ab", kLarge, 64, "a", "ab", 80}, - // Big-expansion cases (to_len - from_len > from_len) -- these fall back to - // the exact match-counting scan, so new should still show the scan delta. - {"large/dense bigexp a->abcd", kLarge, 1, "a", "abcd", 80}, - {"large/sparse bigexp a->abcd", kLarge, 64, "a", "abcd", 80}, - // Shrink case (to no longer than from) -- new skips the scan entirely. - {"large/dense shrink ab->a", kLarge, 2, "ab", "a", 80}, - }; - - printf("\n%-28s %10s %9s %12s %12s %9s\n", "case", "text_len", "matches", "new ns/call", - "old ns/call", "delta"); - printf("%s\n", std::string(92, '-').c_str()); - - for (const auto& c : cases) { - std::string text = MakeText(c.text_len, c.stride, c.from[0], 'x'); - const char* tp = text.data(); - auto tlen = static_cast(text.size()); - const char* fp = c.from.data(); - auto flen = static_cast(c.from.size()); - const char* op = c.to.data(); - auto olen = static_cast(c.to.size()); - - // Count matches and compute the exact output size to give the "old" arm a - // buffer large enough to complete (precomputed -- not part of the timing). - int64_t matches = 0; - if (flen > 0 && flen <= tlen) { - for (int32_t i = 0; i <= tlen - flen;) { - if (memcmp(tp + i, fp, flen) == 0) { - ++matches; - i += flen; - } else { - ++i; - } - } - } - auto out_capacity = static_cast(tlen + matches * (olen - flen)); - - // Reset at the start of each lambda (not the end) so the arena is recycled - // every iteration while any error set by the call persists for inspection - // afterwards. Reset is in both timed paths, so it cancels in the delta. - int32_t new_len = 0; - int32_t old_len = 0; - auto run_new = [&]() { - ctx.Reset(); - replace_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, &new_len); - }; - auto run_old = [&]() { - ctx.Reset(); - replace_with_max_len_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, - out_capacity, &old_len); - }; - - // Warm up and verify correctness: no error from either, and both agree on - // the output length. - run_new(); - ASSERT_FALSE(ctx.has_error()) << c.name << ": " << ctx.get_error(); - run_old(); - ASSERT_FALSE(ctx.has_error()) << c.name << ": " << ctx.get_error(); - ASSERT_EQ(new_len, old_len) << c.name << ": new/old output lengths differ"; - - double new_ns = TimeNsPerCall(run_new, c.iters); - double old_ns = TimeNsPerCall(run_old, c.iters); - double delta = old_ns > 0 ? (new_ns - old_ns) / old_ns * 100.0 : 0.0; - - printf("%-28s %10" PRId64 " %9" PRId64 " %12.1f %12.1f %+8.1f%%\n", c.name.c_str(), - c.text_len, matches, new_ns, old_ns, delta); - } - printf("\n"); -} - -} // namespace gandiva diff --git a/cpp/src/gandiva/tests/CMakeLists.txt b/cpp/src/gandiva/tests/CMakeLists.txt index 356b976e005..c7e257ae216 100644 --- a/cpp/src/gandiva/tests/CMakeLists.txt +++ b/cpp/src/gandiva/tests/CMakeLists.txt @@ -51,6 +51,25 @@ if(ARROW_BUILD_STATIC) "gandiva" EXTRA_LINK_LIBS gandiva_static) + + # Calls the precompiled REPLACE functions directly, so it compiles + # string_ops.cc/context_helper.cc with GANDIVA_UNIT_TEST=1 (which exposes them + # as linkable symbols). Only built when ARROW_BUILD_BENCHMARKS is ON. + add_arrow_benchmark(string_ops_benchmark + SOURCES + string_ops_benchmark.cc + ../precompiled/string_ops.cc + ../context_helper.cc + PREFIX + "gandiva" + EXTRA_LINK_LIBS + gandiva_static) + if(TARGET gandiva-string-ops-benchmark) + target_compile_definitions(gandiva-string-ops-benchmark + PRIVATE GANDIVA_UNIT_TEST=1 ARROW_STATIC GANDIVA_STATIC) + target_include_directories(gandiva-string-ops-benchmark SYSTEM + PRIVATE ${CMAKE_SOURCE_DIR}/src) + endif() endif() add_subdirectory(external_functions) diff --git a/cpp/src/gandiva/tests/string_ops_benchmark.cc b/cpp/src/gandiva/tests/string_ops_benchmark.cc new file mode 100644 index 00000000000..4600906e83d --- /dev/null +++ b/cpp/src/gandiva/tests/string_ops_benchmark.cc @@ -0,0 +1,164 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Microbenchmark comparing the current REPLACE implementation against the +// pre-change one, to measure the cost of the match-counting scan the fix added +// to size the output buffer. +// +// BM_ReplaceNew = replace_utf8_utf8_utf8 (upper bound or counting scan, then a +// single write pass) +// BM_ReplaceOld = replace_with_max_len_utf8_utf8_utf8(..., capacity, ...) given +// an exact buffer: the pre-change algorithm with no counting +// scan. Compare the two rows per case to read the scan's cost. +// +// Unlike the projector-level micro_benchmarks, this calls the precompiled +// functions directly, so the build compiles string_ops.cc with GANDIVA_UNIT_TEST. + +#include +#include +#include +#include + +#include "benchmark/benchmark.h" + +#include "gandiva/execution_context.h" +#include "gandiva/precompiled/types.h" + +namespace gandiva { +namespace { + +struct ReplaceCase { + const char* name; + int64_t text_len; + int stride; // a match (the first byte of `from`) every `stride` bytes + const char* from; + const char* to; +}; + +const std::vector& Cases() { + static const std::vector cases = { + // Small expansion (to_len - from_len <= from_len): no scan, upper bound. + {"small/dense expand a->ab", 256, 1, "a", "ab"}, + {"small/sparse expand a->ab", 256, 64, "a", "ab"}, + {"medium/dense expand a->ab", 64 * 1024, 1, "a", "ab"}, + {"medium/sparse expand a->ab", 64 * 1024, 64, "a", "ab"}, + {"large/dense expand a->ab", 4 * 1024 * 1024, 1, "a", "ab"}, + {"large/sparse expand a->ab", 4 * 1024 * 1024, 64, "a", "ab"}, + // Big expansion (to_len - from_len > from_len): falls back to the scan. + {"large/dense bigexp a->abcd", 4 * 1024 * 1024, 1, "a", "abcd"}, + {"large/sparse bigexp a->abcd", 4 * 1024 * 1024, 64, "a", "abcd"}, + // Shrink (to_len <= from_len): no scan. + {"large/dense shrink ab->a", 4 * 1024 * 1024, 2, "ab", "a"}, + }; + return cases; +} + +// Builds a `len`-byte string with `match` once every `stride` bytes. +std::string MakeText(int64_t len, int stride, char match, char filler) { + std::string s(static_cast(len), filler); + for (int64_t i = 0; i < len; i += stride) { + s[static_cast(i)] = match; + } + return s; +} + +// Exact output size, so the "old" arm gets a buffer large enough to complete. +int32_t ExactCapacity(const std::string& text, const char* from, int flen, int olen) { + int64_t matches = 0; + auto tlen = static_cast(text.size()); + if (flen > 0 && flen <= tlen) { + for (int32_t i = 0; i <= tlen - flen;) { + if (memcmp(text.data() + i, from, flen) == 0) { + ++matches; + i += flen; + } else { + ++i; + } + } + } + return static_cast(tlen + matches * (olen - flen)); +} + +void BM_ReplaceNew(benchmark::State& state) { + const ReplaceCase& c = Cases()[state.range(0)]; + auto flen = static_cast(strlen(c.from)); + auto olen = static_cast(strlen(c.to)); + std::string text = MakeText(c.text_len, c.stride, c.from[0], 'x'); + auto tlen = static_cast(text.size()); + ExecutionContext ctx; + auto ctx_ptr = reinterpret_cast(&ctx); + + // One warm-up call doubling as a correctness guard. + int32_t out_len = 0; + replace_utf8_utf8_utf8(ctx_ptr, text.data(), tlen, c.from, flen, c.to, olen, &out_len); + if (ctx.has_error()) { + state.SkipWithError(ctx.get_error().c_str()); + return; + } + + for (auto _ : state) { + ctx.Reset(); + const char* out = + replace_utf8_utf8_utf8(ctx_ptr, text.data(), tlen, c.from, flen, c.to, olen, + &out_len); + benchmark::DoNotOptimize(out); + benchmark::DoNotOptimize(out_len); + } + state.SetBytesProcessed(state.iterations() * tlen); + state.SetLabel(c.name); +} + +void BM_ReplaceOld(benchmark::State& state) { + const ReplaceCase& c = Cases()[state.range(0)]; + auto flen = static_cast(strlen(c.from)); + auto olen = static_cast(strlen(c.to)); + std::string text = MakeText(c.text_len, c.stride, c.from[0], 'x'); + auto tlen = static_cast(text.size()); + int32_t capacity = ExactCapacity(text, c.from, flen, olen); + ExecutionContext ctx; + auto ctx_ptr = reinterpret_cast(&ctx); + + int32_t out_len = 0; + replace_with_max_len_utf8_utf8_utf8(ctx_ptr, text.data(), tlen, c.from, flen, c.to, + olen, capacity, &out_len); + if (ctx.has_error()) { + state.SkipWithError(ctx.get_error().c_str()); + return; + } + + for (auto _ : state) { + ctx.Reset(); + const char* out = + replace_with_max_len_utf8_utf8_utf8(ctx_ptr, text.data(), tlen, c.from, flen, + c.to, olen, capacity, &out_len); + benchmark::DoNotOptimize(out); + benchmark::DoNotOptimize(out_len); + } + state.SetBytesProcessed(state.iterations() * tlen); + state.SetLabel(c.name); +} + +} // namespace + +BENCHMARK(BM_ReplaceNew) + ->DenseRange(0, static_cast(Cases().size()) - 1) + ->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_ReplaceOld) + ->DenseRange(0, static_cast(Cases().size()) - 1) + ->Unit(benchmark::kMicrosecond); + +} // namespace gandiva From 62860ec2dfda8e84d56c4d01bd7665e9bc91b08d Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Thu, 25 Jun 2026 19:50:24 +0000 Subject: [PATCH 12/12] Lint --- cpp/src/gandiva/tests/string_ops_benchmark.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/gandiva/tests/string_ops_benchmark.cc b/cpp/src/gandiva/tests/string_ops_benchmark.cc index 4600906e83d..4b8b82d08a4 100644 --- a/cpp/src/gandiva/tests/string_ops_benchmark.cc +++ b/cpp/src/gandiva/tests/string_ops_benchmark.cc @@ -112,9 +112,8 @@ void BM_ReplaceNew(benchmark::State& state) { for (auto _ : state) { ctx.Reset(); - const char* out = - replace_utf8_utf8_utf8(ctx_ptr, text.data(), tlen, c.from, flen, c.to, olen, - &out_len); + const char* out = replace_utf8_utf8_utf8(ctx_ptr, text.data(), tlen, c.from, flen, + c.to, olen, &out_len); benchmark::DoNotOptimize(out); benchmark::DoNotOptimize(out_len); } @@ -142,9 +141,8 @@ void BM_ReplaceOld(benchmark::State& state) { for (auto _ : state) { ctx.Reset(); - const char* out = - replace_with_max_len_utf8_utf8_utf8(ctx_ptr, text.data(), tlen, c.from, flen, - c.to, olen, capacity, &out_len); + const char* out = replace_with_max_len_utf8_utf8_utf8( + ctx_ptr, text.data(), tlen, c.from, flen, c.to, olen, capacity, &out_len); benchmark::DoNotOptimize(out); benchmark::DoNotOptimize(out_len); }