apache · adriangb · Jun 11, 2026
diff --git a/benchmarks/sql_benchmarks/predicate_eval/benchmarks/correlation/q73.benchmark b/benchmarks/sql_benchmarks/predicate_eval/benchmarks/correlation/q73.benchmark
@@ -0,0 +1,7 @@
+subgroup correlation
+
+template sql_benchmarks/predicate_eval/predicate_eval.benchmark.template
+SUBGROUP=correlation
+QPAD=73
+DATASET=corrproxy
+NAME=correlation_q73_redundant_proxy
diff --git a/benchmarks/sql_benchmarks/predicate_eval/load/corrproxy.sql b/benchmarks/sql_benchmarks/predicate_eval/load/corrproxy.sql
@@ -0,0 +1,30 @@
+-- Correlated-proxy dataset: a cheap integer predicate that is a perfect proxy
+-- for three string predicates, plus one independent string predicate.
+--
+--   c0    = 1 for ~30% of rows (cheap proxy)
+--   s1    contains 'aaa', 'ccc' and 'ddd' exactly where c0 = 1  (correlated)
+--   s2    contains 'bbb' for an independent ~30% of rows        (independent)
+--
+-- Marginally, the four regex predicates look alike: the same ~30% selectivity
+-- and comparable cost (s1 is about twice as long as s2). Their *conditional*
+-- selectivities behind the proxy differ completely: after `c0 = 1`, the three
+-- s1 regexes keep every survivor (each re-tests the proxy's condition) while
+-- the s2 regex still discards ~70%. Only joint statistics can see that; an
+-- independence assumption rates all four equally selective in every position.
+--
+-- PRED_FILL sets the filler width around each marker (a non-matching
+-- `regexp_like` must scan the whole value), and PRED_ROWS sizes the table.
+CREATE TABLE t AS
+SELECT
+  CASE WHEN in_proxy THEN 1 ELSE 0 END AS c0,
+  pad || CASE WHEN in_proxy THEN 'aaa' ELSE 'zzz' END
+    || pad || CASE WHEN in_proxy THEN 'ccc' ELSE 'zzz' END
+    || pad || CASE WHEN in_proxy THEN 'ddd' ELSE 'zzz' END || pad AS s1,
+  pad || CASE WHEN in_indep THEN 'bbb' ELSE 'zzz' END || pad AS s2
+FROM (  -- row classes and the filler are derived once, then assembled above
+  SELECT
+    (value * 7) % 100 < 30 AS in_proxy,
+    (value * 13) % 100 < 30 AS in_indep,
+    repeat('q', ${PRED_FILL:-30}) AS pad
+  FROM generate_series(1, ${PRED_ROWS:-1000000})
+) AS gen;
diff --git a/benchmarks/sql_benchmarks/predicate_eval/queries/correlation/q73.sql b/benchmarks/sql_benchmarks/predicate_eval/queries/correlation/q73.sql
@@ -0,0 +1,14 @@
+-- Hidden: `c0 = 1` is a perfect proxy for all three s1 regexes -- after the
+-- cheap proxy, each s1 regex keeps every survivor while the equally selective
+-- (~30%) s2 regex still discards ~70%. The optimal order is [c0, s2, s1...]
+-- (one informative regex on 30% of rows, the three redundant ones on 9%),
+-- but marginal statistics cannot tell the four regexes apart in any position:
+-- ranking them takes their *joint* distribution with the proxy. Written with
+-- the redundant regexes first, grouped with their proxy, as an author
+-- naturally would.
+SELECT count(*) FROM t
+WHERE c0 = 1                     -- cheap integer proxy: keeps ~30% of rows
+  AND regexp_like(s1, 'a.a')     -- redundant behind c0: keeps every survivor
+  AND regexp_like(s1, 'c.c')     -- redundant behind c0: keeps every survivor
+  AND regexp_like(s1, 'd.d')     -- redundant behind c0: keeps every survivor
+  AND regexp_like(s2, 'b.b');    -- independent of c0: keeps ~30% of survivors