Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
subgroup correlation

template sql_benchmarks/predicate_eval/predicate_eval.benchmark.template
SUBGROUP=correlation
QPAD=73
DATASET=corrproxy
NAME=correlation_q73_redundant_proxy
30 changes: 30 additions & 0 deletions benchmarks/sql_benchmarks/predicate_eval/load/corrproxy.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
-- Correlated-proxy dataset: a cheap integer predicate that is a perfect proxy
-- for three string predicates, plus one independent string predicate.
--
-- c0 = 1 for ~30% of rows (cheap proxy)
-- s1 contains 'aaa', 'ccc' and 'ddd' exactly where c0 = 1 (correlated)
-- s2 contains 'bbb' for an independent ~30% of rows (independent)
--
-- Marginally, the four regex predicates are indistinguishable: similar cost,
-- the same ~30% selectivity. Their *conditional* selectivities behind the
-- proxy differ completely: after `c0 = 1`, the three s1 regexes keep every
-- survivor (each re-tests the proxy's condition) while the s2 regex still
-- discards ~70%. Only joint statistics can see that; an independence
-- assumption prices all four regexes identically in every position.
--
-- PRED_FILL (default 30) sets the width of each run of 'q' filler around the
-- markers (a non-matching `regexp_like` must scan the whole value), and
-- PRED_ROWS (default 1000000) sets the number of rows in the table.
-- Materialise the benchmark table `t` with three columns:
--   c0  integer flag, 1 when (value * 7) % 100 < 30, otherwise 0
--   s1  filler || marker || filler || marker || filler || marker || filler,
--       where the markers are 'aaa', 'ccc', 'ddd' on rows with c0 = 1 and
--       'zzz' on every other row
--   s2  filler || marker || filler, where the marker is 'bbb' when
--       (value * 13) % 100 < 30 and 'zzz' otherwise
-- The per-row flags and the filler string are computed once in the CTE and
-- reused below, so every s1 marker provably shares the c0 condition.
-- NOTE(review): s1 carries four filler runs and three markers while s2 carries
-- two filler runs and one marker, so s1 values are roughly twice as long as
-- s2 values -- confirm this still counts as "similar cost" for the benchmark.
CREATE TABLE t AS
WITH flagged AS (
    SELECT
        (value * 7) % 100 < 30 AS is_proxy,
        (value * 13) % 100 < 30 AS is_independent,
        repeat('q', ${PRED_FILL:-30}) AS filler
    FROM generate_series(1, ${PRED_ROWS:-1000000})
)
SELECT
    CASE WHEN is_proxy THEN 1 ELSE 0 END AS c0,
    filler
        || CASE WHEN is_proxy THEN 'aaa' ELSE 'zzz' END
        || filler
        || CASE WHEN is_proxy THEN 'ccc' ELSE 'zzz' END
        || filler
        || CASE WHEN is_proxy THEN 'ddd' ELSE 'zzz' END
        || filler AS s1,
    filler
        || CASE WHEN is_independent THEN 'bbb' ELSE 'zzz' END
        || filler AS s2
FROM flagged;
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- Counts the rows of `t` that pass one cheap integer predicate and four regex
-- predicates; the conjunct order below is deliberate (see the end of this
-- comment) and is what the benchmark exercises.
--
-- Hidden: `c0 = 1` is a perfect proxy for all three s1 regexes -- after the
-- cheap proxy, each s1 regex keeps every survivor while the equally selective
-- (~30%) s2 regex still discards ~70%. The optimal order is [c0, s2, s1...]
-- (one informative regex on 30% of rows, the three redundant ones on 9%),
-- but marginal statistics cannot tell the four regexes apart in any position:
-- ranking them takes their *joint* distribution with the proxy.
--
-- The patterns 'a.a', 'c.c', 'd.d' and 'b.b' match the 'aaa', 'ccc', 'ddd'
-- and 'bbb' markers that the corrproxy load script embeds in s1 and s2; the
-- only other characters in those columns are 'q' and 'z', which none of the
-- patterns match.
--
-- Written with the redundant regexes first, grouped with their proxy, as an
-- author naturally would -- i.e. not in the optimal order, so any speed-up
-- has to come from the engine reordering the conjuncts.
SELECT count(*) FROM t
WHERE c0 = 1
AND regexp_like(s1, 'a.a')
AND regexp_like(s1, 'c.c')
AND regexp_like(s1, 'd.d')
AND regexp_like(s2, 'b.b');
Loading