should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

View File

@@ -0,0 +1,219 @@
# name: test/sql/parallelism/intraquery/depth_first_evaluation.test_slow
# description: Test that query plans are evaluated in a depth-first fashion
# group: [intraquery]
# NOTE(review): 'mode skip' tells the test runner to skip the records that follow,
# so as written nothing below is executed; the reason is not recorded in this file - confirm before re-enabling
mode skip
# we need a persistent DB because we want to compress the table that we're working with
load __TEST_DIR__/depth_first_evaluation.db
# we don't want any disk spilling because we're testing memory pressure
statement ok
SET temp_directory = ''
# 2GiB is pretty tight, each single aggregation should take only slightly less
# in this test, we're testing that we don't have multiple aggregations active simultaneously
# so this limit should be tight enough
statement ok
SET memory_limit = '2GiB'
# fixed thread count (presumably so that memory usage does not depend on the machine running the test)
statement ok
SET threads = 4
# 10M integers but the table is tiny because of delta compression
statement ok
CREATE TABLE integers AS SELECT range i FROM range(10_000_000)
# sanity check: one of these distinct aggregations on its own should easily fit in memory
query I
SELECT count(*) c FROM (SELECT DISTINCT i FROM integers)
----
10000000
# the next query performs 10 of the same distinct aggregations and unions them together
# each distinct aggregation has a different limit (which doesn't do anything, every limit exceeds the 10M rows in the table)
# so that this test is future-proof (in case DuckDB does any common sub-plan elimination in the future)
# the idea here is that if DuckDB would do breadth-first plan evaluation (like it did before)
# DuckDB would first perform the 'Sink' for every distinct aggregation one by one
# this would create HUGE temporary intermediates
# only after that DuckDB would perform the 'Finalize' for every distinct aggregation one by one
# the 'Finalize' reduces the data size to a single row
# so, this used to throw an OOM exception given the current memory limit
# with depth-first plan evaluation, DuckDB performs 'Finalize' for every distinct aggregation,
# before starting 'Sink' on the next distinct aggregation
# now this query completes without much memory pressure!
# every branch counts 10M distinct values, so the expected sum is 10 x 10M = 100M
query I
SELECT sum(c)
FROM (
SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_000)
UNION ALL
SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_001)
UNION ALL
SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_002)
UNION ALL
SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_003)
UNION ALL
SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_004)
UNION ALL
SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_005)
UNION ALL
SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_006)
UNION ALL
SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_007)
UNION ALL
SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_008)
UNION ALL
SELECT count(DISTINCT i) c FROM (SELECT i FROM integers LIMIT 100_000_009)
)
----
100000000
# the integers table is not used by the remaining queries in this file
statement ok
DROP TABLE integers
# column i only holds a whole number where range is a multiple of 100_000 (0, 100_000, 200_000, etc.),
# i.e., 100 such values spread out across the range 0 to 10 million
# all other values in column i are equal to range + 0.5
# column j and columns k0 through k9 are just ranges from 0 to 10 million, cast to DOUBLE
# we have to do this so our statistics propagation and dynamic join filters don't trivialise the query
statement ok
CREATE TABLE doubles AS
SELECT
CASE WHEN range % 100_000 = 0 THEN range ELSE range + 0.5 END i,
range::DOUBLE j,
range::DOUBLE k0,
range::DOUBLE k1,
range::DOUBLE k2,
range::DOUBLE k3,
range::DOUBLE k4,
range::DOUBLE k5,
range::DOUBLE k6,
range::DOUBLE k7,
range::DOUBLE k8,
range::DOUBLE k9
FROM range(10_000_000)
# one of these should always fit in memory
# the idea is that the cte is a large join (10m x 10m)
# but it's really selective, only 100 tuples come out of it
# then, we join with doubles union'ed with itself, so that it becomes the probe pipeline,
# i.e., it has a higher cardinality than the selective join, which goes into a build
# each of the 100 cte tuples matches one row in both copies of doubles, so we expect 200
query I
WITH c AS NOT MATERIALIZED (
SELECT d0.k0
FROM doubles d0
JOIN doubles d1
ON (d0.i = d1.j)
)
SELECT count(*)
FROM (
SELECT * FROM doubles
UNION ALL
SELECT * FROM doubles
) d
JOIN c
ON (d.k0 = c.k0)
----
200
# now we just crank up the number of ctes that we're joining with to 10
# again, if DuckDB would do breadth-first plan evaluation (like it did before)
# DuckDB would 'Sink' into all of the builds in the cte's one by one, creating huge intermediates
# only after that it would perform all the selective joins and reduce the size of the intermediates
# so, this used to throw an OOM exception
# with depth-first plan evaluation, DuckDB performs the selective joins one by one,
# reducing the size of intermediates immediately, and the query completes!
# every cte selects its own k column and carries a different LIMIT that is far above the 100 tuples it produces
# (presumably for the same future-proofing reason as the limits in the distinct aggregation query earlier in this file)
# all k columns hold the same values, so every additional join keeps the same 200 rows
query I
WITH c0 AS NOT MATERIALIZED (
SELECT d0.k0
FROM doubles d0
JOIN doubles d1
ON (d0.i = d1.j)
LIMIT 100_000_000
), c1 AS NOT MATERIALIZED (
SELECT d0.k1
FROM doubles d0
JOIN doubles d1
ON (d0.i = d1.j)
LIMIT 100_000_001
), c2 AS NOT MATERIALIZED (
SELECT d0.k2
FROM doubles d0
JOIN doubles d1
ON (d0.i = d1.j)
LIMIT 100_000_002
), c3 AS NOT MATERIALIZED (
SELECT d0.k3
FROM doubles d0
JOIN doubles d1
ON (d0.i = d1.j)
LIMIT 100_000_003
), c4 AS NOT MATERIALIZED (
SELECT d0.k4
FROM doubles d0
JOIN doubles d1
ON (d0.i = d1.j)
LIMIT 100_000_004
), c5 AS NOT MATERIALIZED (
SELECT d0.k5
FROM doubles d0
JOIN doubles d1
ON (d0.i = d1.j)
LIMIT 100_000_005
), c6 AS NOT MATERIALIZED (
SELECT d0.k6
FROM doubles d0
JOIN doubles d1
ON (d0.i = d1.j)
LIMIT 100_000_006
), c7 AS NOT MATERIALIZED (
SELECT d0.k7
FROM doubles d0
JOIN doubles d1
ON (d0.i = d1.j)
LIMIT 100_000_007
), c8 AS NOT MATERIALIZED (
SELECT d0.k8
FROM doubles d0
JOIN doubles d1
ON (d0.i = d1.j)
LIMIT 100_000_008
), c9 AS NOT MATERIALIZED (
SELECT d0.k9
FROM doubles d0
JOIN doubles d1
ON (d0.i = d1.j)
LIMIT 100_000_009
)
SELECT count(*)
FROM (
SELECT * FROM doubles
UNION ALL
SELECT * FROM doubles
) d
JOIN c0
ON (d.k0 = c0.k0)
JOIN c1
ON (d.k1 = c1.k1)
JOIN c2
ON (d.k2 = c2.k2)
JOIN c3
ON (d.k3 = c3.k3)
JOIN c4
ON (d.k4 = c4.k4)
JOIN c5
ON (d.k5 = c5.k5)
JOIN c6
ON (d.k6 = c6.k6)
JOIN c7
ON (d.k7 = c7.k7)
JOIN c8
ON (d.k8 = c8.k8)
JOIN c9
ON (d.k9 = c9.k9)
----
200

View File

@@ -0,0 +1,43 @@
# name: test/sql/parallelism/intraquery/depth_first_evaluation_union_and_join.test
# description: Test that combinations of unions and joins do not lead to circular dependencies due to depth-first
# group: [intraquery]
# this test needs the tpcds extension (dsdgen is called below)
require tpcds
# generate the TPC-DS tables at a small scale factor
statement ok
call dsdgen(sf=0.01)
# tpcds q14 was giving some issues before
# two (sales x date_dim) joins combined with UNION ALL
# we only check that the statement runs to completion, the result itself is not verified
statement ok
SELECT ss_quantity quantity,
ss_list_price list_price
FROM store_sales,
date_dim
WHERE ss_sold_date_sk = d_date_sk
AND d_year BETWEEN 1999 AND 1999 + 2
UNION ALL SELECT cs_quantity quantity,
cs_list_price list_price
FROM catalog_sales,
date_dim
WHERE cs_sold_date_sk = d_date_sk
AND d_year BETWEEN 1999 AND 1999 + 2
# the same statement, extended with a third branch that joins web_sales with date_dim
statement ok
SELECT ss_quantity quantity,
ss_list_price list_price
FROM store_sales,
date_dim
WHERE ss_sold_date_sk = d_date_sk
AND d_year BETWEEN 1999 AND 1999 + 2
UNION ALL SELECT cs_quantity quantity,
cs_list_price list_price
FROM catalog_sales,
date_dim
WHERE cs_sold_date_sk = d_date_sk
AND d_year BETWEEN 1999 AND 1999 + 2
UNION ALL SELECT ws_quantity quantity,
ws_list_price list_price
FROM web_sales,
date_dim
WHERE ws_sold_date_sk = d_date_sk
AND d_year BETWEEN 1999 AND 1999 + 2

View File

@@ -0,0 +1,21 @@
# name: test/sql/parallelism/intraquery/error_in_pipeline.test
# description: Test errors happening in pipelines
# group: [intraquery]
statement ok
PRAGMA enable_verification
statement ok
PRAGMA threads=16
# one million strings that all hold an integer value
statement ok
create table varchars as select i::varchar i from range(1000000) tbl(i);
# plus a single string that cannot be cast to an integer
statement ok
insert into varchars values ('hello')
# we get a conversion error in the pipeline here
# the cast of 'hello' to int fails inside the subquery; the error message is matched against the regex below
statement error
select (select min(i::int)+tbl.k from varchars) from (values (1), (2), (3)) tbl(k);
----
<REGEX>:.*Conversion Error.*Could not convert string.*

View File

@@ -0,0 +1,81 @@
# name: test/sql/parallelism/intraquery/parallel_materialization.test_slow
# description: Test parallel materialization of results
# group: [intraquery]
statement ok
PRAGMA enable_verification
statement ok
PRAGMA threads=4
statement ok
PRAGMA verify_parallelism
statement ok
CREATE TABLE integers(i INTEGER)
# four lookup values: three fall in the first half of integers (below 2500000), one in the second half
statement ok
CREATE TABLE other_table AS SELECT 337 i UNION ALL SELECT 948247 UNION ALL SELECT 1779793 UNION ALL SELECT 4779793;
# first half of the data: 0 up to (but not including) 2500000, inserted outside of an explicit transaction
statement ok
INSERT INTO integers SELECT * FROM range(2500000)
# second half of the data is inserted inside a transaction that is still open when the queries start,
# so these rows are transaction-local data at that point
statement ok
BEGIN TRANSACTION
statement ok
INSERT INTO integers SELECT * FROM range(2500000, 5000000) tbl(i);
# run these tests twice - once with transaction local data and once without
# (the transaction that holds the second half of the data is committed at the end of the first iteration)
loop i 0 2
# IN-clause (semi join)
# the expected rows are listed in text order, which is why 1779793 comes before 337
query I sort
SELECT * FROM integers WHERE i IN (SELECT * FROM other_table)
----
1779793
337
4779793
948247
# explicit join
query I sort
SELECT * FROM integers JOIN other_table USING(i)
----
1779793
337
4779793
948247
# simple WHERE clause
query I
SELECT * FROM integers WHERE i > 337 AND i < 340
----
338
339
# IN-clause
# 99999999999999 is larger than any value in the table, so it does not match anything
query I sort
SELECT * FROM integers WHERE i IN (337, 948247, 1779793, 4779793, 99999999999999)
----
1779793
337
4779793
948247
# more complex where clause
# matches 337, then 948248 and 948249 (1896494 < i+i <= 1896498), then 4779793 (i*2 = 9559586)
query I
SELECT * FROM integers WHERE i=337 OR (i+i>1896494 AND i+i<= 1896498) OR (i*2=9559586)
----
337
948248
948249
4779793
# commit the open transaction, then open a new one so that the COMMIT of the next iteration has a transaction to close
statement ok
COMMIT
statement ok
BEGIN TRANSACTION
endloop

View File

@@ -0,0 +1,33 @@
# name: test/sql/parallelism/intraquery/parallel_sample.test
# description: Test parallel reservoir sampling
# group: [intraquery]
statement ok
PRAGMA enable_verification
statement ok
PRAGMA threads=4
statement ok
PRAGMA verify_parallelism
# run with detailed profiling enabled, writing the profiling output to a file in the test directory
statement ok
PRAGMA enable_profiling
statement ok
PRAGMA profiling_output='__TEST_DIR__/test.json'
statement ok
PRAGMA profiling_mode = detailed
statement ok
CREATE TABLE integers AS SELECT * FROM range(50000) tbl(i)
# a reservoir sample of 100 rows out of 50000 has to contain exactly 100 rows, on every one of the three runs
loop i 0 3
query I
SELECT COUNT(*) FROM integers TABLESAMPLE RESERVOIR(100)
----
100
endloop

View File

@@ -0,0 +1,161 @@
# name: test/sql/parallelism/intraquery/test_aggregations_parallelism.test_slow
# description: Test parallel aggregations
# group: [intraquery]
statement ok
PRAGMA threads=4
statement ok
PRAGMA verify_parallelism
# run with detailed profiling enabled, writing the profiling output to a file in the test directory
statement ok
PRAGMA enable_profiling
statement ok
PRAGMA profiling_output='__TEST_DIR__/test.json'
statement ok
PRAGMA profiling_mode = detailed
# t: a runs from 0 to 99999, b is a modulo 10
statement ok
create table t as select range a, range%10 b from range(100000);
# bool: 5000 TRUE rows and 5000 FALSE rows (two values crossed with range(5000))
statement ok
create table bool (a bool);
statement ok
insert into bool select i from (values (True),(False)) tbl(i), range(5000);
# regression aggregates, evaluated per parity group (b%2)
# the first argument is the dependent variable (y) and the second the independent variable (x):
# regr_avgx averages b (4 for the even group, 5 for the odd group), regr_avgy averages a
# rowsort is used where the two groups produce different values
query I rowsort
select regr_avgx(a, b) from t group by b%2;
----
4
5
query I rowsort
select regr_avgy(a, b) from t group by b%2;
----
49999
50000
# from here on both groups produce the same value, so the order of the two rows does not matter
query I
select regr_count(a, b) from t group by b%2;
----
50000
50000
query I
select regr_slope(a, b) from t group by b%2;
----
1
1
query I
select regr_r2(a, b) from t group by b%2;
----
0
0
query I
select regr_sxx(a, b) from t group by b%2;
----
400000
400000
query I
select regr_syy(a, b) from t group by b%2;
----
41666666650000
41666666650000
query I
select regr_sxy(a, b) from t group by b%2;
----
400000
400000
query I
select regr_intercept(a, b) from t group by b%2;
----
49995
49995
# the bool table holds both TRUE and FALSE, so bool_or is true (1) and bool_and is false (0)
query II
select bool_or(a) AS or_result,
bool_and(a) AS and_result
from bool;
----
1 0
# approximate distinct counts per parity group: each group has 50000 distinct values of a and 5 distinct values of b
# the expected numbers for a are the estimates, not the exact count
query II rowsort
select approx_count_distinct(a), approx_count_distinct(b) from t group by b%2;
----
41234 5
50630 5
# b at the smallest a (a=0, b=0) and b at the largest a (a=99999, b=9)
query II
select arg_min(b,a), arg_max(b,a) from t;
----
0 9
query I
select corr(a,b) from t group by b%2;
----
0.000098
0.000098
# every value of a occurs exactly once, so the entropy is log2(100000)
query I
select entropy(a) from t;
----
16.609640
# a < 2000 leaves 200 rows for every value of b, so each group yields b to the power of 200
query I
select product(b) from t where a < 2000 group by b ORDER BY ALL;
----
0.000000
1.000000
1606938044258990275541962092341162602522202993782792835301376.000000
265613988875874780598610418785575466612106726486464451918226939374088579537852722003778744614912.000000
2582249878086908589655919172003011874329705792829223512830659356540647622016841194629645353280137831435903171972747493376.000000
62230152778611436341719212978715912065845640340255171677509352007186090271914090005514397667938156624558634630421732454813630518023306608640.000000
426825223812027418890822638940363656845648276125323888998923834808612716932857614130884809238924869437532348038131987282955125485318424988074770692583718912.000000
10461838291314366330317127670141550737702465658592295023670588778589393524795407978164871712859217601822699210766850846310566017409265619375066400522840981934981110562816.000000
4149515568880992958512407863691161151012446232242436899995657329690652811412908146399707048947103794288197886611300789182395151075411775307886874834113963687061181803401509523685376.000000
70550791086553342048730135009513303838175056420995588227179312780658581221418811410649818193340747718196946609062859227923187761346197139563553410502493292326451779794675743751622116224532480.000000
query II
select kurtosis(a),kurtosis(b) from t where a < 5000
----
-1.200000 -1.224267
# skewed input: every even a is replaced by 1, odd values are kept as they are
query I
select skewness(skw_c) from (select case when a %2=0 then 1 else a end as skw_c from t) as skw_tbl
----
0.9295
# add one duplicate per parity group so that mode() has a single answer in each group:
# a=1 and b=1 now occur once more than any other value in the odd group, a=2 and b=2 in the even group
statement ok
insert into t values (1,1),(2,2)
query II
select mode(a), mode(b) from t group by b%2 order by all;
----
1 1
2 2
# b=1 and b=2 each have one extra row because of the insert in this section
query I
select histogram(b) from t;
----
{0=10000, 1=10001, 2=10001, 3=10000, 4=10000, 5=10000, 6=10000, 7=10000, 8=10000, 9=10000}
# string_agg over a constant: every group has far more than 100 rows, so element 100 of the split result exists
query I
select string_split(string_agg('a', ','), ',')[100] from t group by b%2;
----
a
a
# string_agg over NULL input only: the expected result is NULL for both groups
query I
select string_split(string_agg(NULL, ','), ',')[100] from t group by b%2;
----
NULL
NULL

View File

@@ -0,0 +1,51 @@
# name: test/sql/parallelism/intraquery/test_list_parallelism.test_slow
# description: Test list aggregates on small-ish tables in parallel (few thousand rows)
# group: [intraquery]
statement ok
PRAGMA threads=4
statement ok
PRAGMA verify_parallelism
# run with detailed profiling enabled, writing the profiling output to a file in the test directory
statement ok
PRAGMA enable_profiling
statement ok
PRAGMA profiling_output='__TEST_DIR__/test.json'
statement ok
PRAGMA profiling_mode = detailed
statement ok
create table t (a integer, b integer);
# 2000 iterations, each inserting one row with values and one all-NULL row: 4000 rows in total
loop i 0 2000
statement ok
insert into t values (${i},${i}%10);
statement ok
insert into t values (NULL,NULL);
endloop
# LIST(a) keeps NULL values: unnesting the list gives back 2000 non-NULL and 2000 NULL elements
query I
select count(*) from (SELECT UNNEST(l1) as un FROM (SELECT LIST(a) l1 FROM t) t1) t2 where un is not null
----
2000
query I
select count(*) from (SELECT UNNEST(l1) as un FROM (SELECT LIST(a) l1 FROM t) t1) t2 where un is null
----
2000
query I
select count(*) from (SELECT UNNEST(l1) as un FROM (SELECT LIST(a) l1 FROM t) t1) t2
----
4000
# grouped list aggregate: 2000 distinct values of a plus one group for NULL
query I
select count (*) from (SELECT LIST(b) l1 FROM t group by a) t1
----
2001

View File

@@ -0,0 +1,74 @@
# name: test/sql/parallelism/intraquery/test_parallel_nested_aggregates.test
# description: Test parallel aggregations
# group: [intraquery]
# the aggregates in this file take list and struct values as input
statement ok
PRAGMA threads=4
statement ok
PRAGMA verify_parallelism
# run with detailed profiling enabled, writing the profiling output to a file in the test directory
statement ok
PRAGMA enable_profiling
statement ok
PRAGMA profiling_output='__TEST_DIR__/test.json'
statement ok
PRAGMA profiling_mode = detailed
# t: a runs from 0 to 99999, b is a modulo 10; the queries below group on b%2 (even and odd a)
statement ok
create table t as select range a, range%10 b from range(100000);
# non-deterministic so we can't check the result
statement ok
select first([a]) from t group by b%2;
# single-element lists: min/max follow the min/max of a within each group
query II rowsort
select min([a]), max([a]) from t group by b%2;
----
[0] [99998]
[1] [99999]
# three-element lists: the first element (-a) decides, so the minimum has the largest a and the maximum the smallest a
query II rowsort
select min([-a, 1, a]), max([-a, 1, a]) from t group by b%2;
----
[-99998, 1, 99998] [0, 1, 0]
[-99999, 1, 99999] [-1, 1, 1]
# structs with a single field
query II rowsort
select min({'i': a}), max({'i': a}) from t group by b%2 order by all;
----
{'i': 0} {'i': 99998}
{'i': 1} {'i': 99999}
# structs with two fields
query II rowsort
select min({'i': a, 'j': a % 2}), max({'i': a, 'j': a % 2}) from t group by b%2;
----
{'i': 0, 'j': 0} {'i': 99998, 'j': 0}
{'i': 1, 'j': 1} {'i': 99999, 'j': 1}
# NULL inputs
# a typed NULL list or struct as the only input: the expected result is NULL for both groups
query I
select first(NULL::INT[]) from t group by b%2;
----
NULL
NULL
query I
select min(NULL::INT[]) from t group by b%2;
----
NULL
NULL
query I
select first(NULL::ROW(i INTEGER)) from t group by b%2;
----
NULL
NULL
query I
select min(NULL::ROW(i INTEGER)) from t group by b%2;
----
NULL
NULL

View File

@@ -0,0 +1,69 @@
# name: test/sql/parallelism/intraquery/test_persistent_parallelism.test
# description: Test force parallelism on small-ish tables (few thousand rows)
# group: [intraquery]
# NOTE(review): the description above is the same as the one in test_verify_parallelism.test;
# what sets this file apart is that it runs against a database file and restarts in between
load __TEST_DIR__/test_parallelism.db
# run with detailed profiling enabled, writing the profiling output to a file in the test directory
statement ok
PRAGMA enable_profiling
statement ok
PRAGMA profiling_output='__TEST_DIR__/test.json'
statement ok
PRAGMA profiling_mode = detailed
# create the table inside a transaction and check it both before and after the commit
statement ok
BEGIN TRANSACTION
statement ok
CREATE TABLE integers AS SELECT * FROM range(0, 5000) tbl(i)
query II
SELECT MIN(i), MAX(i) FROM integers
----
0 4999
statement ok
COMMIT
query II
SELECT MIN(i), MAX(i) FROM integers
----
0 4999
# restart the database; verification, the thread count and verify_parallelism are only switched on from here
# NOTE(review): presumably the table is read back from the database file after this point - confirm against the runner docs
restart
statement ok
PRAGMA enable_verification
statement ok
PRAGMA threads=4
statement ok
PRAGMA verify_parallelism
query II
SELECT MIN(i), MAX(i) FROM integers
----
0 4999
# add some transient data
# the new rows (5000 to 9999) are checked while the transaction is open and again after the commit
statement ok
BEGIN TRANSACTION
statement ok
INSERT INTO integers SELECT * FROM range(5000, 10000)
query II
SELECT MIN(i), MAX(i) FROM integers
----
0 9999
statement ok
COMMIT
query II
SELECT MIN(i), MAX(i) FROM integers
----
0 9999

View File

@@ -0,0 +1,56 @@
# name: test/sql/parallelism/intraquery/test_simple_parallelism.test
# description: Test simple parallelism
# group: [intraquery]
statement ok
PRAGMA enable_verification
statement ok
PRAGMA threads=4
statement ok
PRAGMA verify_parallelism
# run with detailed profiling enabled, writing the profiling output to a file in the test directory
statement ok
PRAGMA enable_profiling
statement ok
PRAGMA profiling_output='__TEST_DIR__/test.json'
statement ok
PRAGMA profiling_mode = detailed
statement ok
CREATE TABLE integers(i INTEGER)
# three values and one NULL
statement ok
INSERT INTO integers VALUES (1), (2), (3), (NULL)
# perform a query with many pipelines
# four separate aggregations over the same table; the NULL row does not contribute: sum 6, avg 2, min 1, max 3
query R
SELECT SUM(i) FROM integers UNION ALL SELECT AVG(i) FROM integers UNION ALL SELECT MIN(i) FROM integers UNION ALL SELECT MAX(i) FROM integers;
----
6.000000
2.000000
1.000000
3.000000
# the same query as a view, used further down to check that a failed query leaves no error behind
statement ok
CREATE VIEW v1(i) AS SELECT SUM(i) FROM integers UNION ALL SELECT AVG(i) FROM integers UNION ALL SELECT MIN(i) FROM integers UNION ALL SELECT MAX(i) FROM integers;
# errors in separate pipelines
# two of the four branches cast the INTEGER column to DATE, which is expected to fail with the error matched below
statement error
SELECT SUM(i) FROM integers UNION ALL SELECT AVG(i) FROM integers UNION ALL SELECT MIN(i::DATE) FROM integers UNION ALL SELECT MAX(i::DATE) FROM integers;
----
<REGEX>:Conversion Error.*Unimplemented type for cast.*
# errors are properly cleared
# self-join of the view: each of its four values matches itself exactly once
query R sort
SELECT * FROM v1 t1 JOIN v1 t2 USING (i);
----
1.000000
2.000000
3.000000
6.000000

View File

@@ -0,0 +1,47 @@
# name: test/sql/parallelism/intraquery/test_verify_parallelism.test
# description: Test force parallelism on small-ish tables (few thousand rows)
# group: [intraquery]
statement ok
PRAGMA enable_verification
statement ok
PRAGMA threads=4
statement ok
PRAGMA verify_parallelism
# run with detailed profiling enabled, writing the profiling output to a file in the test directory
statement ok
PRAGMA enable_profiling
statement ok
PRAGMA profiling_output='__TEST_DIR__/test.json'
statement ok
PRAGMA profiling_mode = detailed
# 5000 rows: 0 to 4999
statement ok
CREATE TABLE integers AS SELECT * FROM range(0, 5000) tbl(i)
# test simple aggregates
query II
SELECT MIN(i), MAX(i) FROM integers
----
0 4999
# same aggregates with a filter in front
query II
SELECT MIN(i), MAX(i) FROM integers WHERE i>2000
----
2001 4999
# test grouped aggregates
# i is j modulo 4, which gives four groups of 1250 rows each
statement ok
CREATE TABLE integers2 AS SELECT i%4 i, i j FROM range(0, 5000) tbl(i)
query IIII
SELECT i, SUM(j), MIN(j), MAX(j) FROM integers2 GROUP BY i ORDER BY i
----
0 3122500 0 4996
1 3123750 1 4997
2 3125000 2 4998
3 3126250 3 4999