should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion

View File

@@ -0,0 +1,53 @@
# name: test/sql/storage/parallel/batch_insert_filtered_row_groups.test_slow
# description: Test batch insert with small batches
# group: [parallel]
require vector_size 512
require parquet
# NOTE(review): the database name says insert_mix_batches.db although this test is
# batch_insert_filtered_row_groups — looks copied from a sibling test; confirm intended.
load __TEST_DIR__/insert_mix_batches.db
# Two iterations: the second repeats the whole scenario under the 500MB
# memory limit that is set at the end of iteration one.
loop i 0 2
# Write 10M rows into many tiny (5000-row) parquet row groups.
statement ok
COPY (FROM range(10000000)) TO '__TEST_DIR__/many_small_batches.parquet' (row_group_size 5000)
# Keep only every third 6144-row chunk, so the batch insert receives filtered,
# gap-ridden batches that must still be merged into full-size row groups.
statement ok
CREATE TABLE test AS FROM '__TEST_DIR__/many_small_batches.parquet' t(i) WHERE (i // 6144) % 3 = 0;
# Spot-check that insertion order survived the filtered parallel load.
query I
SELECT * FROM test LIMIT 5 OFFSET 500000
----
1495328
1495329
1495330
1495331
1495332
# An empty result proves the table is globally ascending
# (no row is <= its predecessor).
query I
SELECT * FROM test QUALIFY i <= lag(i) over ()
----
# ensure that we still write close to our row group size as our row group size count, even for different block sizes
query I
SELECT MAX(count) > 100000 FROM pragma_storage_info('test')
----
true
# The median differs between block sizes because the upper bound of the segment size is the block size.
require block_size 262144
query I
SELECT MEDIAN(count) > 100000 FROM pragma_storage_info('test')
----
true
statement ok
DROP TABLE test
# repeat with a low memory limit
statement ok
SET memory_limit='500MB'
endloop

View File

@@ -0,0 +1,223 @@
# name: test/sql/storage/parallel/batch_insert_mix_batches.test_slow
# description: Test batch insert with small batches
# group: [parallel]
require parquet
load __TEST_DIR__/insert_mix_batches.db
# Four parquet files covering contiguous ranges with deliberately mismatched
# row-group sizes: tiny (5000), large (200000), odd (999) and near-default (99979).
statement ok
COPY (FROM range(100000) tbl(i)) TO '__TEST_DIR__/mix_batches_small.parquet' (ROW_GROUP_SIZE 5000)
statement ok
COPY (FROM range(100000, 400000) tbl(i)) TO '__TEST_DIR__/mix_batches_large.parquet' (ROW_GROUP_SIZE 200000)
statement ok
COPY (FROM range(400000, 700000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd.parquet' (ROW_GROUP_SIZE 999)
statement ok
COPY (FROM range(700000, 1000000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd_again.parquet' (ROW_GROUP_SIZE 99979)
# create views that read the batches
statement ok
CREATE VIEW v1 AS SELECT * FROM parquet_scan(['__TEST_DIR__/mix_batches_small.parquet', '__TEST_DIR__/mix_batches_large.parquet', '__TEST_DIR__/mix_batches_odd.parquet', '__TEST_DIR__/mix_batches_odd_again.parquet'])
statement ok
CREATE VIEW v2 AS FROM v1 WHERE (i//10000)%2=0;
statement ok
CREATE VIEW v3 AS FROM v1 WHERE (i//10000)%2=0 OR (i>200000 AND i < 400000) OR (i>600000 AND i < 800000);
# Iteration one uses the parquet_scan views above; iteration two re-runs the same
# checks with the UNION ALL based views re-created at the bottom of the loop.
loop i 0 2
# CTAS under "query I" checks the reported inserted-row count.
query I
CREATE TABLE integers AS FROM v1;
----
1000000
# verify that we are not consuming an unnecessarily giant amount of blocks
# we have a total of 1.1M values - this should not be more than 20 row groups (ideally it is 10)
query I
select count(distinct row_group_id) < 20 from pragma_storage_info('integers');
----
true
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM v1
----
499999500000 0 999999 1000000 1000000
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers
----
499999500000 0 999999 1000000 1000000
# Order check around the first file boundary (100000): view and table must agree.
query I
SELECT * FROM v1 LIMIT 5 OFFSET 99998
----
99998
99999
100000
100001
100002
query I
SELECT * FROM integers LIMIT 5 OFFSET 99998
----
99998
99999
100000
100001
100002
# now do the same, but filter out half of the values
query I
CREATE TABLE integers2 AS FROM v2
----
500000
# verify that we are not consuming an unnecessarily giant amount of blocks
# we have a total of 500K values - this should not be more than 20 row groups (ideally it is 5)
query I
select count(distinct row_group_id) < 20 from pragma_storage_info('integers2');
----
true
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM v2
----
247499750000 0 989999 500000 500000
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers2
----
247499750000 0 989999 500000 500000
query I
SELECT * FROM v2 LIMIT 5 OFFSET 99998
----
189998
189999
200000
200001
200002
query I
SELECT * FROM integers2 LIMIT 5 OFFSET 99998
----
189998
189999
200000
200001
200002
# do it again, but this time only filter out SOME small batches
query I
CREATE TABLE integers3 AS FROM v3
----
700000
# verify that we are not consuming an unnecessarily giant amount of blocks
# we have a total of 750K values - this should not be more than 20 row groups (ideally it is 7)
query I
select count(distinct row_group_id) < 20 from pragma_storage_info('integers3');
----
true
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM v3
----
348499650000 0 989999 700000 700000
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers3
----
348499650000 0 989999 700000 700000
query I
SELECT * FROM v3 LIMIT 5 OFFSET 9999
----
9999
20000
20001
20002
20003
query I
SELECT * FROM integers3 LIMIT 5 OFFSET 9999
----
9999
20000
20001
20002
20003
statement ok
DROP TABLE integers
statement ok
DROP TABLE integers2
statement ok
DROP TABLE integers3
statement ok
drop view if exists v2;
statement ok
drop view if exists v3;
# create views that read the batches using unions
statement ok
CREATE OR REPLACE VIEW v1 AS FROM '__TEST_DIR__/mix_batches_small.parquet' UNION ALL FROM '__TEST_DIR__/mix_batches_large.parquet' UNION ALL FROM '__TEST_DIR__/mix_batches_odd.parquet' UNION ALL FROM '__TEST_DIR__/mix_batches_odd_again.parquet'
statement ok
CREATE OR REPLACE VIEW v2 AS FROM v1 WHERE (i//10000)%2=0;
statement ok
CREATE OR REPLACE VIEW v3 AS FROM v1 WHERE (i//10000)%2=0 OR (i>200000 AND i < 400000) OR (i>600000 AND i < 800000);
endloop
statement ok
CREATE TABLE integers4(i INTEGER)
# mix batches transaction: insert all four files in separate statements inside
# one transaction and verify the committed table looks like a single batch load
statement ok
BEGIN TRANSACTION
statement ok
INSERT INTO integers4 FROM '__TEST_DIR__/mix_batches_small.parquet'
statement ok
INSERT INTO integers4 FROM '__TEST_DIR__/mix_batches_large.parquet'
statement ok
INSERT INTO integers4 FROM '__TEST_DIR__/mix_batches_odd.parquet'
statement ok
INSERT INTO integers4 FROM '__TEST_DIR__/mix_batches_odd_again.parquet'
statement ok
COMMIT
query I
select count(distinct row_group_id) < 20 from pragma_storage_info('integers4');
----
true
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers4
----
499999500000 0 999999 1000000 1000000
query I
SELECT * FROM integers4 LIMIT 5 OFFSET 99998
----
99998
99999
100000
100001
100002

View File

@@ -0,0 +1,185 @@
# name: test/sql/storage/parallel/batch_insert_small_batches.test_slow
# description: Test batch insert with small batches
# group: [parallel]
require parquet
load __TEST_DIR__/insert_small_batches.db
# Run the whole scenario once with tiny (5000-row) and once with
# full-size (100000-row) parquet row groups.
foreach row_group_size 5000 100000
statement ok
COPY (FROM range(1000000) tbl(i)) TO '__TEST_DIR__/small_batches.parquet' (ROW_GROUP_SIZE ${row_group_size})
statement ok
drop view if exists v2;
statement ok
drop view if exists v3;
statement ok
CREATE VIEW v1 AS SELECT * FROM '__TEST_DIR__/small_batches.parquet'
statement ok
CREATE VIEW v2 AS FROM v1 WHERE (i//10000)%2=0;
statement ok
CREATE VIEW v3 AS FROM v1 WHERE (i//10000)%2=0 OR (i>200000 AND i < 400000) OR (i>600000 AND i < 800000);
# CTAS under "query I" checks the reported inserted-row count.
query I
CREATE TABLE integers AS FROM v1;
----
1000000
# verify that we are not consuming an unnecessarily giant amount of blocks
# we have a total of 1.1M values - this should not be more than 20 row groups (ideally it is 10)
query I
select count(distinct row_group_id) < 20 from pragma_storage_info('integers');
----
true
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM v1
----
499999500000 0 999999 1000000 1000000
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers
----
499999500000 0 999999 1000000 1000000
# An empty result proves strictly ascending order (no value <= its predecessor).
query II
SELECT * FROM (select i, lag(i) over () from v1) t(i, lag) WHERE i <= lag
----
query II
SELECT * FROM (select i, lag(i) over () from integers) t(i, lag) WHERE i <= lag
----
query I
SELECT * FROM v1 LIMIT 5 OFFSET 99998
----
99998
99999
100000
100001
100002
query I
SELECT * FROM integers LIMIT 5 OFFSET 99998
----
99998
99999
100000
100001
100002
# now do the same, but filter out half of the values
query I
CREATE TABLE integers2 AS FROM v2;
----
500000
# verify that we are not consuming an unnecessarily giant amount of blocks
# we have a total of 500K values - this should not be more than 20 row groups (ideally it is 5)
query I
select count(distinct row_group_id) < 20 from pragma_storage_info('integers2');
----
true
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM v2;
----
247499750000 0 989999 500000 500000
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers2
----
247499750000 0 989999 500000 500000
query II
SELECT * FROM (select i, lag(i) over () from v2) t(i, lag) WHERE i <= lag
----
query II
SELECT * FROM (select i, lag(i) over () from integers2) t(i, lag) WHERE i <= lag
----
query I
SELECT * FROM v2 LIMIT 5 OFFSET 99998
----
189998
189999
200000
200001
200002
query I
SELECT * FROM integers2 LIMIT 5 OFFSET 99998
----
189998
189999
200000
200001
200002
# do it again, but this time only filter out SOME small batches
query I
CREATE TABLE integers3 AS FROM v3
----
700000
# verify that we are not consuming an unnecessarily giant amount of blocks
# we have a total of 750K values - this should not be more than 20 row groups (ideally it is 7)
query I
select count(distinct row_group_id) < 20 from pragma_storage_info('integers3');
----
true
# The shared "full_match3" label makes the harness additionally require that the
# view result and the table result are identical to each other.
# NOTE(review): the CTAS above reports 700000 inserted rows, yet these aggregates
# expect COUNT(*)=602816 on the same unmodified table — both cannot pass. One of
# the two expected-value sets looks stale (the sibling mix_batches test expects
# 348499650000 / 700000 for the identical v3 filter). Confirm against the engine.
query IIIII nosort full_match3
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM v3
----
261256463520 0 802815 602816 602816
query IIIII nosort full_match3
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers3
----
261256463520 0 802815 602816 602816
query I
SELECT * FROM v3 LIMIT 5 OFFSET 9999
----
9999
20000
20001
20002
20003
query I
SELECT * FROM integers3 LIMIT 5 OFFSET 9999
----
9999
20000
20001
20002
20003
statement ok
DROP VIEW v2
statement ok
DROP VIEW v3
statement ok
DROP VIEW v1;
statement ok
DROP TABLE integers
statement ok
DROP TABLE integers2
statement ok
DROP TABLE integers3
endloop

View File

@@ -0,0 +1,52 @@
# name: test/sql/storage/parallel/batch_row_group_size_plus_one.test_slow
# description: Test batches that are slightly larger than a single row group
# group: [parallel]
require parquet
load __TEST_DIR__/insert_row_group_size_plus_one.db
# write many batches of row group size plus one vector
# 124928 = 122880 + 2048 — presumably the default row-group size plus one
# standard vector, so every batch spills one vector into the next row group.
# TODO(review): confirm these defaults for the build under test.
statement ok
COPY (FROM range(10000000) tbl(i)) TO '__TEST_DIR__/row_group_size_plus_one.parquet' (ROW_GROUP_SIZE 124928)
# create a view that reads the batch
statement ok
CREATE VIEW v1 AS SELECT * FROM parquet_scan('__TEST_DIR__/row_group_size_plus_one.parquet')
# CTAS under "query I" checks the reported inserted-row count.
query I
CREATE TABLE integers AS FROM v1;
----
10000000
# verify that we are not consuming an unnecessarily giant amount of blocks
# we have a total of 10M values - this should not be more than 200 row groups (ideally it is 100)
query I
select count(distinct row_group_id) < 200 from pragma_storage_info('integers');
----
true
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM v1
----
49999995000000 0 9999999 10000000 10000000
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers
----
49999995000000 0 9999999 10000000 10000000
# Order checks: a window in the middle and the very tail of the table.
query I
SELECT * FROM integers LIMIT 5 OFFSET 99998
----
99998
99999
100000
100001
100002
query I
SELECT * FROM integers LIMIT 5 OFFSET 9999998
----
9999998
9999999

View File

@@ -0,0 +1,165 @@
# name: test/sql/storage/parallel/custom_row_group_size.test_slow
# description: Test batch insert with small batches
# group: [parallel]
require parquet
# Attach a database with a custom, larger row-group size (204800 rows).
# NOTE(review): STORAGE_VERSION 'v1.2.0' pins the storage format — presumably
# required for the custom row-group size to persist; confirm.
statement ok
ATTACH '__TEST_DIR__/custom_row_group_size.db' AS custom_row_group_size (ROW_GROUP_SIZE 204800, STORAGE_VERSION 'v1.2.0')
statement ok
USE custom_row_group_size
statement ok
COPY (FROM range(100000) tbl(i)) TO '__TEST_DIR__/mix_batches_small.parquet' (ROW_GROUP_SIZE 5000)
statement ok
COPY (FROM range(100000, 400000) tbl(i)) TO '__TEST_DIR__/mix_batches_large.parquet' (ROW_GROUP_SIZE 200000)
statement ok
COPY (FROM range(400000, 700000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd.parquet' (ROW_GROUP_SIZE 999)
statement ok
COPY (FROM range(700000, 1000000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd_again.parquet' (ROW_GROUP_SIZE 99979)
# create views that read the batches
statement ok
CREATE VIEW v1 AS SELECT * FROM parquet_scan(['__TEST_DIR__/mix_batches_small.parquet', '__TEST_DIR__/mix_batches_large.parquet', '__TEST_DIR__/mix_batches_odd.parquet', '__TEST_DIR__/mix_batches_odd_again.parquet'])
statement ok
CREATE VIEW v2 AS FROM v1 WHERE (i//10000)%2=0;
statement ok
CREATE VIEW v3 AS FROM v1 WHERE (i//10000)%2=0 OR (i>200000 AND i < 400000) OR (i>600000 AND i < 800000);
query I
CREATE TABLE integers AS FROM v1;
----
1000000
# verify we are actually creating larger row groups
query I
SELECT MAX(count) FROM pragma_storage_info('integers')
----
204800
# we have a total of 1M values - this should not be more than 10 row groups (ideally it is 5)
query I
select count(distinct row_group_id) < 10 from pragma_storage_info('integers');
----
true
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM v1
----
499999500000 0 999999 1000000 1000000
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers
----
499999500000 0 999999 1000000 1000000
# now do the same, but filter out half of the values
query I
CREATE TABLE integers2 AS FROM v2
----
500000
# also test deletions ("query I" on DML checks the affected-row count)
query I
DELETE FROM integers WHERE (i//10000)%2<>0;
----
500000
# integers (after the delete) and integers2 must now be identical.
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers
----
247499750000 0 989999 500000 500000
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers2
----
247499750000 0 989999 500000 500000
# test updates
query I
UPDATE integers SET i=i+1 WHERE i%2=0
----
250000
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers
----
247500000000 1 989999 500000 500000
query I
CREATE TABLE integers3 AS FROM v3
----
700000
# verify that we are not consuming an unnecessarily giant amount of blocks
# we have a total of 750K values - this should not be more than 10 row groups (ideally it is 4)
query I
select count(distinct row_group_id) < 10 from pragma_storage_info('integers3');
----
true
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers3
----
348499650000 0 989999 700000 700000
# non-batch insert
statement ok
SET preserve_insertion_order = false
statement ok
CREATE TABLE integers4 AS FROM integers
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers4
----
247500000000 1 989999 500000 500000
# re-attach without the parameter: the persisted data must read back unchanged
statement ok
ATTACH ':memory:' AS mem
statement ok
USE mem
statement ok
DETACH custom_row_group_size
statement ok
ATTACH '__TEST_DIR__/custom_row_group_size.db' AS custom_row_group_size
statement ok
USE custom_row_group_size
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers
----
247500000000 1 989999 500000 500000
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers2
----
247499750000 0 989999 500000 500000
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers3
----
348499650000 0 989999 700000 700000
# invalid row group size parameters
statement error
ATTACH '__TEST_DIR__/custom_row_group_size_xx.db' AS custom_row_group_size_x1 (ROW_GROUP_SIZE 0)
----
row group size must be bigger than 0
statement error
ATTACH '__TEST_DIR__/custom_row_group_size_xx.db' AS custom_row_group_size_x2 (ROW_GROUP_SIZE 77)
----
row group size must be divisible by the vector size

View File

@@ -0,0 +1,119 @@
# name: test/sql/storage/parallel/insert_many_compressible_batches.test_slow
# description: Test writing many compressible batches
# group: [parallel]
require vector_size 512
require parquet
load __TEST_DIR__/insert_many_compressible_batches.db
# 50M values, extremely compressible: a single 1 every 50000 rows, 0 elsewhere.
query I
CREATE TABLE integers AS SELECT CASE WHEN i % 50000 = 0 THEN 1 ELSE 0 END AS i FROM range(50000000) tbl(i);
----
50000000
# check the block count and median number of rows per row group
query I
SELECT COUNT(DISTINCT block_id) < 4 FROM pragma_storage_info('integers');
----
true
query I
SELECT MEDIAN(count) FROM pragma_storage_info('integers');
----
122880
statement ok
COPY integers TO '__TEST_DIR__/integers.parquet'
# verify that reading while preserving insertion order creates the same size table,
# with very small block variations for compact block sizes
statement ok
CREATE TABLE integers_parquet AS FROM '__TEST_DIR__/integers.parquet';
query I
SELECT * FROM integers_parquet LIMIT 5
----
1
0
0
0
0
query II
SELECT i, COUNT(*) FROM integers_parquet GROUP BY ALL ORDER BY ALL
----
0 49999000
1 1000
query I
SELECT COUNT(DISTINCT block_id) < 5 FROM pragma_storage_info('integers_parquet');
----
true
# verify that loading in separate SQL statements within the same transaction generates the same size table
statement ok
CREATE TABLE integers_batched_load(i INTEGER)
statement ok
BEGIN TRANSACTION
loop i 0 50
query I
INSERT INTO integers_batched_load SELECT CASE WHEN i % 50000 = 0 THEN 1 ELSE 0 END AS i FROM range(1000000) tbl(i);
----
1000000
endloop
statement ok
COMMIT
query I
SELECT COUNT(DISTINCT block_id) < 4 FROM pragma_storage_info('integers_batched_load');
----
true
query III
SELECT COUNT(*), COUNT(i), SUM(i) FROM integers_batched_load
----
50000000 50000000 1000
# now with NULL values
statement ok
CREATE TABLE integers_batched_load_nulls(i INTEGER)
statement ok
BEGIN TRANSACTION
loop i 0 50
query I
INSERT INTO integers_batched_load_nulls SELECT CASE WHEN i % 50000 = 0 THEN 1 ELSE NULL END AS i FROM range(1000000) tbl(i);
----
1000000
endloop
statement ok
COMMIT
# NULLs are RLE compressed (with Roaring)
# So even with nulls we reach a similar compression ratio
# NOTE(review): 'mode skip' below has no matching 'mode unskip', so the block-count
# check AND the final aggregate check never execute for the NULL table — confirm
# whether disabling both (rather than just the block-count check) is intended.
mode skip
query I
SELECT COUNT(DISTINCT block_id) < 8 FROM pragma_storage_info('integers_batched_load_nulls');
----
true
query III
SELECT COUNT(*), COUNT(i), SUM(i) FROM integers_batched_load_nulls
----
50000000 1000 1000

View File

@@ -0,0 +1,53 @@
# name: test/sql/storage/parallel/insert_many_grouping_sets.test_slow
# description: Test parallel insert from many groups
# group: [parallel]
load __TEST_DIR__/insert_many_grouping_sets.db
query I
CREATE TABLE integers AS SELECT i, i%2 as j FROM generate_series(0,9999999,1) tbl(i);
----
10000000
# Row count 20000003 = 1 for () + 10000000 for (i) + 10000000 for (i, j) + 2 for (j).
query I
CREATE TABLE integers2 AS SELECT * FROM integers GROUP BY GROUPING SETS ((), (i), (i, j), (j));
----
20000003
query IIIII
SELECT SUM(i), SUM(j), COUNT(*), COUNT(i), COUNT(j) FROM integers;
----
49999995000000 5000000 10000000 10000000 10000000
query IIIII
SELECT SUM(i), SUM(j), COUNT(*), COUNT(i), COUNT(j) FROM integers2;
----
99999990000000 5000001 20000003 20000000 10000002
statement ok
DROP TABLE integers;
statement ok
DROP TABLE integers2;
# now with null values (even i becomes NULL, so only 5M distinct non-null i remain)
query I
CREATE TABLE integers AS SELECT case when i%2=0 then null else i end AS i, i%2 as j FROM generate_series(0,9999999,1) tbl(i);
----
10000000
# Row count 10000005 = 1 for () + 5000001 for (i) + 5000001 for (i, j) + 2 for (j).
query I
CREATE TABLE integers2 AS SELECT * FROM integers GROUP BY GROUPING SETS ((), (i), (i, j), (j));
----
10000005
query IIIII
SELECT SUM(i), SUM(j), COUNT(*), COUNT(i), COUNT(j) FROM integers
----
25000000000000 5000000 10000000 5000000 10000000
query IIIII
SELECT SUM(i), SUM(j), COUNT(*), COUNT(i), COUNT(j) FROM integers2
----
50000000000000 5000001 10000005 10000000 5000003

View File

@@ -0,0 +1,51 @@
# name: test/sql/storage/parallel/insert_non_order_preserving.test_slow
# description: Test parallel non order-preserving insert
# group: [parallel]
load __TEST_DIR__/insert_non_order_preserving.db
# Disable order preservation so inserts may complete in any batch order;
# only aggregate totals are asserted below (row order is unspecified here).
statement ok
PRAGMA preserve_insertion_order=false
query I
CREATE TABLE integers AS SELECT * FROM range(10000000) tbl(i);
----
10000000
query I
CREATE TABLE integers2 AS SELECT * FROM integers
----
10000000
query I
SELECT SUM(i) FROM integers
----
49999995000000
query I
SELECT SUM(i) FROM integers2
----
49999995000000
statement ok
DROP TABLE integers
statement ok
DROP TABLE integers2
# now with null values
statement ok
CREATE TABLE integers AS SELECT case when i%2=0 then null else i end AS i FROM range(10000000) tbl(i);
statement ok
CREATE TABLE integers2 AS SELECT * FROM integers
query I
SELECT SUM(i) FROM integers
----
25000000000000
query I
SELECT SUM(i) FROM integers2
----
25000000000000

View File

@@ -0,0 +1,84 @@
# name: test/sql/storage/parallel/insert_order_preserving.test_slow
# description: Test parallel order-preserving insert
# group: [parallel]
load __TEST_DIR__/insert_order_preserving.db
query I
CREATE TABLE integers AS SELECT * FROM range(10000000) tbl(i);
----
10000000
query I
CREATE TABLE integers2 AS SELECT * FROM integers
----
10000000
query I
SELECT SUM(i) FROM integers
----
49999995000000
query I
SELECT SUM(i) FROM integers2
----
49999995000000
# Order-preservation checks: exact rows at the head and at an arbitrary offset.
query I
SELECT * FROM integers2 LIMIT 5
----
0
1
2
3
4
query I
SELECT * FROM integers2 LIMIT 5 OFFSET 777778
----
777778
777779
777780
777781
777782
statement ok
DROP TABLE integers
statement ok
DROP TABLE integers2
# now with null values (even positions NULL, odd positions keep their value)
statement ok
CREATE TABLE integers AS SELECT case when i%2=0 then null else i end AS i FROM range(10000000) tbl(i);
statement ok
CREATE TABLE integers2 AS SELECT * FROM integers
query I
SELECT SUM(i) FROM integers
----
25000000000000
query I
SELECT SUM(i) FROM integers2
----
25000000000000
query I
SELECT * FROM integers2 LIMIT 5
----
NULL
1
NULL
3
NULL
query I
SELECT * FROM integers2 LIMIT 5 OFFSET 777778
----
NULL
777779
NULL
777781
NULL

View File

@@ -0,0 +1,81 @@
# name: test/sql/storage/parallel/insert_order_preserving_odd_sized_batches.test_slow
# description: Test parallel order-preserving insert
# group: [parallel]
# There are different numbers of distinct blocks for smaller block sizes,
# because the segment size is bound by the block size.
require block_size 262144
require vector_size 512
require parquet
load __TEST_DIR__/insert_odd_sized_batches.db
query I
CREATE TABLE integers AS SELECT * FROM range(10_000_000) tbl(i);
----
10000000
# Check the block count and median number of rows per row group.
query I
SELECT COUNT(DISTINCT block_id) < 4 FROM pragma_storage_info('integers');
----
true
query I
SELECT MEDIAN(count) FROM pragma_storage_info('integers');
----
122880
# Odd-sized (77777-row) parquet row groups force misaligned batches on re-import.
statement ok
COPY integers TO '__TEST_DIR__/integers.parquet' (ROW_GROUP_SIZE 77777)
statement ok
CREATE TABLE integers_parquet AS FROM '__TEST_DIR__/integers.parquet';
query I
SELECT * FROM integers_parquet LIMIT 5
----
0
1
2
3
4
query I
SELECT * FROM integers_parquet LIMIT 5 OFFSET 773654
----
773654
773655
773656
773657
773658
query I
SELECT COUNT(DISTINCT block_id) < 4 FROM pragma_storage_info('integers_parquet');
----
true
query I
SELECT MEDIAN(count) > 100000 FROM pragma_storage_info('integers_parquet');
----
true
# FIXME: does this even make sense?
# Verify that reading without preserving insertion order creates a same size table.
statement ok
SET preserve_insertion_order=false
statement ok
CREATE TABLE integers_parquet_no_order AS FROM '__TEST_DIR__/integers.parquet'
# Note the looser bound (< 30) — unordered loading may fragment more.
query I
SELECT COUNT(DISTINCT block_id) < 30 FROM pragma_storage_info('integers_parquet_no_order');
----
true
query I
SELECT MEDIAN(count) > 100000 FROM pragma_storage_info('integers_parquet_no_order');
----
true

View File

@@ -0,0 +1,63 @@
# name: test/sql/storage/parallel/memory_limit_batch_load.test_slow
# description: Test batch streaming to disk with different row group sizes
# group: [parallel]
require parquet
load __TEST_DIR__/memory_limit_batch_load.db
# in this test we load data of around 100M rows - uncompressed this will be 1.4GB~2GB (without/with NULLs)
# we do these operations with a low memory limit to verify the data is streamed to and from disk correctly
statement ok
SET memory_limit='300MB'
# Repeat for small, medium and large parquet row-group sizes.
foreach row_group_size 5000 150000 1000000
statement ok
COPY (FROM range(100000000) tbl(i)) TO '__TEST_DIR__/giant_row_groups.parquet' (ROW_GROUP_SIZE ${row_group_size})
statement ok
CREATE TABLE integers AS FROM '__TEST_DIR__/giant_row_groups.parquet'
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers
----
4999999950000000 0 99999999 100000000 100000000
# Order check: insertion order must survive the spill-to-disk load.
query I
SELECT * FROM integers LIMIT 5 OFFSET 99998
----
99998
99999
100000
100001
100002
statement ok
DROP TABLE integers
# now with NULL values
statement ok
COPY (SELECT CASE WHEN i%2=0 THEN NULL ELSE i END AS i FROM range(100000000) tbl(i)) TO '__TEST_DIR__/giant_row_groups_nulls.parquet' (ROW_GROUP_SIZE ${row_group_size})
statement ok
CREATE TABLE integers AS FROM '__TEST_DIR__/giant_row_groups_nulls.parquet'
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers
----
2500000000000000 1 99999999 50000000 100000000
query I
SELECT * FROM integers LIMIT 5 OFFSET 99998
----
NULL
99999
NULL
100001
NULL
statement ok
DROP TABLE integers
endloop

View File

@@ -0,0 +1,89 @@
# name: test/sql/storage/parallel/memory_limit_batch_load_list.test_slow
# description: Test batch streaming to disk with different row group sizes
# group: [parallel]
require parquet
load __TEST_DIR__/memory_limit_batch_load_list.db
# in this test we load data of around 100M rows - uncompressed this will be 1.4GB~2GB (without/with NULLs)
# we do these operations with a low memory limit to verify the data is streamed to and from disk correctly
statement ok
SET memory_limit='300MB'
# Same scenario as memory_limit_batch_load, but with single-element LIST columns.
foreach row_group_size 5000 150000 1000000
statement ok
COPY (
	SELECT [i] AS l FROM range(10000000) tbl(i)
) TO '__TEST_DIR__/giant_row_groups.parquet' (
	ROW_GROUP_SIZE ${row_group_size}
)
statement ok
CREATE TABLE list AS FROM '__TEST_DIR__/giant_row_groups.parquet'
query IIIII
SELECT
	SUM(i),
	MIN(i),
	MAX(i),
	COUNT(i),
	COUNT(*)
FROM (
	SELECT UNNEST(l) AS i FROM list
)
----
49999995000000	0	9999999	10000000	10000000
query I
SELECT * FROM list LIMIT 5 OFFSET 99998
----
[99998]
[99999]
[100000]
[100001]
[100002]
statement ok
DROP TABLE list
# now with NULL values
statement ok
COPY (
	SELECT CASE WHEN i%2=0 THEN NULL ELSE [i] END AS l FROM range(10000000) tbl(i)
) TO '__TEST_DIR__/giant_row_groups_nulls.parquet' (
	ROW_GROUP_SIZE ${row_group_size}
)
statement ok
CREATE TABLE list AS FROM '__TEST_DIR__/giant_row_groups_nulls.parquet'
# UNNEST of a NULL list yields no rows, so COUNT(*) is 5000000 here
# even though the list table itself has 10000000 rows.
query IIIII
SELECT
	SUM(i),
	MIN(i),
	MAX(i),
	COUNT(i),
	COUNT(*)
FROM (
	SELECT UNNEST(l) AS i FROM list
)
----
25000000000000	1	9999999	5000000	5000000
query I
SELECT
	*
FROM list LIMIT 5 OFFSET 99998
----
NULL
[99999]
NULL
[100001]
NULL
statement ok
DROP TABLE list
endloop

View File

@@ -0,0 +1,103 @@
# name: test/sql/storage/parallel/memory_limit_mixed_batches.test_slow
# description: Test batch streaming to disk with mixed batch sizes
# group: [parallel]
require parquet
load __TEST_DIR__/memory_limit_mixed_batches.db
# in this test we load data of around 100M rows - uncompressed this will be 1.4GB~2GB (without/with NULLs)
# we do these operations with a low memory limit to verify the data is streamed to and from disk correctly
statement ok
SET memory_limit='300MB'
# Six files with wildly different row-group sizes so a single read produces
# a mix of tiny, huge and odd-sized batches under memory pressure.
statement ok
COPY (FROM range(20000000) tbl(i)) TO '__TEST_DIR__/mixed_batches_1.parquet' (ROW_GROUP_SIZE 5000)
statement ok
COPY (FROM range(20000000,30000000) tbl(i)) TO '__TEST_DIR__/mixed_batches_2.parquet' (ROW_GROUP_SIZE 200000)
statement ok
COPY (FROM range(30000000,50000000) tbl(i)) TO '__TEST_DIR__/mixed_batches_3.parquet' (ROW_GROUP_SIZE 999)
statement ok
COPY (FROM range(50000000,70000000) tbl(i)) TO '__TEST_DIR__/mixed_batches_4.parquet' (ROW_GROUP_SIZE 99979)
statement ok
COPY (FROM range(70000000,90000000) tbl(i)) TO '__TEST_DIR__/mixed_batches_5.parquet' (ROW_GROUP_SIZE 99979)
statement ok
COPY (FROM range(90000000,100000000) tbl(i)) TO '__TEST_DIR__/mixed_batches_6.parquet' (ROW_GROUP_SIZE 33445)
statement ok
CREATE TABLE integers AS FROM read_parquet([
	'__TEST_DIR__/mixed_batches_1.parquet',
	'__TEST_DIR__/mixed_batches_2.parquet',
	'__TEST_DIR__/mixed_batches_3.parquet',
	'__TEST_DIR__/mixed_batches_4.parquet',
	'__TEST_DIR__/mixed_batches_5.parquet',
	'__TEST_DIR__/mixed_batches_6.parquet'
])
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers
----
4999999950000000 0 99999999 100000000 100000000
# Order check: insertion order must survive the mixed-batch load.
query I
SELECT * FROM integers LIMIT 5 OFFSET 99998
----
99998
99999
100000
100001
100002
statement ok
DROP TABLE integers
# now with NULL values
statement ok
COPY (SELECT CASE WHEN i%2=0 THEN NULL ELSE i END AS i FROM range(20000000) tbl(i)) TO '__TEST_DIR__/mixed_batches_1.parquet' (ROW_GROUP_SIZE 5000)
statement ok
COPY (SELECT CASE WHEN i%2=0 THEN NULL ELSE i END AS i FROM range(20000000,30000000) tbl(i)) TO '__TEST_DIR__/mixed_batches_2.parquet' (ROW_GROUP_SIZE 200000)
statement ok
COPY (SELECT CASE WHEN i%2=0 THEN NULL ELSE i END AS i FROM range(30000000,50000000) tbl(i)) TO '__TEST_DIR__/mixed_batches_3.parquet' (ROW_GROUP_SIZE 999)
statement ok
COPY (SELECT CASE WHEN i%2=0 THEN NULL ELSE i END AS i FROM range(50000000,70000000) tbl(i)) TO '__TEST_DIR__/mixed_batches_4.parquet' (ROW_GROUP_SIZE 99979)
statement ok
COPY (SELECT CASE WHEN i%2=0 THEN NULL ELSE i END AS i FROM range(70000000,90000000) tbl(i)) TO '__TEST_DIR__/mixed_batches_5.parquet' (ROW_GROUP_SIZE 99979)
statement ok
COPY (SELECT CASE WHEN i%2=0 THEN NULL ELSE i END AS i FROM range(90000000,100000000) tbl(i)) TO '__TEST_DIR__/mixed_batches_6.parquet' (ROW_GROUP_SIZE 33445)
statement ok
CREATE TABLE integers AS FROM read_parquet([
	'__TEST_DIR__/mixed_batches_1.parquet',
	'__TEST_DIR__/mixed_batches_2.parquet',
	'__TEST_DIR__/mixed_batches_3.parquet',
	'__TEST_DIR__/mixed_batches_4.parquet',
	'__TEST_DIR__/mixed_batches_5.parquet',
	'__TEST_DIR__/mixed_batches_6.parquet'
])
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM integers
----
2500000000000000 1 99999999 50000000 100000000
query I
SELECT * FROM integers LIMIT 5 OFFSET 99998
----
NULL
99999
NULL
100001
NULL
statement ok
DROP TABLE integers

View File

@@ -0,0 +1,77 @@
# name: test/sql/storage/parallel/parallel_insert_selective_filter.test_slow
# description: Test parallel insert with a selective filter
# group: [parallel]
load __TEST_DIR__/parallel_insert_selective.db
query I
CREATE TABLE integers AS SELECT * FROM range(10000000) tbl(i);
----
10000000
# loop and test both with and without preserve preserving order
# (the PRAGMA at the bottom of the loop flips to non-order-preserving
# before the second iteration)
loop attempts 0 2
# insert values with a selective filter
# not many values are inserted
# verify that we are not creating many small row-groups due to the parallel insertion
query I
CREATE TABLE integers2 AS SELECT * FROM integers WHERE i%100=0
----
100000
# 10 more inserts of 100K rows each -> 1.1M rows total in integers2.
loop i 0 10
query I
INSERT INTO integers2 SELECT * FROM integers WHERE i%100=0
----
100000
endloop
query I
SELECT SUM(i) FROM integers
----
49999995000000
query I
SELECT SUM(i) FROM integers2
----
5499945000000
# we have a total of 1.1M values - this should not be more than 20 row groups (ideally it is 11)
query I
select count(distinct row_group_id) < 20 from pragma_storage_info('integers2');
----
true
statement ok
DROP TABLE integers2
# now test a mix of selective and non-selective filters
# we insert all values that have a modulo 100 of 0
# AND all values between 3 and 4 million
query I
CREATE TABLE integers2 AS SELECT * FROM integers WHERE i%100=0 OR (i >= 3000000 AND i <= 4000000)
----
1090000
query I
SELECT SUM(i) FROM integers2
----
3964995000000
# we have a total of 1.1M values - this should not be more than 20 row groups (ideally it is 11)
query I
select count(distinct row_group_id) < 20 from pragma_storage_info('integers2');
----
true
statement ok
DROP TABLE integers2
statement ok
PRAGMA preserve_insertion_order=false
endloop

View File

@@ -0,0 +1,83 @@
# name: test/sql/storage/parallel/reclaim_space_batch_insert.test_slow
# description: Test space reclamation of optimistic writing with batch inserts and a primary key.
# group: [parallel]
require parquet
load __TEST_DIR__/reclaim_space_batch_insert.db
# Source data: 10M integers in a parquet file, read through a view so that
# each INSERT ... SELECT below is a batch insert from the parquet reader.
statement ok
COPY (FROM range(10000000) t(i)) TO '__TEST_DIR__/integers.parquet' (FORMAT PARQUET, ROW_GROUP_SIZE 200000);
statement ok
CREATE VIEW integers AS FROM '__TEST_DIR__/integers.parquet';
statement ok
CREATE TABLE integers2 (i INTEGER PRIMARY KEY);
# Seed a conflicting key so every bulk insert from the view fails.
statement ok
INSERT INTO integers2 VALUES (9999999);
statement error
INSERT INTO integers2 SELECT * FROM integers;
----
<REGEX>:Constraint Error.*violates primary key constraint.*
statement ok
CREATE TABLE block_count (count INT);
loop i 0 10
# Failed batch insert: optimistically written blocks must be reclaimed.
statement error
INSERT INTO integers2 SELECT * FROM integers;
----
<REGEX>:Constraint Error.*violates primary key constraint.*
# Same failure inside an explicit transaction that is rolled back.
statement ok
BEGIN;
statement ok
INSERT INTO integers2 VALUES (9999998);
statement error
INSERT INTO integers2 SELECT * FROM integers WHERE i <= 9999998;
----
<REGEX>:Constraint Error.*PRIMARY KEY or UNIQUE constraint violation.*
statement ok
ROLLBACK
# Only the seed row plus the ${i} rows added in earlier iterations remain.
query I
SELECT COUNT(*) - ${i} FROM integers2;
----
1
statement ok
INSERT INTO integers2 VALUES (10000000 + ${i});
statement ok
CHECKPOINT;
# Record the database size after each iteration.
statement ok
INSERT INTO block_count
SELECT total_blocks FROM pragma_database_size();
query I
SELECT COUNT(*) - ${i} FROM integers2;
----
2
# Ensure there is only a small difference between the MIN and MAX block counts.
# Example table contents for 16kB blocks:
# count: 4766, 4788, 4846, 4749, 4770, 4730, 4711, 4732, 4751, 4711
query I
SELECT
CASE WHEN get_block_size('reclaim_space_batch_insert') = 16384
THEN (MAX(count / 100) - MIN(count / 100)) < 3
ELSE (MAX(count) - MIN(count)) < 3
END FROM block_count;
----
True
endloop

View File

@@ -0,0 +1,74 @@
# name: test/sql/storage/parallel/reclaim_space_insert_unique_idx_optimistic.test_slow
# description: Test space reclamation of optimistic writing with a UNIQUE constraint violation.
# group: [parallel]
load __TEST_DIR__/reclaim_space_unique_index.db
statement ok
CREATE TABLE integers AS SELECT * FROM range(1_000_000) t(i);
statement ok
CREATE TABLE integers2 (i INTEGER);
statement ok
INSERT INTO integers2 VALUES (9999999);
statement ok
CREATE UNIQUE INDEX idx ON integers2(i);
# For smaller block sizes (16KB) the total blocks increase (to twice the original amount) in the first
# iteration, and then stay mostly constant.
statement ok
CREATE TABLE total_blocks_tbl (total_blocks UBIGINT);
loop i 0 20
statement ok
BEGIN;
statement ok
CHECKPOINT;
# Insert a key, then violate the unique index within the same transaction.
statement ok
INSERT INTO integers2 VALUES (999_998);
# Invalidate the transaction.
statement error
INSERT INTO integers2 SELECT * FROM integers WHERE i <= 999_998;
----
<REGEX>:Constraint Error.*PRIMARY KEY or UNIQUE constraint violation.*
statement ok
ROLLBACK
# Track the block count of each iteration.
statement ok
INSERT INTO total_blocks_tbl SELECT current.total_blocks FROM pragma_database_size() AS current
endloop
# Ensure that the blocks don't grow between iterations.
# The window sum is current + next row, so the filter flags any iteration whose
# successor exceeds 1.5x its block count; the resulting list must be empty.
query I
WITH tbl(w) AS (
SELECT struct_pack(
total_blocks := total_blocks,
sum := SUM (total_blocks) OVER (ROWS BETWEEN 0 PRECEDING AND 1 FOLLOWING)
) AS w
FROM total_blocks_tbl
LIMIT 19)
SELECT list_filter(LIST(w), lambda x: x.total_blocks * 2.5 < x.sum) FROM tbl;
----
[]
# Ensure that the blocks don't grow 'quietly' between iterations.
query I
WITH tbl(l) AS (
SELECT LIST(total_blocks) AS l FROM total_blocks_tbl)
SELECT list_sum(l) < (list_count(l) * l[3] + 2 * l[3]) FROM tbl;
----
True

View File

@@ -0,0 +1,101 @@
# name: test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow
# description: Test space reclamation of optimistic writing with a PK constraint violation.
# group: [parallel]
load __TEST_DIR__/reclaim_space_primary_key.db
statement ok
SET preserve_insertion_order=false;
statement ok
CREATE TABLE integers AS SELECT * FROM range(10000000) t(i);
statement ok
CREATE TABLE integers2 (i INTEGER PRIMARY KEY);
# Seed a conflicting key so every bulk insert from 'integers' fails.
statement ok
INSERT INTO integers2 VALUES (9999999);
statement error
INSERT INTO integers2 SELECT * FROM integers;
----
<REGEX>:Constraint Error.*violates primary key constraint.*
# For smaller block sizes (16KB) the total blocks increase (to twice the original amount) in the first
# iteration, and then stay constant.
statement ok
CREATE TABLE total_blocks_tbl AS SELECT total_blocks FROM pragma_database_size();
# On failure the union carries old/allowed/actual block counts for debugging.
statement ok
CREATE TYPE test_result AS UNION (
ok BOOL,
err STRUCT(
old BIGINT,
allowed_max DECIMAL(21,1),
actual BIGINT)
);
loop i 0 10
statement error
INSERT INTO integers2 SELECT * FROM integers;
----
<REGEX>:Constraint Error.*violates primary key constraint.*
statement ok
BEGIN;
statement ok
INSERT INTO integers2 VALUES (9999998);
statement error
INSERT INTO integers2 SELECT * FROM integers WHERE i <= 9999998;
----
<REGEX>:Constraint Error.*PRIMARY KEY or UNIQUE constraint violation.*
statement ok
ROLLBACK
# Only the seed row plus the ${i} rows added in earlier iterations remain.
query I
SELECT COUNT(*) - ${i} FROM integers2;
----
1
statement ok
INSERT INTO integers2 VALUES (10000000 + ${i});
statement ok
CHECKPOINT;
query I
SELECT COUNT(*) - ${i} FROM integers2;
----
2
# Ensure that the total blocks don't exceed the total blocks after the first iteration
# by a factor of more than 1.6.
query I
SELECT
CASE WHEN ${i} = 0 THEN True::test_result
WHEN current.total_blocks <= total_blocks_tbl.total_blocks * 1.6 THEN True::test_result
ELSE {
'old': total_blocks_tbl.total_blocks,
'allowed_max': total_blocks_tbl.total_blocks * 1.6,
'actual': current.total_blocks
}::test_result
END
FROM pragma_database_size() AS current, total_blocks_tbl;
----
true
# Adjust the total_blocks_tbl once to the count after the first iteration.
statement ok
UPDATE total_blocks_tbl SET total_blocks = (
SELECT
CASE WHEN ${i} = 0 THEN (SELECT current.total_blocks FROM pragma_database_size() AS current)
ELSE (total_blocks)END
);
endloop

View File

@@ -0,0 +1,67 @@
# name: test/sql/storage/parallel/reclaim_space_rollback_mixed_batches.test_slow
# description: Test space reclamation of optimistic writing when mixing appends of different batch sizes
# group: [parallel]
load __TEST_DIR__/reclaim_space_mixed_batches.db
require parquet
# Source files with deliberately different row-group sizes to produce mixed batch sizes.
statement ok
COPY (FROM range(100000) tbl(i)) TO '__TEST_DIR__/mix_batches_small.parquet' (ROW_GROUP_SIZE 5000)
statement ok
COPY (FROM range(100000, 400000) tbl(i)) TO '__TEST_DIR__/mix_batches_large.parquet' (ROW_GROUP_SIZE 200000)
statement ok
COPY (FROM range(400000, 700000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd.parquet' (ROW_GROUP_SIZE 999)
statement ok
COPY (FROM range(700000, 1000000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd_again.parquet' (ROW_GROUP_SIZE 99979)
statement ok
CREATE TABLE integers(i INTEGER);
statement ok
CREATE TABLE block_count(count int)
loop i 0 10
# one big insert
statement ok
BEGIN TRANSACTION
statement ok
INSERT INTO integers FROM read_parquet(['__TEST_DIR__/mix_batches_small.parquet', '__TEST_DIR__/mix_batches_large.parquet', '__TEST_DIR__/mix_batches_odd.parquet', '__TEST_DIR__/mix_batches_odd_again.parquet'])
statement ok
ROLLBACK
# multiple separate inserts
statement ok
BEGIN TRANSACTION
statement ok
INSERT INTO integers FROM '__TEST_DIR__/mix_batches_small.parquet'
statement ok
INSERT INTO integers FROM '__TEST_DIR__/mix_batches_large.parquet'
statement ok
INSERT INTO integers FROM '__TEST_DIR__/mix_batches_odd.parquet'
statement ok
INSERT INTO integers FROM '__TEST_DIR__/mix_batches_odd_again.parquet'
statement ok
ROLLBACK
# Record the database size after the rolled-back appends of this iteration.
statement ok
insert into block_count select total_blocks from pragma_database_size();
# ensure there is a small diff between min and max block counts
query I
select (max(count)-min(count))<20 from block_count
----
true
endloop

View File

@@ -0,0 +1,42 @@
# name: test/sql/storage/parallel/tiny_row_group_size.test_slow
# description: Test tiny row group size
# group: [parallel]
statement ok
ATTACH '__TEST_DIR__/tiny_row_group_size.db' (ROW_GROUP_SIZE 2048)
statement ok
USE tiny_row_group_size
statement ok
CREATE TABLE t AS FROM range(1000000) t(i)
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM t
----
499999500000 0 999999 1000000 1000000
# we have a total of 1M values, ideally this is 488 row groups
# also check that no row group exceeds the configured size of 2048 rows
query II
select count(distinct row_group_id) < 1000, max(count) from pragma_storage_info('t');
----
true 2048
# Spot-check that values remain in insertion order across row-group boundaries.
query I
SELECT * FROM t OFFSET 77777 LIMIT 5
----
77777
77778
77779
77780
77781
query II
SELECT i, row_number() OVER () FROM t OFFSET 777776 LIMIT 5
----
777776 777777
777777 777778
777778 777779
777779 777780
777780 777781