should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion

View File

@@ -0,0 +1,28 @@
# name: test/sql/copy/parquet/batched_write/batch_memory_usage.test_slow
# description: Batched Parquet write memory usage
# group: [batched_write]
# Verifies that a batched Parquet-to-Parquet copy of ~10M random UUID rows
# succeeds under a constrained memory limit, and that the copy preserves
# both content and row order exactly.
require parquet
# fix the random seed so the generated data is reproducible across runs
set seed 0.72
statement ok
COPY (SELECT uuid()::VARCHAR as varchar, uuid() AS uuid FROM range(10000000) t(i)) TO '__TEST_DIR__/random_uuids.parquet'
# copy from one parquet file to another in a memory constrained environment
# (650MB is deliberately small relative to the ~10M-row input)
statement ok
SET memory_limit='650MB'
statement ok
COPY '__TEST_DIR__/random_uuids.parquet' TO '__TEST_DIR__/random_uuids_copy.parquet';
# raise the memory limit again before running the verification query below
statement ok
SET memory_limit='2GB';
# ensure the parquet files hold the same content in the same order
# (EXCEPT with row_number() returns zero rows only on an exact ordered match)
query III
SELECT *, row_number() OVER () as rownum FROM '__TEST_DIR__/random_uuids.parquet'
EXCEPT
SELECT *, row_number() OVER () as rownum FROM '__TEST_DIR__/random_uuids_copy.parquet';
----

View File

@@ -0,0 +1,40 @@
# name: test/sql/copy/parquet/batched_write/batch_memory_usage_mixed_batches.test_slow
# description: Batched Parquet write memory usage with mixed batches
# group: [batched_write]
# Writes four Parquet files with deliberately different row-group sizes
# (tiny, large, and odd-sized), copies them as one stream under a
# constrained memory limit, and verifies content and row order survive.
require parquet
# create source files whose row-group sizes differ wildly
statement ok
COPY (FROM range(100000) tbl(i)) TO '__TEST_DIR__/mem_usage_mix_batches_small.parquet' (ROW_GROUP_SIZE 5000)
statement ok
COPY (FROM range(100000, 400000) tbl(i)) TO '__TEST_DIR__/mem_usage_mix_batches_large.parquet' (ROW_GROUP_SIZE 200000)
statement ok
COPY (FROM range(400000, 700000) tbl(i)) TO '__TEST_DIR__/mem_usage_mix_batches_odd.parquet' (ROW_GROUP_SIZE 999)
statement ok
COPY (FROM range(700000, 1000000) tbl(i)) TO '__TEST_DIR__/mem_usage_mix_batches_odd_again.parquet' (ROW_GROUP_SIZE 99979)
statement ok
CREATE VIEW v1 AS SELECT * FROM parquet_scan([
'__TEST_DIR__/mem_usage_mix_batches_small.parquet',
'__TEST_DIR__/mem_usage_mix_batches_large.parquet',
'__TEST_DIR__/mem_usage_mix_batches_odd.parquet',
'__TEST_DIR__/mem_usage_mix_batches_odd_again.parquet'])
statement ok
SET memory_limit='500MB'
statement ok
COPY v1 TO '__TEST_DIR__/mem_usage_mix_result.parquet'
# raise the memory limit, then ensure the parquet files hold the same
# content in the same order (EXCEPT yields zero rows on an exact match)
statement ok
SET memory_limit='2GB';
query II
SELECT *, row_number() OVER () as rownum FROM v1
EXCEPT
SELECT *, row_number() OVER () as rownum FROM '__TEST_DIR__/mem_usage_mix_result.parquet';
----

View File

@@ -0,0 +1,16 @@
# name: test/sql/copy/parquet/batched_write/batch_memory_usage_small.test_slow
# description: Batched Parquet write memory usage
# group: [batched_write]
# Only checks that a batched Parquet-to-Parquet copy of ~10M random UUID
# rows completes under a 750MB memory limit; the copied file's content is
# not verified here.
require parquet
# fix the random seed so the generated data is reproducible across runs
set seed 0.72
statement ok
COPY (SELECT uuid()::VARCHAR as varchar, uuid() AS uuid FROM range(10000000) t(i)) TO '__TEST_DIR__/random_uuids.parquet'
statement ok
SET memory_limit='750MB'
statement ok
COPY '__TEST_DIR__/random_uuids.parquet' TO '__TEST_DIR__/random_uuids_copy.parquet';

View File

@@ -0,0 +1,88 @@
# name: test/sql/copy/parquet/batched_write/batched_parquet_write.test_slow
# description: Batched copy to file
# group: [batched_write]
# End-to-end check of batched COPY TO Parquet: write a 1M-row table, read
# it back, and verify aggregates, ordering, and pagination; then repeat
# the checks through a filtered view.
require parquet
statement ok
CREATE TABLE integers AS SELECT i, i // 5 AS j FROM range(1000000) t(i) ;
statement ok
COPY integers TO '__TEST_DIR__/batched_integers.parquet';
statement ok
CREATE TABLE integers_copied AS FROM '__TEST_DIR__/batched_integers.parquet'
query IIIII
SELECT SUM(i), SUM(j), COUNT(*), COUNT(i), COUNT(j) FROM integers_copied
----
499999500000 99999500000 1000000 1000000 1000000
query II
SELECT * FROM integers_copied ORDER BY i LIMIT 5
----
0 0
1 0
2 0
3 0
4 0
query II
SELECT * FROM integers_copied ORDER BY i LIMIT 5 OFFSET 99997
----
99997 19999
99998 19999
99999 19999
100000 20000
100001 20000
query II
SELECT * FROM integers_copied QUALIFY i<=lag(i) over ()
----
# (the QUALIFY query above returns rows only if i is ever non-increasing,
# i.e. it verifies the rows came back in insertion order)
# now with filters
statement ok
CREATE VIEW v1 AS SELECT * FROM integers WHERE (i%2=0 AND i<300000) OR (i BETWEEN 500000 AND 700000)
statement ok
COPY v1 TO '__TEST_DIR__/batched_integers_filters.parquet';
statement ok
CREATE TABLE integers_filtered AS FROM '__TEST_DIR__/batched_integers_filters.parquet'
foreach table v1 integers_filtered
# the filtered view and its Parquet round-trip must agree on every check
query IIIII
SELECT SUM(i), SUM(j), COUNT(*), COUNT(i), COUNT(j) FROM ${table}
----
142500450000 28499950000 350001 350001 350001
query II
SELECT * FROM ${table} ORDER BY i LIMIT 5
----
0 0
2 0
4 0
6 1
8 1
query II
SELECT * FROM ${table} ORDER BY i LIMIT 5 OFFSET 99997
----
199994 39998
199996 39999
199998 39999
200000 40000
200002 40000
query II
SELECT * FROM ${table} ORDER BY i LIMIT 5 OFFSET 300000
----
650000 130000
650001 130000
650002 130000
650003 130000
650004 130000
endloop

View File

@@ -0,0 +1,34 @@
# name: test/sql/copy/parquet/batched_write/lineitem_memory_usage.test_slow
# description: Batched lineitem write memory usage
# group: [batched_write]
# Writes TPC-H SF1 lineitem to Parquet, then re-copies it under a
# constrained memory limit and verifies content and row order.
require parquet
require tpch
# use an on-disk database so state survives the restart below
load __TEST_DIR__/lineitem_memory_test.db
statement ok
CALL dbgen(sf=1)
statement ok
COPY lineitem TO '__TEST_DIR__/lineitem_memory_usage.parquet'
restart
# restart so the copy below runs against a fresh database instance
# copy from one parquet file to another in a memory constrained environment
statement ok
SET memory_limit='500MB'
statement ok
COPY '__TEST_DIR__/lineitem_memory_usage.parquet' TO '__TEST_DIR__/lineitem_memory_usage_copy.parquet';
# ensure the parquet files hold the same content in the same order
# (EXCEPT with row_number() returns zero rows only on an exact ordered match)
statement ok
SET memory_limit='2GB';
query IIIIIIIIIIIIIIIII
SELECT *, row_number() OVER () as rownum FROM '__TEST_DIR__/lineitem_memory_usage.parquet'
EXCEPT
SELECT *, row_number() OVER () as rownum FROM '__TEST_DIR__/lineitem_memory_usage_copy.parquet';
----

View File

@@ -0,0 +1,33 @@
# name: test/sql/copy/parquet/batched_write/parquet_verify_row_group_size.test_slow
# description: Verify row group size is respected
# group: [batched_write]
require parquet
# iteration 0 runs with the default thread count; iteration 1 runs with
# threads=1 (the SET threads=1 at the bottom takes effect for the 2nd pass)
loop i 0 2
foreach row_group_size 777 9999 83838 143431 333333
statement ok
CREATE TABLE integers AS SELECT i, i // 5 AS j FROM range(1000000) t(i) ;
statement ok
COPY integers TO '__TEST_DIR__/row_group_size.parquet' (ROW_GROUP_SIZE ${row_group_size});
statement ok
select row_group_num_rows from parquet_metadata('__TEST_DIR__/row_group_size.parquet');
query I
select abs(median(row_group_num_rows)-${row_group_size})<2048 from parquet_metadata('__TEST_DIR__/row_group_size.parquet');
----
true
statement ok
DROP TABLE integers
endloop
# switch to single-threaded writing for the next outer-loop iteration
statement ok
SET threads=1
endloop

View File

@@ -0,0 +1,181 @@
# name: test/sql/copy/parquet/batched_write/parquet_write_mixed_batches.test_slow
# description: Test batch Parquet write with mixed batches
# group: [batched_write]
# Writes Parquet files with very different row-group sizes, then COPYs
# views over those files (full, half-filtered, partially-filtered, empty)
# and checks counts, aggregates and paging on the round-tripped data.
# The whole sequence runs twice: the second pass reads the source files
# through UNION ALL views instead of a multi-file parquet_scan.
require parquet
statement ok
COPY (FROM range(100000) tbl(i)) TO '__TEST_DIR__/mix_batches_small.parquet' (ROW_GROUP_SIZE 5000)
statement ok
COPY (FROM range(100000, 400000) tbl(i)) TO '__TEST_DIR__/mix_batches_large.parquet' (ROW_GROUP_SIZE 200000)
statement ok
COPY (FROM range(400000, 700000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd.parquet' (ROW_GROUP_SIZE 999)
statement ok
COPY (FROM range(700000, 1000000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd_again.parquet' (ROW_GROUP_SIZE 99979)
# create views that read the batches
statement ok
CREATE VIEW v1 AS SELECT * FROM parquet_scan(['__TEST_DIR__/mix_batches_small.parquet', '__TEST_DIR__/mix_batches_large.parquet', '__TEST_DIR__/mix_batches_odd.parquet', '__TEST_DIR__/mix_batches_odd_again.parquet'])
statement ok
CREATE VIEW v2 AS FROM v1 WHERE (i//10000)%2=0;
statement ok
CREATE VIEW v3 AS FROM v1 WHERE (i//10000)%2=0 OR (i>200000 AND i < 400000) OR (i>600000 AND i < 800000);
# empty table
statement ok
CREATE VIEW v4 AS FROM v1 WHERE i>998 AND i<1000 AND i%2=0
loop i 0 2
# COPY reports the number of rows written as its result
query I
COPY v1 TO '__TEST_DIR__/mixed_batches_v1.parquet'
----
1000000
query I
CREATE TABLE mixed_batches_v1 AS FROM '__TEST_DIR__/mixed_batches_v1.parquet'
----
1000000
foreach table v1 mixed_batches_v1
# the view and its Parquet round-trip must agree on every check
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
499999500000 0 999999 1000000 1000000
query I
SELECT * FROM ${table} LIMIT 5 OFFSET 99998
----
99998
99999
100000
100001
100002
endloop
# now do the same, but filter out half of the values
query I
COPY v2 TO '__TEST_DIR__/mixed_batches_v2.parquet'
----
500000
query I
CREATE TABLE mixed_batches_v2 AS FROM '__TEST_DIR__/mixed_batches_v2.parquet'
----
500000
foreach table v2 mixed_batches_v2
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
247499750000 0 989999 500000 500000
query I
SELECT * FROM ${table} LIMIT 5 OFFSET 99998
----
189998
189999
200000
200001
200002
endloop
# do it again, but this time only filter out SOME small batches
query I
COPY v3 TO '__TEST_DIR__/mixed_batches_v3.parquet'
----
700000
query I
CREATE TABLE mixed_batches_v3 AS FROM '__TEST_DIR__/mixed_batches_v3.parquet'
----
700000
foreach table v3 mixed_batches_v3
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
348499650000 0 989999 700000 700000
query I
SELECT * FROM ${table} LIMIT 5 OFFSET 9999
----
9999
20000
20001
20002
20003
endloop
# now with an empty table
query I
COPY v4 TO '__TEST_DIR__/mixed_batches_v4.parquet'
----
0
query I
CREATE TABLE mixed_batches_v4 AS FROM '__TEST_DIR__/mixed_batches_v4.parquet'
----
0
foreach table v4 mixed_batches_v4
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
NULL NULL NULL 0 0
query I
SELECT * FROM ${table} LIMIT 5
----
endloop
# clean up so the second pass of the outer loop can recreate the tables
statement ok
DROP TABLE mixed_batches_v1
statement ok
DROP TABLE mixed_batches_v2
statement ok
DROP TABLE mixed_batches_v3
statement ok
DROP TABLE mixed_batches_v4
# Drop the VIEWs that depend on V1
statement ok
DROP VIEW IF EXISTS v2
statement ok
DROP VIEW IF EXISTS v3
statement ok
DROP VIEW IF EXISTS v4
# create views that read the batches using unions
# (the second loop iteration exercises this UNION ALL source instead of parquet_scan)
statement ok
CREATE OR REPLACE VIEW v1 AS FROM '__TEST_DIR__/mix_batches_small.parquet' UNION ALL FROM '__TEST_DIR__/mix_batches_large.parquet' UNION ALL FROM '__TEST_DIR__/mix_batches_odd.parquet' UNION ALL FROM '__TEST_DIR__/mix_batches_odd_again.parquet'
statement ok
CREATE OR REPLACE VIEW v2 AS FROM v1 WHERE (i//10000)%2=0;
statement ok
CREATE OR REPLACE VIEW v3 AS FROM v1 WHERE (i//10000)%2=0 OR (i>200000 AND i < 400000) OR (i>600000 AND i < 800000);
statement ok
CREATE OR REPLACE VIEW v4 AS FROM v1 WHERE i>998 AND i<1000 AND i%2=0
endloop

View File

@@ -0,0 +1,55 @@
# name: test/sql/copy/parquet/batched_write/tpch_sf1_parquet.test_slow
# description: Test TPC-H SF1 with Parquet
# group: [batched_write]
# Generates TPC-H SF1, round-trips every base table through Parquet, and
# runs all 22 TPC-H queries against Parquet-backed views, comparing the
# results with the bundled reference answers.
require tpch
require parquet
statement ok
CALL dbgen(sf=1, suffix='_original');
foreach tpch_tbl orders customer lineitem nation part partsupp region supplier
# write each generated table to Parquet, then expose it under its TPC-H
# name through a view so the benchmark queries read from Parquet
statement ok
COPY ${tpch_tbl}_original TO '__TEST_DIR__/${tpch_tbl}.parquet';
statement ok
CREATE VIEW ${tpch_tbl} AS FROM read_parquet('__TEST_DIR__/${tpch_tbl}.parquet');
endloop
# verify the data was written/read in the correct order
# (the QUALIFY query returns rows only if l_orderkey ever decreases)
query IIIIIIIIIIIIIIII
select * from lineitem qualify l_orderkey<lag(l_orderkey) over ();
----
loop i 1 9
# answers for queries 1-9 use zero-padded file names (q01.csv .. q09.csv)
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q0${i}.csv
endloop
loop i 10 23
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q${i}.csv
endloop
# spot-check values deep into the scan to catch ordering/offset issues
query IIIIIIIIIIIIIIII
SELECT MAX(COLUMNS(*)) FROM (FROM lineitem LIMIT 100000 OFFSET 5000000)
----
5099235 199996 10000 7 50.00 104649.50 0.10 0.08 R O 1998-11-30 1998-10-30 1998-12-22 TAKE BACK RETURN TRUCK zzle. express, bold deposits was. slyly e
query IIIIIIIIIIIIIIII
select * from lineitem order by l_extendedprice desc, l_shipdate limit 2;
----
2513090 199999 5038 4 50.00 104949.50 0.02 0.04 A F 1993-10-05 1993-10-17 1993-10-28 TAKE BACK RETURN FOB - ironic, pending pinto be
82823 199998 5037 2 50.00 104899.50 0.04 0.05 A F 1992-04-30 1992-07-05 1992-05-29 COLLECT COD SHIP orbits. bold fox

View File

@@ -0,0 +1,51 @@
# name: test/sql/copy/parquet/batched_write/varying_source_target_row_groups.test_slow
# description: Verify source-target row group size pairs
# group: [batched_write]
# For every pair of (source, target) row-group sizes: write the table with
# the source size, re-copy it with the target size, and check both the
# resulting row-group sizes and that content/order are preserved.
require parquet
statement ok
CREATE TABLE integers AS SELECT i, i // 5 AS j FROM range(1000000) t(i) ;
foreach src_size 777 9999 83838 143431 333333
foreach tgt_size 777 9999 83838 143431 333333
# write the source file single-threaded, then switch back to 4 threads
statement ok
SET threads=1
statement ok
COPY integers TO '__TEST_DIR__/src_size.parquet' (ROW_GROUP_SIZE ${src_size});
statement ok
SET threads=4
query I
select abs(median(row_group_num_rows)-${src_size})<2048 from parquet_metadata('__TEST_DIR__/src_size.parquet');
----
true
statement ok
COPY '__TEST_DIR__/src_size.parquet' TO '__TEST_DIR__/tgt_size.parquet' (ROW_GROUP_SIZE ${tgt_size});
query I
select abs(median(row_group_num_rows)-${tgt_size})<2048 from parquet_metadata('__TEST_DIR__/tgt_size.parquet');
----
true
# verify the groups are actually written in the same order and contain the same data
# (both EXCEPT queries must return zero rows)
query III
SELECT *, row_number() OVER () FROM integers
EXCEPT
SELECT *, row_number() OVER () FROM '__TEST_DIR__/src_size.parquet'
----
query III
SELECT *, row_number() OVER () FROM '__TEST_DIR__/src_size.parquet'
EXCEPT
SELECT *, row_number() OVER () FROM '__TEST_DIR__/tgt_size.parquet'
----
endloop
endloop