should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

View File

@@ -0,0 +1,182 @@
# name: test/sql/copy/csv/batched_write/batch_csv_mixed_batches.test_slow
# description: Test batch CSV write with mixed batches
# group: [batched_write]
# DuckDB sqllogictest script: '#' lines are comments; every other line is a
# directive, a SQL statement, or an expected result.
require parquet
statement ok
PRAGMA enable_verification
# Write 1M consecutive integers across four parquet files with very different
# row-group sizes, so that scanning them produces mixed/uneven batches.
statement ok
COPY (FROM range(100000) tbl(i)) TO '__TEST_DIR__/mix_batches_small.parquet' (ROW_GROUP_SIZE 5000)
statement ok
COPY (FROM range(100000, 400000) tbl(i)) TO '__TEST_DIR__/mix_batches_large.parquet' (ROW_GROUP_SIZE 200000)
statement ok
COPY (FROM range(400000, 700000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd.parquet' (ROW_GROUP_SIZE 999)
statement ok
COPY (FROM range(700000, 1000000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd_again.parquet' (ROW_GROUP_SIZE 99979)
# create views that read the batches
# v1: all 1M rows; v2: every even 10k-block of i; v3: v2 plus two extra ranges
statement ok
CREATE VIEW v1 AS SELECT * FROM parquet_scan(['__TEST_DIR__/mix_batches_small.parquet', '__TEST_DIR__/mix_batches_large.parquet', '__TEST_DIR__/mix_batches_odd.parquet', '__TEST_DIR__/mix_batches_odd_again.parquet'])
statement ok
CREATE VIEW v2 AS FROM v1 WHERE (i//10000)%2=0;
statement ok
CREATE VIEW v3 AS FROM v1 WHERE (i//10000)%2=0 OR (i>200000 AND i < 400000) OR (i>600000 AND i < 800000);
# empty table
statement ok
CREATE VIEW v4 AS FROM v1 WHERE i>998 AND i<1000 AND i%2=0
# Run the whole battery twice: iteration 0 uses the parquet_scan views above,
# iteration 1 uses the UNION ALL views recreated at the bottom of this loop.
loop i 0 2
# CSV round-trip of v1: write, read back into a table, then check that the
# view and the round-tripped table agree on aggregates and row order.
query I
COPY v1 TO '__TEST_DIR__/mixed_batches_v1.csv' (HEADER)
----
1000000
query I
CREATE TABLE mixed_batches_v1 AS FROM '__TEST_DIR__/mixed_batches_v1.csv'
----
1000000
foreach table v1 mixed_batches_v1
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
499999500000 0 999999 1000000 1000000
query I
SELECT * FROM ${table} LIMIT 5 OFFSET 99998
----
99998
99999
100000
100001
100002
endloop
# now do the same, but filter out half of the values
query I
COPY v2 TO '__TEST_DIR__/mixed_batches_v2.csv' (HEADER)
----
500000
query I
CREATE TABLE mixed_batches_v2 AS FROM '__TEST_DIR__/mixed_batches_v2.csv'
----
500000
foreach table v2 mixed_batches_v2
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
247499750000 0 989999 500000 500000
query I
SELECT * FROM ${table} LIMIT 5 OFFSET 99998
----
189998
189999
200000
200001
200002
endloop
# do it again, but this time only filter out SOME small batches
query I
COPY v3 TO '__TEST_DIR__/mixed_batches_v3.csv' (HEADER)
----
700000
query I
CREATE TABLE mixed_batches_v3 AS FROM '__TEST_DIR__/mixed_batches_v3.csv'
----
700000
foreach table v3 mixed_batches_v3
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
348499650000 0 989999 700000 700000
query I
SELECT * FROM ${table} LIMIT 5 OFFSET 9999
----
9999
20000
20001
20002
20003
endloop
# now with an empty table
query I
COPY v4 TO '__TEST_DIR__/mixed_batches_v4.csv' (HEADER)
----
0
# v4 is empty, so cast i explicitly rather than relying on CSV type sniffing
query I
CREATE TABLE mixed_batches_v4 AS SELECT i::BIGINT as i FROM read_csv_auto('__TEST_DIR__/mixed_batches_v4.csv')
----
0
foreach table v4 mixed_batches_v4
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
NULL NULL NULL 0 0
query I
SELECT * FROM ${table} LIMIT 5
----
endloop
# clean up the round-tripped tables so the next loop iteration can recreate them
statement ok
DROP TABLE mixed_batches_v1
statement ok
DROP TABLE mixed_batches_v2
statement ok
DROP TABLE mixed_batches_v3
statement ok
DROP TABLE mixed_batches_v4
statement ok
drop view if exists v2;
statement ok
drop view if exists v3;
statement ok
drop view if exists v4;
# create views that read the batches using unions
statement ok
CREATE OR REPLACE VIEW v1 AS FROM '__TEST_DIR__/mix_batches_small.parquet' UNION ALL FROM '__TEST_DIR__/mix_batches_large.parquet' UNION ALL FROM '__TEST_DIR__/mix_batches_odd.parquet' UNION ALL FROM '__TEST_DIR__/mix_batches_odd_again.parquet'
statement ok
CREATE OR REPLACE VIEW v2 AS FROM v1 WHERE (i//10000)%2=0;
statement ok
CREATE OR REPLACE VIEW v3 AS FROM v1 WHERE (i//10000)%2=0 OR (i>200000 AND i < 400000) OR (i>600000 AND i < 800000);
statement ok
CREATE OR REPLACE VIEW v4 AS FROM v1 WHERE i>998 AND i<1000 AND i%2=0
endloop

View File

@@ -0,0 +1,89 @@
# name: test/sql/copy/csv/batched_write/batch_csv_write.test_slow
# description: Batched copy to file
# group: [batched_write]
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE integers AS SELECT i, i // 5 AS j FROM range(1000000) t(i) ;
# Round-trip 1M rows through a CSV file, then verify aggregates, the first
# rows, and rows at an offset deep into the file.
statement ok
COPY integers TO '__TEST_DIR__/batched_integers.csv' (HEADER);
statement ok
CREATE TABLE integers_copied AS FROM '__TEST_DIR__/batched_integers.csv'
query IIIII
SELECT SUM(i), SUM(j), COUNT(*), COUNT(i), COUNT(j) FROM integers_copied
----
499999500000 99999500000 1000000 1000000 1000000
query II
SELECT * FROM integers_copied LIMIT 5
----
0 0
1 0
2 0
3 0
4 0
query II
SELECT * FROM integers_copied LIMIT 5 OFFSET 99997
----
99997 19999
99998 19999
99999 19999
100000 20000
100001 20000
# ordering check: i must be strictly increasing across the copied rows,
# so this query must return no rows
query II
SELECT * FROM integers_copied QUALIFY i<=lag(i) over ()
----
# now with filters
statement ok
CREATE VIEW v1 AS SELECT * FROM integers WHERE (i%2=0 AND i<300000) OR (i BETWEEN 500000 AND 700000)
statement ok
COPY v1 TO '__TEST_DIR__/batched_integers_filters.csv' (HEADER);
statement ok
CREATE TABLE integers_filtered AS FROM '__TEST_DIR__/batched_integers_filters.csv'
# the filtered view and its CSV round-trip must agree on every check below
foreach table v1 integers_filtered
query IIIII
SELECT SUM(i), SUM(j), COUNT(*), COUNT(i), COUNT(j) FROM ${table}
----
142500450000 28499950000 350001 350001 350001
query II
SELECT * FROM ${table} LIMIT 5
----
0 0
2 0
4 0
6 1
8 1
query II
SELECT * FROM ${table} LIMIT 5 OFFSET 99997
----
199994 39998
199996 39999
199998 39999
200000 40000
200002 40000
query II
SELECT * FROM ${table} LIMIT 5 OFFSET 300000
----
650000 130000
650001 130000
650002 130000
650003 130000
650004 130000
endloop

View File

@@ -0,0 +1,161 @@
# name: test/sql/copy/csv/batched_write/batch_json_mixed_batches.test_slow
# description: Test batch JSON write with mixed batches
# group: [batched_write]
require parquet
require json
statement ok
PRAGMA enable_verification
# Write 1M consecutive integers across four parquet files with very different
# row-group sizes, so that scanning them produces mixed/uneven batches.
statement ok
COPY (FROM range(100000) tbl(i)) TO '__TEST_DIR__/mix_batches_small.parquet' (ROW_GROUP_SIZE 5000)
statement ok
COPY (FROM range(100000, 400000) tbl(i)) TO '__TEST_DIR__/mix_batches_large.parquet' (ROW_GROUP_SIZE 200000)
statement ok
COPY (FROM range(400000, 700000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd.parquet' (ROW_GROUP_SIZE 999)
statement ok
COPY (FROM range(700000, 1000000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd_again.parquet' (ROW_GROUP_SIZE 99979)
# create views that read the batches
# v1: all 1M rows; v2: every even 10k-block of i; v3: v2 plus two extra ranges
statement ok
CREATE VIEW v1 AS SELECT * FROM parquet_scan(['__TEST_DIR__/mix_batches_small.parquet', '__TEST_DIR__/mix_batches_large.parquet', '__TEST_DIR__/mix_batches_odd.parquet', '__TEST_DIR__/mix_batches_odd_again.parquet'])
statement ok
CREATE VIEW v2 AS FROM v1 WHERE (i//10000)%2=0;
statement ok
CREATE VIEW v3 AS FROM v1 WHERE (i//10000)%2=0 OR (i>200000 AND i < 400000) OR (i>600000 AND i < 800000);
# empty table
statement ok
CREATE VIEW v4 AS FROM v1 WHERE i>998 AND i<1000 AND i%2=0
# Run everything for both JSON output layouts (the ARRAY option of COPY):
# ARRAY TRUE writes one JSON array, ARRAY FALSE writes one object per line.
foreach ARRAY_SETTING TRUE FALSE
query I
COPY v1 TO '__TEST_DIR__/mixed_batches_v1.json' (ARRAY ${ARRAY_SETTING})
----
1000000
query I
CREATE TABLE mixed_batches_v1 AS FROM '__TEST_DIR__/mixed_batches_v1.json'
----
1000000
foreach table v1 mixed_batches_v1
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
499999500000 0 999999 1000000 1000000
query I
SELECT * FROM ${table} LIMIT 5 OFFSET 99998
----
99998
99999
100000
100001
100002
endloop
# now do the same, but filter out half of the values
query I
COPY v2 TO '__TEST_DIR__/mixed_batches_v2.json' (ARRAY ${ARRAY_SETTING})
----
500000
query I
CREATE TABLE mixed_batches_v2 AS FROM '__TEST_DIR__/mixed_batches_v2.json'
----
500000
foreach table v2 mixed_batches_v2
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
247499750000 0 989999 500000 500000
query I
SELECT * FROM ${table} LIMIT 5 OFFSET 99998
----
189998
189999
200000
200001
200002
endloop
# do it again, but this time only filter out SOME small batches
query I
COPY v3 TO '__TEST_DIR__/mixed_batches_v3.json' (ARRAY ${ARRAY_SETTING})
----
700000
query I
CREATE TABLE mixed_batches_v3 AS FROM '__TEST_DIR__/mixed_batches_v3.json'
----
700000
foreach table v3 mixed_batches_v3
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
348499650000 0 989999 700000 700000
query I
SELECT * FROM ${table} LIMIT 5 OFFSET 9999
----
9999
20000
20001
20002
20003
endloop
# now with an empty table
query I
COPY v4 TO '__TEST_DIR__/mixed_batches_v4.json' (ARRAY ${ARRAY_SETTING})
----
0
query I
CREATE TABLE mixed_batches_v4 AS SELECT i::BIGINT as i FROM '__TEST_DIR__/mixed_batches_v4.json' t(i)
----
0
foreach table v4 mixed_batches_v4
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
NULL NULL NULL 0 0
query I
SELECT * FROM ${table} LIMIT 5
----
endloop
# clean up the round-tripped tables so the next ARRAY_SETTING can recreate them
statement ok
DROP TABLE mixed_batches_v1
statement ok
DROP TABLE mixed_batches_v2
statement ok
DROP TABLE mixed_batches_v3
statement ok
DROP TABLE mixed_batches_v4
endloop

View File

@@ -0,0 +1,32 @@
# name: test/sql/copy/csv/batched_write/csv_write_memory_limit.test_slow
# description: Verify data is streamed and memory limit is not exceeded in CSV write
# group: [batched_write]
require parquet
require 64bit
statement ok
PRAGMA enable_verification
# 100M rows, 2 BIGINT columns = 1.6GB uncompressed
statement ok
COPY (SELECT i, i // 5 AS j FROM range(100000000) t(i)) TO '__TEST_DIR__/large_integers.parquet'
# set a memory limit of 300MB
# (far below the 1.6GB of data, so the COPY below must stream, not buffer)
statement ok
SET memory_limit='300MB'
# stream from the parquet file to a CSV file
query I
COPY '__TEST_DIR__/large_integers.parquet' TO '__TEST_DIR__/large_integers.csv'
----
100000000
# verify that the file is correctly written
# (lift the memory limit first; '-1' presumably means unlimited — confirm)
statement ok
SET memory_limit='-1'
# EXCEPT must return no rows: the CSV contents equal the parquet contents
query II
SELECT * FROM '__TEST_DIR__/large_integers.parquet' EXCEPT FROM '__TEST_DIR__/large_integers.csv'
----