should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion


@@ -0,0 +1,91 @@
# name: test/sql/copy/parquet/writer/list_of_bools.test_slow
# description: Parquet write list of bools
# group: [writer]
require parquet
# big list of bools
statement ok
CREATE TABLE list_of_bools AS
SELECT LIST(i%2==0) l FROM range(1373) tbl(i)
UNION ALL
SELECT [true, false, NULL, false, true]
UNION ALL
SELECT []
UNION ALL
SELECT NULL
UNION ALL
SELECT LIST(i%3==0) l FROM range(9937) tbl(i)
UNION ALL
SELECT [true, false, NULL, false, true]
query III
SELECT COUNT(*), COUNT(b), SUM(CASE WHEN b THEN 1 ELSE 0 END)
FROM (SELECT unnest(l) b FROM list_of_bools)
----
11320 11318 4004
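# (annotation added for clarity, not part of the original test) the expected counts:
# 1373 + 5 + 0 + 0 + 9937 + 5 = 11320 unnested elements, of which 2 are NULL -> 11318;
# trues: 687 even values in range(1373) + 2 + 3313 multiples of 3 in range(9937) + 2 = 4004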
statement ok
COPY list_of_bools TO '__TEST_DIR__/list_of_bools.parquet' (FORMAT PARQUET)
query III
SELECT COUNT(*), COUNT(b), SUM(CASE WHEN b THEN 1 ELSE 0 END)
FROM (SELECT unnest(l) b FROM '__TEST_DIR__/list_of_bools.parquet')
----
11320 11318 4004
# many lists of integers
statement ok
CREATE TABLE many_ints AS
SELECT [1, 0, 1] AS l FROM range(1373)
UNION ALL
SELECT []
UNION ALL
SELECT NULL
UNION ALL
SELECT [1, 0, NULL, 0, 1]
UNION ALL
SELECT [1, 0, NULL, 1] l FROM range(9937) tbl(i)
query III
SELECT COUNT(*), COUNT(b), SUM(b)
FROM (SELECT unnest(l) b FROM many_ints)
----
43872 33934 22622
statement ok
COPY many_ints TO '__TEST_DIR__/many_ints.parquet' (FORMAT PARQUET)
query III
SELECT COUNT(*), COUNT(b), SUM(b)
FROM (SELECT unnest(l) b FROM '__TEST_DIR__/many_ints.parquet')
----
43872 33934 22622
# many lists of bools
statement ok
CREATE TABLE many_bools AS
SELECT [true, false, true] AS l FROM range(1373)
UNION ALL
SELECT []
UNION ALL
SELECT NULL
UNION ALL
SELECT [true, false, NULL, false, true]
UNION ALL
SELECT [true, false, NULL, true] l FROM range(9937) tbl(i)
query III
SELECT COUNT(*), COUNT(b), SUM(CASE WHEN b THEN 1 ELSE 0 END)
FROM (SELECT unnest(l) b FROM many_bools)
----
43872 33934 22622
statement ok
COPY many_bools TO '__TEST_DIR__/many_bools.parquet' (FORMAT PARQUET)
query III
SELECT COUNT(*), COUNT(b), SUM(CASE WHEN b THEN 1 ELSE 0 END)
FROM (SELECT unnest(l) b FROM '__TEST_DIR__/many_bools.parquet')
----
43872 33934 22622


@@ -0,0 +1,20 @@
# name: test/sql/copy/parquet/writer/parquet_large_blobs.test_slow
# description: Test writing of large blobs into parquet files
# group: [writer]
require parquet
statement ok
CREATE TABLE large_strings AS SELECT repeat('duckduck', 10000+i) i FROM range(4000) tbl(i);
query III nosort minmaxstrlen
SELECT MIN(strlen(i)), MAX(strlen(i)), AVG(strlen(i)) FROM large_strings;
statement ok
COPY large_strings TO '__TEST_DIR__/largestrings.parquet' (FORMAT PARQUET);
statement ok
SELECT * FROM parquet_metadata('__TEST_DIR__/largestrings.parquet');
query III nosort minmaxstrlen
SELECT MIN(strlen(i)), MAX(strlen(i)), AVG(strlen(i)) FROM large_strings;


@@ -0,0 +1,58 @@
# name: test/sql/copy/parquet/writer/parquet_test_all_types.test
# description: Parquet test_all_types function
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
# intervals are saved with millisecond precision -> truncate microsecond precision to milliseconds
statement ok
CREATE TABLE all_types AS
SELECT * EXCLUDE (bit, "union") REPLACE (
case when extract(month from interval) <> 0 then interval '1 month 1 day 12:13:34.123' else interval end AS interval
)
FROM test_all_types();
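# a minimal round-trip sketch of the millisecond truncation described above
# (added for clarity, not part of the original test; the file name is hypothetical)
statement ok
COPY (SELECT interval '00:00:01.234567' i) TO '__TEST_DIR__/interval_ms.parquet'
query I
SELECT i FROM '__TEST_DIR__/interval_ms.parquet'
----
00:00:01.234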
statement ok
COPY all_types TO "__TEST_DIR__/all_types.parquet" (FORMAT PARQUET);
# we have to make some replacements to get result equivalence
# hugeint/uhugeint is stored as double -> we have to cast
# TIME WITH TIME ZONE loses the offset
query I nosort alltypes
SELECT * REPLACE (
hugeint::DOUBLE AS hugeint,
uhugeint::DOUBLE AS uhugeint,
time_tz::TIME::TIMETZ AS time_tz
)
FROM all_types
----
query I nosort alltypes
SELECT *
FROM '__TEST_DIR__/all_types.parquet'
----
foreach type TINYINT SMALLINT INT BIGINT UTINYINT USMALLINT UINT UBIGINT HUGEINT UHUGEINT FLOAT DOUBLE
query II
explain select "${type}" from '__TEST_DIR__/all_types.parquet'
WHERE "${type}" IN (127);
----
physical_plan <REGEX>:.*PARQUET_SCAN.*Filters.*
endloop
query II
explain select "VARCHAR" from '__TEST_DIR__/all_types.parquet'
WHERE "VARCHAR" IN ('🦆🦆🦆🦆🦆🦆');
----
physical_plan <REGEX>:.*PARQUET_SCAN.*Filters.*
query II
explain select "bool" from '__TEST_DIR__/all_types.parquet'
WHERE "bool" IN (true);
----
physical_plan <REGEX>:.*PARQUET_SCAN.*Filters.*


@@ -0,0 +1,34 @@
# name: test/sql/copy/parquet/writer/parquet_write_booleans.test
# description: Parquet bools round trip
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE bools(b BOOL)
statement ok
INSERT INTO bools SELECT CASE WHEN i%2=0 THEN NULL ELSE i%7=0 OR i%3=0 END b FROM range(10000) tbl(i);
query IIIIII
SELECT COUNT(*), COUNT(b), BOOL_AND(b), BOOL_OR(b), SUM(CASE WHEN b THEN 1 ELSE 0 END) true_count, SUM(CASE WHEN b THEN 0 ELSE 1 END) false_count
FROM bools
----
10000 5000 False True 2143 7857
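# note (annotation added for clarity): CASE WHEN b THEN 0 ELSE 1 END maps NULL to 1,
# so false_count counts the 5000 NULLs as well: 2857 false values + 5000 NULLs = 7857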
statement ok
COPY bools TO '__TEST_DIR__/bools.parquet' (FORMAT 'parquet');
query IIIIII
SELECT COUNT(*), COUNT(b), BOOL_AND(b), BOOL_OR(b), SUM(CASE WHEN b THEN 1 ELSE 0 END) true_count, SUM(CASE WHEN b THEN 0 ELSE 1 END) false_count
FROM '__TEST_DIR__/bools.parquet'
----
10000 5000 False True 2143 7857
query I
SELECT typeof(b) FROM '__TEST_DIR__/bools.parquet' LIMIT 1
----
BOOLEAN


@@ -0,0 +1,49 @@
# name: test/sql/copy/parquet/writer/parquet_write_compression_level.test
# description: Parquet compression level
# group: [writer]
require parquet
# NOTE: since updating ZSTD, compression levels between -131072 and 22 are
# supported; this test has been updated accordingly
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE integers AS FROM range(100) t(i)
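# (comment added for clarity) COMPRESSION_LEVEL is only supported in combination
# with the ZSTD codec, so specifying it without CODEC ZSTD should fail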
statement error
COPY integers TO '__TEST_DIR__/compress_level.parquet' (FORMAT 'parquet', COMPRESSION_LEVEL 10);
----
only supported
statement ok
COPY integers TO '__TEST_DIR__/compress_level.parquet' (FORMAT 'parquet', CODEC ZSTD, COMPRESSION_LEVEL 0);
statement error
COPY integers TO '__TEST_DIR__/compress_level.parquet' (FORMAT 'parquet', CODEC ZSTD, COMPRESSION_LEVEL 23);
----
level must be between
statement ok
COPY integers TO '__TEST_DIR__/compress_level.parquet' (FORMAT 'parquet', CODEC ZSTD, COMPRESSION_LEVEL -131072);
statement error
COPY integers TO '__TEST_DIR__/compress_level.parquet' (FORMAT 'parquet', CODEC ZSTD, COMPRESSION_LEVEL -131073);
----
level must be between
statement ok
COPY integers TO '__TEST_DIR__/compress_level.parquet' (FORMAT 'parquet', CODEC ZSTD, COMPRESSION_LEVEL 1);
statement ok
COPY integers TO '__TEST_DIR__/compress_level2.parquet' (FORMAT 'parquet', CODEC ZSTD, COMPRESSION_LEVEL 22);
query I nosort clevel
SELECT * FROM '__TEST_DIR__/compress_level.parquet'
----
query I nosort clevel
SELECT * FROM '__TEST_DIR__/compress_level2.parquet'
----


@@ -0,0 +1,35 @@
# name: test/sql/copy/parquet/writer/parquet_write_date.test
# description: Parquet dates round trip
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE dates(d DATE)
statement ok
INSERT INTO dates VALUES (DATE '1992-01-01'), (DATE '1900-01-01'), (NULL), (DATE '2020-09-27')
query I nosort date_scan
SELECT * FROM dates
----
statement ok
COPY dates TO '__TEST_DIR__/dates.parquet' (FORMAT 'parquet');
query I nosort date_scan
SELECT * FROM '__TEST_DIR__/dates.parquet'
----
query I
SELECT typeof(d) FROM '__TEST_DIR__/dates.parquet' LIMIT 1
----
DATE
query I
SELECT * FROM '__TEST_DIR__/dates.parquet' WHERE d='1992-01-01'
----
1992-01-01


@@ -0,0 +1,87 @@
# name: test/sql/copy/parquet/writer/parquet_write_decimals.test
# description: Parquet decimal types round trip
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE decimals(
dec4 DECIMAL(4,1),
dec9 DECIMAL(9,2),
dec18 DECIMAL(18,3),
dec38 DECIMAL(38,4)
);
statement ok
INSERT INTO decimals VALUES (
-999.9,
-9999999.99,
-999999999999999.999,
-999999999999999999999999999999999.9999
), (
NULL, NULL, NULL, NULL
), (
42, 42, 42, 42
), (
-42, -42, -42, -42
), (
0, 0, 0, 0
), (
999.9,
9999999.99,
999999999999999.999,
999999999999999999999999999999999.9999
);
statement ok
COPY decimals TO '__TEST_DIR__/decimals.parquet';
query IIII nosort decimal_scan
SELECT * FROM decimals;
query IIII nosort decimal_scan
SELECT * FROM '__TEST_DIR__/decimals.parquet';
query IIII
SELECT stats_min, stats_max, stats_min_value, stats_max_value FROM parquet_metadata('__TEST_DIR__/decimals.parquet');
----
-999.9 999.9 -999.9 999.9
-9999999.99 9999999.99 -9999999.99 9999999.99
-999999999999999.999 999999999999999.999 -999999999999999.999 999999999999999.999
-999999999999999999999999999999999.9999 999999999999999999999999999999999.9999 -999999999999999999999999999999999.9999 999999999999999999999999999999999.9999
# filter pushdown
statement ok
DELETE FROM decimals WHERE dec4<-42 OR dec4>42
statement ok
COPY decimals TO '__TEST_DIR__/decimals.parquet';
foreach dec_column dec4 dec9 dec18 dec38
query IIII
SELECT * FROM '__TEST_DIR__/decimals.parquet' WHERE ${dec_column}=42
----
42 42 42 42
query IIII
SELECT * FROM '__TEST_DIR__/decimals.parquet' WHERE ${dec_column}=-43
----
query IIII
SELECT * FROM '__TEST_DIR__/decimals.parquet' WHERE ${dec_column}=43
----
endloop
# check statistics
statement ok
PRAGMA disable_verification
query IIII
SELECT stats(dec4), stats(dec9), stats(dec18), stats(dec38) FROM '__TEST_DIR__/decimals.parquet' LIMIT 1
----
[Min: -42.0, Max: 42.0][Has Null: true, Has No Null: true] [Min: -42.00, Max: 42.00][Has Null: true, Has No Null: true] [Min: -42.000, Max: 42.000][Has Null: true, Has No Null: true] [Min: -42.0000, Max: 42.0000][Has Null: true, Has No Null: true]


@@ -0,0 +1,141 @@
# name: test/sql/copy/parquet/writer/parquet_write_enums.test
# description: ENUM tests
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
# standard enum
statement ok
CREATE TYPE mood AS ENUM ('joy', 'ok', 'happy');
statement ok
CREATE TABLE enums(m mood);
statement ok
INSERT INTO enums VALUES
('happy'), ('happy'), ('joy'), ('joy'),
('happy'), ('happy'), ('joy'), ('joy'),
('happy'), ('happy'), ('joy'), ('joy'),
('happy'), ('happy'), ('joy'), ('joy'),
('happy'), ('happy'), ('joy'), ('joy'),
('happy'), ('happy'), ('joy'), ('joy'),
('happy'), ('happy'), ('joy'), ('joy'), ('joy')
statement ok
COPY enums TO '__TEST_DIR__/enums.parquet' (FORMAT PARQUET);
query I
SELECT * FROM '__TEST_DIR__/enums.parquet'
----
happy
happy
joy
joy
happy
happy
joy
joy
happy
happy
joy
joy
happy
happy
joy
joy
happy
happy
joy
joy
happy
happy
joy
joy
happy
happy
joy
joy
joy
# enum with null values
statement ok
UPDATE enums SET m=NULL WHERE m='joy'
statement ok
COPY enums TO '__TEST_DIR__/enums.parquet' (FORMAT PARQUET);
query I
SELECT * FROM '__TEST_DIR__/enums.parquet'
----
happy
happy
NULL
NULL
happy
happy
NULL
NULL
happy
happy
NULL
NULL
happy
happy
NULL
NULL
happy
happy
NULL
NULL
happy
happy
NULL
NULL
happy
happy
NULL
NULL
NULL
# all values are null
statement ok
UPDATE enums SET m=NULL
statement ok
COPY enums TO '__TEST_DIR__/enums.parquet' (FORMAT PARQUET);
query I
SELECT * FROM '__TEST_DIR__/enums.parquet'
----
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL


@@ -0,0 +1,365 @@
# name: test/sql/copy/parquet/writer/parquet_write_field_id.test
# description: Parquet writer FIELD_IDS tests
# group: [writer]
require parquet
# need to supply an argument
statement error
copy (select range as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS)
----
Binder Error
# j is not present so we can't have a field id
statement error
copy (select range as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {j:42})
----
Binder Error
# this should work
statement ok
copy (select range as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:42})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i'
----
42
# needs to be castable to integer, so this works
statement ok
copy (select range as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:'42'})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i'
----
42
# but this doesn't
statement error
copy (select range as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:'abc'})
----
Invalid Input Error
# we can do casts
statement ok
copy (select range as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:42::hugeint})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i'
----
42
# wrong casts should lead to ConversionException
statement error
copy (select range as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:1024::utinyint})
----
Conversion Error
# field id can't be a colref
statement error
copy (select range as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:i})
----
Could not convert string 'i' to INT32
# an arbitrary string (other than 'auto') shouldn't work
statement error
copy (select range as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS 'oops')
----
Binder Error
# can't have duplicate field id keys
statement error
copy (select range as i, range as j from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:42,i:43})
----
Binder Error
# can't have duplicate field id values either
statement error
copy (select range as i, range as j from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:42,j:41+1})
----
Binder Error
# we don't have to supply a field_id for all columns
statement ok
copy (select range as i, range as j from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:42})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i'
----
42
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'j'
----
NULL
# but we can
statement ok
copy (select range as i, range as j from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:42,j:43})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i'
----
42
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'j'
----
43
# we can also specify the column explicitly with the __duckdb_field_id syntax
statement ok
copy (select range as i, range as j from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:{__duckdb_field_id:42},j:{__duckdb_field_id:43}})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i'
----
42
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'j'
----
43
# i is not a nested type, so we can't specify nested field ids
statement error
copy (select range as i, range as j from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:{__duckdb_field_id:42,j:43}})
----
Binder Error
# we tested a non-nested column type; now do all the nested types so we cover all the code paths
# list
statement ok
copy (select range(range, range + 3) as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:{__duckdb_field_id:42,element:43}})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i'
----
42
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'element'
----
43
# we don't have to specify a field_id for the top-level list; we can also specify it only for the nested children
statement ok
copy (select range(range, range + 3) as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:{element:43}})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i'
----
NULL
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'element'
----
43
# list child is always called "element"
statement error
copy (select range(range, range + 3) as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:{__duckdb_field_id:42,elem:43}})
----
Binder Error: Column name "elem" specified in FIELD_IDS not found. Consider using WRITE_PARTITION_COLUMNS if this column is a partition column. Available column names: [element]
# struct
statement ok
copy (select {f : range} as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:{__duckdb_field_id:42,f:43}})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i' and num_children > 0
----
42
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'f'
----
43
# struct does not have child "g"
statement error
copy (select {f : range} as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:{__duckdb_field_id:42,g:43}})
----
Binder Error: Column name "g" specified in FIELD_IDS not found. Consider using WRITE_PARTITION_COLUMNS if this column is a partition column. Available column names: [f]
# map
statement ok
copy (select map {range : 10 - range} as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:{__duckdb_field_id:42,key:43,value:44}})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i' and num_children > 0
----
42
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'key'
----
43
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'value'
----
44
# map type children need to be called "key" and "value"
statement error
copy (select map {range : 10 - range} as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:{__duckdb_field_id:42,k:43,v:44}})
----
Binder Error: Column name "k" specified in FIELD_IDS not found.
# test auto-generation (flat)
statement ok
copy (select range as i, range as j from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS 'auto')
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i'
----
0
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'j'
----
1
# big nestedness
statement ok
set variable field_id_values={i:{__duckdb_field_id:42,key:43,value:{__duckdb_field_id:44,element:{__duckdb_field_id:45,j:46}}}}
statement ok
copy (select map {'my_key' : [{j : 42}]} as i) to '__TEST_DIR__/my.parquet' (FIELD_IDS getvariable('field_id_values'))
query II
select name, field_id from parquet_schema('__TEST_DIR__/my.parquet') where name in ('i', 'key', 'value', 'element', 'j') order by field_id
----
i 42
key 43
value 44
element 45
j 46
# we can't specify "f" at the top level: it is a child of the struct column "i"
statement error
copy (select {f : range} as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:{__duckdb_field_id:42}, f:43})
----
Binder Error
# needs to be called exactly "__duckdb_field_id"
statement error
copy (select {f : range} as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i:{field_id:42, f:43}})
----
Binder Error
# test auto-generation (list)
statement ok
copy (select range(range, range + 3) as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS 'auto')
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i'
----
0
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'element'
----
1
# test auto-generation (struct)
statement ok
copy (select {f : range} as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS 'auto')
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i' and num_children > 0
----
0
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'f'
----
1
# test auto-generation (map)
statement ok
copy (select map {range : 10 - range} as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS 'auto')
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i' and num_children > 0
----
0
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'key'
----
1
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'value'
----
2
# test auto-generation (big nestedness)
statement ok
copy (select map {'my_key' : [{j : 42}]} as i) to '__TEST_DIR__/my.parquet' (FIELD_IDS 'auto')
query II
select name, field_id from parquet_schema('__TEST_DIR__/my.parquet') where name in ('i', 'key', 'value', 'element', 'j') order by field_id
----
i 0
key 1
value 2
element 3
j 4
# cannot have a column named "__duckdb_field_id"
statement error
copy (select range as __duckdb_field_id from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {__duckdb_field_id : 42})
----
Binder Error
statement error
copy (select {__duckdb_field_id : range} as __duckdb_field_id from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {__duckdb_field_id : {__duckdb_field_id : 42}})
----
Binder Error
# we should be case insensitive here (it's just DuckDB col names / struct col names)
statement ok
copy (select range as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {"I" : 42})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i'
----
42
statement ok
copy (select range as "I" from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i : 42})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'I'
----
42
statement ok
copy (select {f : range} as i from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {"I" : {__duckdb_field_id: 42, "F": 43}})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'i' and num_children > 0
----
42
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'f'
----
43
statement ok
copy (select {"F" : range} as "I" from range(10)) to '__TEST_DIR__/my.parquet' (FIELD_IDS {i : {__duckdb_field_id: 42, f: 43}})
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'I' and num_children > 0
----
42
query I
select field_id from parquet_schema('__TEST_DIR__/my.parquet') where name = 'F'
----
43


@@ -0,0 +1,63 @@
# name: test/sql/copy/parquet/writer/parquet_write_home_directory.test
# description: Parquet writer home directory
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
statement ok
SET home_directory='__TEST_DIR__'
statement ok
CREATE TABLE integers AS SELECT * FROM range(10)
statement ok
COPY integers TO '__TEST_DIR__/integers.parquet' (FORMAT PARQUET);
query I
SELECT * FROM '~/integers.parquet'
----
0
1
2
3
4
5
6
7
8
9
statement ok
CREATE TABLE integers_load(i INTEGER);
statement ok
COPY integers_load FROM '~/integers.parquet'
query I
SELECT * FROM integers_load
----
0
1
2
3
4
5
6
7
8
9
# glob from home directory
statement ok
COPY integers TO '__TEST_DIR__/homedir_integers1.parquet'
statement ok
COPY integers TO '__TEST_DIR__/homedir_integers2.parquet'
query I
SELECT COUNT(*) FROM '~/homedir_integers*.parquet'
----
20


@@ -0,0 +1,31 @@
# name: test/sql/copy/parquet/writer/parquet_write_hugeint.test
# description: Parquet hugeint round trip
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE hugeints(h HUGEINT)
statement ok
INSERT INTO hugeints VALUES (-1180591620717411303424), (0), (NULL), (1180591620717411303424)
statement ok
COPY hugeints TO '__TEST_DIR__/hugeints.parquet' (FORMAT 'parquet');
query I
SELECT * FROM '__TEST_DIR__/hugeints.parquet'
----
-1180591620717411303424
0
NULL
1180591620717411303424
query I
SELECT typeof(h) FROM '__TEST_DIR__/hugeints.parquet' LIMIT 1
----
DOUBLE


@@ -0,0 +1,39 @@
# name: test/sql/copy/parquet/writer/parquet_write_interval.test
# description: Parquet interval round trip
# group: [writer]
statement ok
SET default_null_order='nulls_first';
require parquet
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE IF NOT EXISTS intervals (i interval);
statement ok
INSERT INTO intervals VALUES
(interval '1' day),
(interval '00:00:01'),
(NULL),
(interval '0' month),
(interval '1' month)
statement ok
COPY intervals TO '__TEST_DIR__/intervals.parquet'
query I
SELECT * FROM '__TEST_DIR__/intervals.parquet' ORDER BY 1
----
NULL
00:00:00
00:00:01
1 day
1 month
statement error
COPY (SELECT -interval '1 day') TO '__TEST_DIR__/intervals.parquet'
----
<REGEX>:.*IO Error.*do not support negative intervals.*


@@ -0,0 +1,74 @@
# name: test/sql/copy/parquet/writer/parquet_write_issue_5779.test
# description: Fix #5779: write subsection of list vector to Parquet
# group: [writer]
require parquet
statement ok
CREATE TABLE empty_lists(i INTEGER[]);
statement ok
INSERT INTO empty_lists SELECT [] FROM range(10) UNION ALL SELECT [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
statement ok
COPY (SELECT * FROM empty_lists LIMIT 10) TO '__TEST_DIR__/emptylist_int.parquet';
query I
SELECT * FROM '__TEST_DIR__/emptylist_int.parquet'
----
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
statement ok
CREATE TABLE empty_lists_varchar(i VARCHAR[]);
statement ok
INSERT INTO empty_lists_varchar SELECT [] FROM range(10) UNION ALL SELECT ['hello', 'world', 'this', 'is', 'a', 'varchar', 'list']
statement ok
COPY (SELECT * FROM empty_lists_varchar LIMIT 10) TO '__TEST_DIR__/emptylist_varchar.parquet';
query I
SELECT * FROM '__TEST_DIR__/emptylist_varchar.parquet'
----
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
statement ok
CREATE TABLE empty_list_nested(i INT[][]);
statement ok
INSERT INTO empty_list_nested SELECT [] FROM range(10) UNION ALL SELECT [[1, 2, 3], [4, 5], [6, 7, 8]]
statement ok
COPY (SELECT * FROM empty_list_nested LIMIT 10) TO '__TEST_DIR__/empty_list_nested.parquet';
query I
SELECT * FROM '__TEST_DIR__/empty_list_nested.parquet'
----
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


@@ -0,0 +1,35 @@
# name: test/sql/copy/parquet/writer/parquet_write_memory_limit.test_slow
# description: Verify data is streamed and memory limit is not exceeded in Parquet write
# group: [writer]
require parquet
require 64bit
load __TEST_DIR__/parquet_write_memory_limit.db
# 100M rows, 2 BIGINT columns = 100,000,000 * 2 * 8 bytes = 1.6GB uncompressed
statement ok
COPY (SELECT i, i // 5 AS j FROM range(100000000) t(i)) TO '__TEST_DIR__/large_integers.parquet'
statement ok
SET memory_limit='0.3GB'
# we need to do this, otherwise we buffer a lot more data in a BatchedDataCollection
# by disabling order preservation we can immediately flush the ColumnDataCollections
statement ok
set preserve_insertion_order=false
# stream from one parquet file to another
query I
COPY '__TEST_DIR__/large_integers.parquet' TO '__TEST_DIR__/large_integers2.parquet'
----
100000000
# verify that the file is correctly written
statement ok
SET memory_limit='-1'
query II
SELECT * FROM '__TEST_DIR__/large_integers.parquet' EXCEPT FROM '__TEST_DIR__/large_integers2.parquet'
----


@@ -0,0 +1,28 @@
# name: test/sql/copy/parquet/writer/parquet_write_memory_usage.test
# description: Parquet writer memory usage
# group: [writer]
require parquet
load __TEST_DIR__/parquet_write_memory_usage.db
statement ok
set threads=1
foreach memory_limit,row_group_size 0.8mb,20480 1.6mb,40960
statement ok
set memory_limit='${memory_limit}'
statement ok
copy (select * from range(163840)) to '__TEST_DIR__/parquet_write_memory_usage.parquet' (row_group_size ${row_group_size})
statement ok
set memory_limit='4gb'
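# (annotation added for clarity) the check below uses the closed form
# 0 + 1 + ... + (n - 1) = n * (n - 1) / 2 for the sum of range(n)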
query T
select sum(range) = (count(*) * (count(*) - 1)) // 2 from '__TEST_DIR__/parquet_write_memory_usage.parquet'
----
true
endloop


@@ -0,0 +1,75 @@
# name: test/sql/copy/parquet/writer/parquet_write_signed.test
# description: Parquet signed types round trip
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE values_TINYINT AS SELECT d::TINYINT d FROM (VALUES
(-128), (42), (NULL), (127)) tbl (d);
statement ok
CREATE TABLE values_SMALLINT AS SELECT d::SMALLINT d FROM (VALUES
(-32768), (42), (NULL), (32767)) tbl (d);
statement ok
CREATE TABLE values_INTEGER AS SELECT d::INTEGER d FROM (VALUES
(-2147483648), (42), (NULL), (2147483647)) tbl (d);
statement ok
CREATE TABLE values_BIGINT AS SELECT d::BIGINT d FROM (VALUES
(-9223372036854775808), (42), (NULL), (9223372036854775807)) tbl (d);
foreach type TINYINT SMALLINT INTEGER BIGINT
statement ok
CREATE OR REPLACE TABLE signed(d ${type})
statement ok
INSERT INTO signed SELECT * FROM values_${type}
statement ok
COPY signed TO '__TEST_DIR__/signed.parquet' (FORMAT 'parquet');
query I
SELECT * FROM '__TEST_DIR__/signed.parquet' EXCEPT SELECT * FROM signed
----
query I
SELECT * FROM signed EXCEPT SELECT * FROM '__TEST_DIR__/signed.parquet'
----
query I
SELECT * FROM '__TEST_DIR__/signed.parquet' WHERE d=42
----
42
query I
SELECT COUNT(*) FROM '__TEST_DIR__/signed.parquet' WHERE d>42
----
1
query I
SELECT COUNT(*) FROM '__TEST_DIR__/signed.parquet' WHERE d>=42
----
2
query I
SELECT COUNT(*) FROM '__TEST_DIR__/signed.parquet' WHERE d<42
----
1
query I
SELECT COUNT(*) FROM '__TEST_DIR__/signed.parquet' WHERE d<=42
----
2
query I
SELECT typeof(d)='${type}' FROM '__TEST_DIR__/signed.parquet' LIMIT 1
----
true
endloop


@@ -0,0 +1,16 @@
# name: test/sql/copy/parquet/writer/parquet_write_string_distinct.test
# description: Write distinct stats for strings
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
statement ok
COPY (SELECT 'hello' FROM range(10)) TO '__TEST_DIR__/string_dict.parquet';
query I
SELECT stats_distinct_count FROM parquet_metadata('__TEST_DIR__/string_dict.parquet');
----
1


@@ -0,0 +1,263 @@
# name: test/sql/copy/parquet/writer/parquet_write_strings.test
# description: Strings tests
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE strings(s VARCHAR);
statement ok
INSERT INTO strings VALUES
('happy'), ('happy'), ('joy'), ('joy'),
('happy'), ('happy'), ('joy'), ('joy'),
('happy'), ('happy'), ('joy'), ('joy'),
('happy'), ('happy'), ('joy'), ('joy'),
('happy'), ('happy'), ('joy'), ('joy'),
('happy'), ('happy'), ('joy'), ('joy'),
('happy'), ('happy'), ('joy'), ('joy'), ('surprise');
statement ok
COPY strings TO '__TEST_DIR__/strings.parquet' (FORMAT PARQUET);
query I
SELECT encodings FROM parquet_metadata('__TEST_DIR__/strings.parquet')
----
RLE_DICTIONARY
query I
SELECT * FROM '__TEST_DIR__/strings.parquet'
----
happy
happy
joy
joy
happy
happy
joy
joy
happy
happy
joy
joy
happy
happy
joy
joy
happy
happy
joy
joy
happy
happy
joy
joy
happy
happy
joy
joy
surprise
query I
SELECT stats_distinct_count FROM parquet_metadata('__TEST_DIR__/strings.parquet')
----
3
# strings with null values
statement ok
UPDATE strings SET s=NULL WHERE s='joy'
statement ok
COPY strings TO '__TEST_DIR__/strings.parquet' (FORMAT PARQUET);
query I
SELECT * FROM '__TEST_DIR__/strings.parquet'
----
happy
happy
NULL
NULL
happy
happy
NULL
NULL
happy
happy
NULL
NULL
happy
happy
NULL
NULL
happy
happy
NULL
NULL
happy
happy
NULL
NULL
happy
happy
NULL
NULL
surprise
# all values are null
statement ok
UPDATE strings SET s=NULL
statement ok
COPY strings TO '__TEST_DIR__/strings.parquet' (FORMAT PARQUET);
query I
SELECT * FROM '__TEST_DIR__/strings.parquet'
----
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
# empty table
statement ok
DELETE FROM strings
statement ok
COPY strings TO '__TEST_DIR__/strings.parquet' (FORMAT PARQUET);
query I
SELECT * FROM '__TEST_DIR__/strings.parquet'
----
# non-dictionary table, also no distinct count
statement ok
DELETE FROM strings
statement ok
INSERT INTO strings VALUES
('0'), ('1'), ('2'), ('3'), ('4'), ('5'), ('6'), ('7'), ('8'), ('9'),
('10'), ('11'), ('12'), ('13'), ('14'), ('15'), ('16'), ('17'), ('18'), ('19'),
('20'), ('21'), ('22'), ('23'), ('24'), ('25'), ('26'), ('27'), ('28'), ('29')
statement ok
COPY strings TO '__TEST_DIR__/strings.parquet' (FORMAT PARQUET);
query I
SELECT encodings FROM parquet_metadata('__TEST_DIR__/strings.parquet')
----
PLAIN
query I
SELECT * FROM '__TEST_DIR__/strings.parquet'
----
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
query I
SELECT stats_distinct_count FROM parquet_metadata('__TEST_DIR__/strings.parquet')
----
NULL
# non-dictionary table with null
statement ok
DELETE FROM strings
statement ok
INSERT INTO strings VALUES
('0'), ('1'), ('2'), (NULL), ('4'), ('5'), ('6'), (NULL), ('8'), ('9'),
('10'), ('11'), ('12'), ('13'), ('14'), ('15'), ('16'), ('17'), ('18'), ('19'),
('20'), (NULL), ('22'), ('23'), ('24'), ('25'), (NULL), ('27'), ('28'), ('29')
statement ok
COPY strings TO '__TEST_DIR__/strings.parquet' (FORMAT PARQUET);
query I
SELECT * FROM '__TEST_DIR__/strings.parquet'
----
0
1
2
NULL
4
5
6
NULL
8
9
10
11
12
13
14
15
16
17
18
19
20
NULL
22
23
24
25
NULL
27
28
29


@@ -0,0 +1,79 @@
# name: test/sql/copy/parquet/writer/parquet_write_timestamp.test
# description: Parquet timestamp round trip
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
foreach type TIMESTAMP TIMESTAMP_MS TIMESTAMP_S
statement ok
CREATE OR REPLACE TABLE timestamps(d ${type})
statement ok
INSERT INTO timestamps VALUES
(TIMESTAMP '1992-01-01 12:03:27'),
(TIMESTAMP '1900-01-01 03:08:47'),
(NULL),
(TIMESTAMP '2020-09-27 13:12:01')
query I nosort ts_scan
SELECT * FROM timestamps
----
statement ok
COPY timestamps TO '__TEST_DIR__/timestamps.parquet' (FORMAT 'parquet');
query I nosort ts_scan
SELECT * FROM '__TEST_DIR__/timestamps.parquet'
----
query I
SELECT * FROM '__TEST_DIR__/timestamps.parquet' WHERE d='1992-01-01 12:03:27'
----
1992-01-01 12:03:27
query I
SELECT typeof(d) FROM '__TEST_DIR__/timestamps.parquet' LIMIT 1
----
TIMESTAMP
endloop
# Nanoseconds are their own type
statement ok
CREATE OR REPLACE TABLE timestamps(d TIMESTAMP_NS)
statement ok
INSERT INTO timestamps VALUES
('1992-01-01 12:03:27.123456789'),
('1900-01-01 03:08:47.987654321'),
(NULL),
('2020-09-27 13:12:01')
query I nosort ns_scan
SELECT * FROM timestamps
----
1992-01-01 12:03:27.123456789
1900-01-01 03:08:47.987654321
NULL
2020-09-27 13:12:01
statement ok
COPY timestamps TO '__TEST_DIR__/timestamps.parquet' (FORMAT 'parquet');
query I nosort ns_scan
SELECT * FROM '__TEST_DIR__/timestamps.parquet'
----
query I
SELECT * FROM '__TEST_DIR__/timestamps.parquet' WHERE d='1992-01-01 12:03:27.123456789'
----
1992-01-01 12:03:27.123456789
query I
SELECT typeof(d) FROM '__TEST_DIR__/timestamps.parquet' LIMIT 1
----
TIMESTAMP_NS


@@ -0,0 +1,67 @@
# name: test/sql/copy/parquet/writer/parquet_write_tpcds.test_slow
# description: Parquet TPC-DS tests
# group: [writer]
require parquet
require tpcds
# answers are generated from Postgres,
# hence check with the NULLS LAST flag
statement ok
PRAGMA default_null_order='NULLS LAST'
statement ok
CREATE SCHEMA tpcds;
statement ok
CALL dsdgen(sf=1, schema='tpcds');
foreach tbl call_center catalog_page catalog_returns catalog_sales customer customer_demographics customer_address date_dim household_demographics inventory income_band item promotion reason ship_mode store store_returns store_sales time_dim warehouse web_page web_returns web_sales web_site
statement ok
COPY tpcds.${tbl} TO '__TEST_DIR__/${tbl}.parquet' (FORMAT 'PARQUET', COMPRESSION 'ZSTD');
statement ok
CREATE VIEW ${tbl} AS SELECT * FROM parquet_scan('__TEST_DIR__/${tbl}.parquet');
endloop
# queries 64 and 85 are skipped because they are too slow
loop i 1 9
query I
PRAGMA tpcds(${i})
----
<FILE>:extension/tpcds/dsdgen/answers/sf1/0${i}.csv
endloop
loop i 10 64
query I
PRAGMA tpcds(${i})
----
<FILE>:extension/tpcds/dsdgen/answers/sf1/${i}.csv
endloop
loop i 65 85
query I
PRAGMA tpcds(${i})
----
<FILE>:extension/tpcds/dsdgen/answers/sf1/${i}.csv
endloop
loop i 86 99
query I
PRAGMA tpcds(${i})
----
<FILE>:extension/tpcds/dsdgen/answers/sf1/${i}.csv
endloop


@@ -0,0 +1,41 @@
# name: test/sql/copy/parquet/writer/parquet_write_tpch.test_slow
# description: Parquet TPC-H tests
# group: [writer]
require parquet
require tpch
statement ok
CREATE SCHEMA tpch;
statement ok
CALL dbgen(sf=1, schema='tpch');
foreach tbl lineitem nation orders supplier part partsupp region customer
statement ok
COPY tpch.${tbl} TO '__TEST_DIR__/${tbl}.parquet' (FORMAT 'PARQUET', COMPRESSION 'ZSTD');
statement ok
CREATE VIEW ${tbl} AS SELECT * FROM parquet_scan('__TEST_DIR__/${tbl}.parquet');
endloop
loop i 1 9
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q0${i}.csv
endloop
loop i 10 23
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q${i}.csv
endloop


@@ -0,0 +1,84 @@
# name: test/sql/copy/parquet/writer/parquet_write_tpch_nested.test_slow
# description: Parquet TPC-H tests
# group: [writer]
require parquet
require tpch
statement ok
CREATE SCHEMA tpch;
statement ok
CALL dbgen(sf=0.1, schema='tpch');
# transform lineitem into a list of structs
statement ok
CREATE VIEW lineitem_array_view AS SELECT LIST({'l_orderkey': l_orderkey,
'l_partkey': l_partkey,
'l_suppkey': l_suppkey,
'l_linenumber': l_linenumber,
'l_quantity': l_quantity,
'l_extendedprice': l_extendedprice,
'l_discount': l_discount,
'l_tax': l_tax,
'l_returnflag': l_returnflag,
'l_linestatus': l_linestatus,
'l_shipdate': l_shipdate,
'l_commitdate': l_commitdate,
'l_receiptdate': l_receiptdate,
'l_shipinstruct': l_shipinstruct,
'l_shipmode': l_shipmode,
'l_comment': l_comment}) lineitem_array FROM tpch.lineitem
statement ok
COPY lineitem_array_view TO '__TEST_DIR__/lineitem.parquet' (FORMAT 'PARQUET', COMPRESSION 'ZSTD');
statement ok
CREATE VIEW lineitem AS SELECT
s.l_orderkey AS l_orderkey,
s.l_partkey AS l_partkey,
s.l_suppkey AS l_suppkey,
s.l_linenumber AS l_linenumber,
s.l_quantity AS l_quantity,
s.l_extendedprice AS l_extendedprice,
s.l_discount AS l_discount,
s.l_tax AS l_tax,
s.l_returnflag AS l_returnflag,
s.l_linestatus AS l_linestatus,
s.l_shipdate AS l_shipdate,
s.l_commitdate AS l_commitdate,
s.l_receiptdate AS l_receiptdate,
s.l_shipinstruct AS l_shipinstruct,
s.l_shipmode AS l_shipmode,
s.l_comment AS l_comment
FROM (SELECT UNNEST(lineitem_array) s FROM parquet_scan('__TEST_DIR__/lineitem.parquet'));
foreach tbl nation orders supplier part partsupp region customer
statement ok
COPY tpch.${tbl} TO '__TEST_DIR__/${tbl}.parquet' (FORMAT 'PARQUET', COMPRESSION 'ZSTD');
statement ok
CREATE VIEW ${tbl} AS SELECT * FROM parquet_scan('__TEST_DIR__/${tbl}.parquet');
endloop
loop i 1 9
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf0.1/q0${i}.csv
endloop
loop i 10 23
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf0.1/q${i}.csv
endloop


@@ -0,0 +1,31 @@
# name: test/sql/copy/parquet/writer/parquet_write_uhugeint.test
# description: Parquet uhugeint round trip
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE hugeints(h UHUGEINT)
statement ok
INSERT INTO hugeints VALUES (0), (1), (NULL), (1180591620717411303424)
statement ok
COPY hugeints TO '__TEST_DIR__/hugeints.parquet' (FORMAT 'parquet');
query I
SELECT * FROM '__TEST_DIR__/hugeints.parquet'
----
0
1
NULL
1180591620717411303424
query I
SELECT typeof(h) FROM '__TEST_DIR__/hugeints.parquet' LIMIT 1
----
DOUBLE


@@ -0,0 +1,75 @@
# name: test/sql/copy/parquet/writer/parquet_write_unsigned.test
# description: Parquet unsigned types round trip
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE values_UTINYINT AS SELECT d::UTINYINT d FROM (VALUES
(0), (42), (NULL), (255)) tbl (d);
statement ok
CREATE TABLE values_USMALLINT AS SELECT d::USMALLINT d FROM (VALUES
(0), (42), (NULL), (65535)) tbl (d);
statement ok
CREATE TABLE values_UINTEGER AS SELECT d::UINTEGER d FROM (VALUES
(0), (42), (NULL), (4294967295)) tbl (d);
statement ok
CREATE TABLE values_UBIGINT AS SELECT d::UBIGINT d FROM (VALUES
(0), (42), (NULL), (18446744073709551615)) tbl (d);
foreach type UTINYINT USMALLINT UINTEGER UBIGINT
statement ok
CREATE OR REPLACE TABLE unsigned(d ${type})
statement ok
INSERT INTO unsigned SELECT * FROM values_${type}
statement ok
COPY unsigned TO '__TEST_DIR__/unsigned.parquet' (FORMAT 'parquet');
query I
SELECT * FROM '__TEST_DIR__/unsigned.parquet' EXCEPT SELECT * FROM unsigned
----
query I
SELECT * FROM unsigned EXCEPT SELECT * FROM '__TEST_DIR__/unsigned.parquet'
----
query I
SELECT * FROM '__TEST_DIR__/unsigned.parquet' WHERE d=42
----
42
query I
SELECT COUNT(*) FROM '__TEST_DIR__/unsigned.parquet' WHERE d>42
----
1
query I
SELECT COUNT(*) FROM '__TEST_DIR__/unsigned.parquet' WHERE d>=42
----
2
query I
SELECT COUNT(*) FROM '__TEST_DIR__/unsigned.parquet' WHERE d<42
----
1
query I
SELECT COUNT(*) FROM '__TEST_DIR__/unsigned.parquet' WHERE d<=42
----
2
query I
SELECT typeof(d)='${type}' FROM '__TEST_DIR__/unsigned.parquet' LIMIT 1
----
true
endloop


@@ -0,0 +1,82 @@
# name: test/sql/copy/parquet/writer/parquet_write_uuid.test
# description: Parquet UUID round trip
# group: [writer]
statement ok
SET default_null_order='nulls_first';
require parquet
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE IF NOT EXISTS uuid (u uuid);
statement ok
INSERT INTO uuid VALUES
('A0EEBC99-9C0B-4EF8-BB6D-6BB9BD380A11'),
(NULL),
('47183823-2574-4bfd-b411-99ed177d3e43'),
('{10203040506070800102030405060708}'),
('A0EEBC99-9C0B-4EF8-BB6D-6BB9BD380A11'),
(NULL),
('00112233-4455-6677-8899-aabbccddeeff'),
('47183823-2574-4bfd-b411-99ed177d3e43'),
('{10203040506070800102030405060708}'),
('00000000-0000-0000-0000-000000000000'),
('00000000-0000-0000-0000-000000000001'),
('00000000-0000-0000-8000-000000000001'),
('80000000-0000-0000-0000-000000000000'),
('80000000-0000-0000-8000-000000000000'),
('80000000-0000-0000-8fff-ffffffffffff'),
('80000000-0000-0000-ffff-ffffffffffff'),
('8fffffff-ffff-ffff-0000-000000000000'),
('8fffffff-ffff-ffff-8000-000000000000'),
('8fffffff-ffff-ffff-8fff-ffffffffffff'),
('8fffffff-ffff-ffff-ffff-ffffffffffff'),
('ffffffff-ffff-ffff-ffff-ffffffffffff');
statement ok
COPY uuid TO '__TEST_DIR__/uuid.parquet'
query I
SELECT * FROM '__TEST_DIR__/uuid.parquet' ORDER BY 1
----
NULL
NULL
00000000-0000-0000-0000-000000000000
00000000-0000-0000-0000-000000000001
00000000-0000-0000-8000-000000000001
00112233-4455-6677-8899-aabbccddeeff
10203040-5060-7080-0102-030405060708
10203040-5060-7080-0102-030405060708
47183823-2574-4bfd-b411-99ed177d3e43
47183823-2574-4bfd-b411-99ed177d3e43
80000000-0000-0000-0000-000000000000
80000000-0000-0000-8000-000000000000
80000000-0000-0000-8fff-ffffffffffff
80000000-0000-0000-ffff-ffffffffffff
8fffffff-ffff-ffff-0000-000000000000
8fffffff-ffff-ffff-8000-000000000000
8fffffff-ffff-ffff-8fff-ffffffffffff
8fffffff-ffff-ffff-ffff-ffffffffffff
a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11
a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11
ffffffff-ffff-ffff-ffff-ffffffffffff
query IIII
SELECT stats_min, stats_max, stats_min_value, stats_max_value FROM parquet_metadata('__TEST_DIR__/uuid.parquet')
----
00000000-0000-0000-0000-000000000000 ffffffff-ffff-ffff-ffff-ffffffffffff 00000000-0000-0000-0000-000000000000 ffffffff-ffff-ffff-ffff-ffffffffffff
statement ok
CREATE TABLE uuid2 AS SELECT uuid '47183823-2574-4bfd-b411-99ed177d3e43' uuid_val union all select uuid '00112233-4455-6677-8899-aabbccddeeff';
statement ok
COPY uuid2 TO '__TEST_DIR__/uuid2.parquet'
query IIII
SELECT stats_min, stats_max, stats_min_value, stats_max_value FROM parquet_metadata('__TEST_DIR__/uuid2.parquet')
----
00112233-4455-6677-8899-aabbccddeeff 47183823-2574-4bfd-b411-99ed177d3e43 00112233-4455-6677-8899-aabbccddeeff 47183823-2574-4bfd-b411-99ed177d3e43


@@ -0,0 +1,40 @@
# name: test/sql/copy/parquet/writer/parquet_zstd_sequence.test_slow
# description: Test writing of large blobs into parquet files
# group: [writer]
require parquet
require 64bit
statement ok
COPY (SELECT * FROM read_csv_auto('data/csv/sequences.csv.gz', delim=',', header=True) LIMIT 25000) TO '__TEST_DIR__/duckseq.parquet' (FORMAT 'PARQUET', CODEC 'ZSTD', ROW_GROUP_SIZE 25000);
query IIIIII
select count(*), min(strain), max(strain), min(strlen(sequence)), max(strlen(sequence)), avg(strlen(sequence))
from '__TEST_DIR__/duckseq.parquet';
----
25000 AUS/NT01/2020 canine/HKG/20-03695/2020 17340 30018 29855.647080
statement ok
COPY
(
SELECT lstrain::VARCHAR[] lstrain, lsequence::VARCHAR[] lsequence FROM (VALUES ([], []), (NULL, NULL), ([], [])) tbl(lstrain, lsequence)
UNION ALL
SELECT * FROM (
SELECT LIST(strain) AS lstrain, LIST(sequence) AS lsequence FROM '__TEST_DIR__/duckseq.parquet' LIMIT 10000
)
UNION ALL
SELECT * FROM (VALUES ([], []), (NULL, NULL), ([], []))
)
TO '__TEST_DIR__/duckseq2.parquet' (FORMAT 'PARQUET', CODEC 'ZSTD');
query I
SELECT COUNT(*) FROM '__TEST_DIR__/duckseq2.parquet'
----
7
query IIIIII nosort querylabel
select count(*), min(strain), max(strain), min(strlen(sequence)), max(strlen(sequence)), avg(strlen(sequence))
from (SELECT UNNEST(lstrain) AS strain, UNNEST(lsequence) AS sequence FROM '__TEST_DIR__/duckseq2.parquet');
----
100000 ARG/Cordoba-1006-155/2020 tiger/NY/040420/2020 17340 30643 29821.264410


@@ -0,0 +1,24 @@
# name: test/sql/copy/parquet/writer/partition_without_hive.test
# description: Test writing partitioned files WITHOUT hive partitioning
# group: [writer]
require parquet
statement ok
CREATE TABLE t1(part_key INT, val INT);
statement ok
INSERT INTO t1 SELECT i%2, i FROM range(10) t(i);
statement ok
COPY t1 TO '__TEST_DIR__/hive_filters' (FORMAT PARQUET, PARTITION_BY part_key, HIVE_FILE_PATTERN false, WRITE_PARTITION_COLUMNS true);
query I
SELECT file.replace('__TEST_DIR__', '').replace('\', '/') FROM GLOB('__TEST_DIR__/hive_filters/*.parquet') ORDER BY ALL
----
/hive_filters/data_0.parquet
/hive_filters/data_1.parquet
query II
FROM '__TEST_DIR__/hive_filters/*.parquet' EXCEPT ALL FROM t1
----


@@ -0,0 +1,106 @@
# name: test/sql/copy/parquet/writer/row_group_size_bytes.test
# description: Parquet writer ROW_GROUP_SIZE_BYTES tests
# group: [writer]
require parquet
require vector_size 1024
# different vector sizes result in different numbers of rows
require no_vector_verification
statement ok
SET preserve_insertion_order=false
statement error
copy (select 42) to '__TEST_DIR__/tbl.parquet' (ROW_GROUP_SIZE_BYTES)
----
# we can use human-readable size limits
statement ok
copy (
select range c0,
range c1,
range c2,
range c3,
range c4,
range c5,
range c6,
range c7,
from range(50000)
) to '__TEST_DIR__/tbl.parquet' (ROW_GROUP_SIZE_BYTES '1mb')
query T
select max(row_group_num_rows) from parquet_metadata('__TEST_DIR__/tbl.parquet')
----
16384
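# (annotation added for clarity) a plausible accounting, assuming the writer buffers
# 2048-row chunks as the string comment further down suggests: 8 BIGINT columns
# * 8 bytes * 2048 rows = 131072 bytes per chunk, and a row group is cut once a chunk
# pushes the total past the limit -> 8 chunks = 16384 rows for '1mb', and
# 4 chunks = 8192 rows for the 500000-byte case below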
# also check that row_group_bytes is actually populated in the metadata
query T
select min(row_group_bytes) != 0 from parquet_metadata('__TEST_DIR__/tbl.parquet')
----
1
# plain integer values also work
# we set the byte limit to half as big, and we get a max row group size of half what we had before
statement ok
copy (
select range c0,
range c1,
range c2,
range c3,
range c4,
range c5,
range c6,
range c7,
from range(50000)
) to '__TEST_DIR__/tbl.parquet' (ROW_GROUP_SIZE_BYTES 500000)
query T
select max(row_group_num_rows) from parquet_metadata('__TEST_DIR__/tbl.parquet')
----
8192
# both limits are checked, so we should get row groups of 10240 rows even though we set a 1GB byte limit
statement ok
copy (
select range c0,
range c1,
range c2,
range c3,
range c4,
range c5,
range c6,
range c7,
from range(50000)
) to '__TEST_DIR__/tbl.parquet' (ROW_GROUP_SIZE 10000, ROW_GROUP_SIZE_BYTES '1GB')
query T
select max(row_group_num_rows) from parquet_metadata('__TEST_DIR__/tbl.parquet')
----
10240
# these strings take around 16 + 50 = 66 bytes per string, so 2048 * 66 = 135168 bytes per chunk
# a 200000-byte limit is exceeded after two chunks, so we should get row groups of 2 * 2048 = 4096 rows
statement ok
copy (
select range || repeat('0', 50) c0
from range(50000)
) to '__TEST_DIR__/tbl.parquet' (ROW_GROUP_SIZE_BYTES 200000)
query T
select max(row_group_num_rows) from parquet_metadata('__TEST_DIR__/tbl.parquet')
----
4096
# if we set it to 650000, the limit is exceeded after five chunks, so we should get row groups of 5 * 2048 = 10240 rows
statement ok
copy (
select range || repeat('0', 50) c0
from range(50000)
) to '__TEST_DIR__/tbl.parquet' (ROW_GROUP_SIZE_BYTES 650000)
query T
select max(row_group_num_rows) from parquet_metadata('__TEST_DIR__/tbl.parquet')
----
10240


@@ -0,0 +1,74 @@
# name: test/sql/copy/parquet/writer/skip_empty_write.test
# description: Parquet writer WRITE_EMPTY_FILE false option
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE empty_tbl(i INT, j VARCHAR);
statement ok
CREATE TABLE tbl AS FROM range(10000) t(i) UNION ALL SELECT 100000
# basic usage
statement ok
copy (select 42 where 42=84) to '__TEST_DIR__/empty.parquet' (WRITE_EMPTY_FILE false)
query I
SELECT COUNT(*) FROM glob('__TEST_DIR__/empty.parquet')
----
0
foreach preserve_order true false
statement ok
SET preserve_insertion_order=${preserve_order}
# no file name returned
query IIIIII
copy (select 42 where 42=84) to '__TEST_DIR__/empty.parquet' (WRITE_EMPTY_FILE false, RETURN_STATS)
----
# now with a table
query IIIIII
copy empty_tbl to '__TEST_DIR__/empty.parquet' (WRITE_EMPTY_FILE false, RETURN_STATS)
----
query II
copy empty_tbl to '__TEST_DIR__/empty.parquet' (WRITE_EMPTY_FILE false, RETURN_FILES)
----
0 []
query IIIIII
copy (from tbl where i = 20000) to '__TEST_DIR__/empty.parquet' (WRITE_EMPTY_FILE false, RETURN_STATS)
----
endloop
# write_empty_file with file_size_bytes
query I
copy (select 42 where 42=84) to '__TEST_DIR__/empty_file_size_bytes/' (FORMAT PARQUET, WRITE_EMPTY_FILE false, FILENAME_PATTERN '{uuidv7}.parquet', FILE_SIZE_BYTES 128)
----
0
query I
SELECT COUNT(*) FROM glob('__TEST_DIR__/empty_file_size_bytes/*.parquet')
----
0
statement ok
copy tbl to '__TEST_DIR__/empty_row_groups_per_file.parquet' (WRITE_EMPTY_FILE false, ROW_GROUPS_PER_FILE 1)
# these combinations are not allowed
statement error
copy empty_tbl to '__TEST_DIR__/empty.parquet' (WRITE_EMPTY_FILE false, PARTITION_BY (i))
----
Can't combine
statement error
copy tbl to '__TEST_DIR__/empty.parquet' (WRITE_EMPTY_FILE false, PER_THREAD_OUTPUT)
----
Can't combine


@@ -0,0 +1,75 @@
# name: test/sql/copy/parquet/writer/test_copy_overwrite_parquet.test
# description: Test copy statement with file overwrite on parquet
# group: [writer]
require parquet
# create a table and insert some values
statement ok
CREATE TABLE test (a INTEGER, b VARCHAR(10));
statement ok
INSERT INTO test VALUES (1, 'hello'), (2, 'world '), (3, ' xx');
query IT
SELECT * FROM test ORDER BY 1;
----
1 hello
2 world
3 xx
# copy to the parquet file
query I
COPY test TO '__TEST_DIR__/overwrite.parquet' (FORMAT PARQUET)
----
3
# now copy to the file again
query I
COPY (SELECT * FROM test LIMIT 2) TO '__TEST_DIR__/overwrite.parquet' (FORMAT PARQUET);
----
2
# reload the data from the file: it should only have two rows
statement ok
DELETE FROM test;
query I
COPY test FROM '__TEST_DIR__/overwrite.parquet' (FORMAT PARQUET);
----
2
query IT
SELECT * FROM test ORDER BY 1;
----
1 hello
2 world
# test that a query returning an error does not export to the file
statement error
COPY (SELECT i FROM range(1) tbl(i) UNION ALL SELECT concat('hello', i)::INT i FROM range(1) tbl(i)) to '__TEST_DIR__/overwrite.parquet' (FORMAT PARQUET);
----
statement ok
DELETE FROM test;
query I
COPY test FROM '__TEST_DIR__/overwrite.parquet' (FORMAT PARQUET);
----
2
# this test should still pass as data was not overwritten
query IT
SELECT * FROM test ORDER BY 1;
----
1 hello
2 world
# Test the USE_TMP_FILE flag with parquet: without a tmp file the failing COPY writes directly to the target, leaving it unreadable
statement error
COPY (SELECT i FROM range(1) tbl(i) UNION ALL SELECT concat('hello', i)::INT i FROM range(1) tbl(i)) to '__TEST_DIR__/overwrite.parquet' (FORMAT PARQUET, USE_TMP_FILE FALSE);
----
statement error
SELECT * FROM '__TEST_DIR__/overwrite.parquet';
----


@@ -0,0 +1,37 @@
# name: test/sql/copy/parquet/writer/test_parquet_write.test
# description: Parquet basic write
# group: [writer]
require parquet
statement ok
COPY (SELECT 42) TO '__TEST_DIR__/scalar.parquet' (FORMAT 'parquet');
query I
SELECT * FROM parquet_scan('__TEST_DIR__/scalar.parquet');
----
42
# empty result set, single thread
statement ok
CREATE TABLE empty(i INTEGER)
statement ok
COPY (SELECT * FROM empty) TO '__TEST_DIR__/empty.parquet' (FORMAT 'parquet')
query I
SELECT COUNT(*) FROM parquet_scan('__TEST_DIR__/empty.parquet')
----
0
statement ok
SET threads=4;
# empty result set, multi thread
statement ok
COPY (SELECT * FROM empty) TO '__TEST_DIR__/empty_multithread' (FORMAT 'parquet', PER_THREAD_OUTPUT True)
query I
SELECT COUNT(*) FROM parquet_scan('__TEST_DIR__/empty_multithread/*.parquet')
----
0


@@ -0,0 +1,77 @@
# name: test/sql/copy/parquet/writer/test_parquet_write_complex.test
# description: Parquet read and re-write various files
# group: [writer]
require parquet
# alltypes_dictionary: scan as parquet
query I nosort alltypes_dictionary
SELECT * FROM parquet_scan('data/parquet-testing/arrow/alltypes_dictionary.parquet');
----
# rewrite the file
statement ok
COPY (SELECT * FROM parquet_scan('data/parquet-testing/arrow/alltypes_dictionary.parquet')) TO '__TEST_DIR__/alltypes_dictionary.parquet' (FORMAT 'PARQUET')
# verify that the rewritten file has the same values again
query I nosort alltypes_dictionary
SELECT * FROM parquet_scan('__TEST_DIR__/alltypes_dictionary.parquet');
----
# bug687_nulls.parquet
query I nosort bug687_nulls
SELECT * FROM parquet_scan('data/parquet-testing/bug687_nulls.parquet') LIMIT 10;
----
statement ok
COPY (SELECT * FROM parquet_scan('data/parquet-testing/bug687_nulls.parquet')) TO '__TEST_DIR__/bug687_nulls.parquet' (FORMAT 'PARQUET')
query I nosort bug687_nulls
SELECT * FROM parquet_scan('__TEST_DIR__/bug687_nulls.parquet') LIMIT 10;
----
# Issue #1637: booleans encoded incorrectly
statement ok
COPY (SELECT true as x UNION ALL SELECT true) TO '__TEST_DIR__/bug1637_booleans.parquet' (FORMAT 'PARQUET');
# Prior to the #1637 fix, DuckDB wrote a parquet file containing true, false
query I
SELECT COUNT(*) FROM parquet_scan('__TEST_DIR__/bug1637_booleans.parquet') WHERE x;
----
2
# userdata1.parquet
query I nosort userdata1.parquet
SELECT * FROM parquet_scan('data/parquet-testing/userdata1.parquet') ORDER BY 1 LIMIT 10;
----
statement ok
COPY (SELECT * FROM parquet_scan('data/parquet-testing/userdata1.parquet')) TO '__TEST_DIR__/userdata1.parquet' (FORMAT 'PARQUET')
query I nosort userdata1.parquet
SELECT * FROM parquet_scan('__TEST_DIR__/userdata1.parquet') ORDER BY 1 LIMIT 10;
----
# gzip codec
statement ok
COPY (SELECT * FROM parquet_scan('data/parquet-testing/userdata1.parquet')) TO '__TEST_DIR__/userdata1-gzip.parquet' (FORMAT 'PARQUET', CODEC 'GZIP')
query I nosort userdata1.parquet
SELECT * FROM parquet_scan('__TEST_DIR__/userdata1-gzip.parquet') ORDER BY 1 LIMIT 10;
----
# uncompressed codec
statement ok
COPY (SELECT * FROM parquet_scan('data/parquet-testing/userdata1.parquet')) TO '__TEST_DIR__/userdata1-uncompressed.parquet' (FORMAT 'PARQUET', CODEC 'UNCOMPRESSED')
query I nosort userdata1.parquet
SELECT * FROM parquet_scan('__TEST_DIR__/userdata1-uncompressed.parquet') ORDER BY 1 LIMIT 10;
----
# zstd codec
statement ok
COPY (SELECT * FROM parquet_scan('data/parquet-testing/userdata1.parquet')) TO '__TEST_DIR__/userdata1-zstd.parquet' (FORMAT 'PARQUET', CODEC 'ZSTD')
query I nosort userdata1.parquet
SELECT * FROM parquet_scan('__TEST_DIR__/userdata1-zstd.parquet') ORDER BY 1 LIMIT 10;
----


@@ -0,0 +1,22 @@
# name: test/sql/copy/parquet/writer/write_big_list.test_slow
# description: Parquet write big list
# group: [writer]
require parquet
# big list (> vector size)
statement ok
CREATE TABLE big_list AS SELECT LIST(CASE WHEN i%2=0 THEN NULL ELSE i END) l FROM range(20000) tbl(i);
query I
SELECT SUM(i) FROM (SELECT UNNEST(l) FROM big_list) t(i)
----
100000000
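# (annotation added for clarity) the odd values below 20000 are the first 10000
# odd numbers, whose sum is 10000^2 = 100000000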
statement ok
COPY big_list TO '__TEST_DIR__/big_list.parquet' (FORMAT 'parquet');
query I
SELECT SUM(i) FROM (SELECT UNNEST(l) FROM '__TEST_DIR__/big_list.parquet') t(i)
----
100000000


@@ -0,0 +1,137 @@
# name: test/sql/copy/parquet/writer/write_complex_nested.test
# description: Parquet write complex structures
# group: [writer]
require parquet
# struct of lists
statement ok
CREATE TABLE struct_of_lists AS SELECT * FROM (VALUES
({'a': [1, 2, 3], 'b': ['hello', 'world']}),
({'a': [4, NULL, 5], 'b': ['duckduck', 'goose']}),
({'a': NULL, 'b': ['longlonglonglonglonglong', NULL, NULL]}),
(NULL),
({'a': [], 'b': []}),
({'a': [1, 2, 3], 'b': NULL})
) tbl(i);
statement ok
COPY struct_of_lists TO '__TEST_DIR__/complex_list.parquet' (FORMAT 'parquet');
query I
SELECT i FROM parquet_scan('__TEST_DIR__/complex_list.parquet');
----
{'a': [1, 2, 3], 'b': [hello, world]}
{'a': [4, NULL, 5], 'b': [duckduck, goose]}
{'a': NULL, 'b': [longlonglonglonglonglong, NULL, NULL]}
NULL
{'a': [], 'b': []}
{'a': [1, 2, 3], 'b': NULL}
# list of structs
statement ok
CREATE TABLE list_of_structs AS SELECT * FROM (VALUES
([{'a': 1, 'b': 100}, NULL, {'a': 2, 'b': 101}]),
(NULL),
([]),
([{'a': NULL, 'b': 102}, {'a': 3, 'b': NULL}, NULL])
) tbl(i);
statement ok
COPY list_of_structs TO '__TEST_DIR__/complex_list.parquet' (FORMAT 'parquet');
query I
SELECT i FROM parquet_scan('__TEST_DIR__/complex_list.parquet');
----
[{'a': 1, 'b': 100}, NULL, {'a': 2, 'b': 101}]
NULL
[]
[{'a': NULL, 'b': 102}, {'a': 3, 'b': NULL}, NULL]
# list of structs of structs
statement ok
CREATE TABLE list_of_struct_of_structs AS SELECT * FROM (VALUES
([{'a': {'x': 33}, 'b': {'y': 42, 'z': 99}}, NULL, {'a': {'x': NULL}, 'b': {'y': 43, 'z': 100}}]),
(NULL),
([]),
([{'a': NULL, 'b': {'y': NULL, 'z': 101}}, {'a': {'x': 34}, 'b': {'y': 43, 'z': NULL}}]),
([{'a': NULL, 'b': NULL}])
) tbl(i);
statement ok
COPY list_of_struct_of_structs TO '__TEST_DIR__/complex_list.parquet' (FORMAT 'parquet');
query I
SELECT i FROM parquet_scan('__TEST_DIR__/complex_list.parquet');
----
[{'a': {'x': 33}, 'b': {'y': 42, 'z': 99}}, NULL, {'a': {'x': NULL}, 'b': {'y': 43, 'z': 100}}]
NULL
[]
[{'a': NULL, 'b': {'y': NULL, 'z': 101}}, {'a': {'x': 34}, 'b': {'y': 43, 'z': NULL}}]
[{'a': NULL, 'b': NULL}]
# list of lists
# no empty lists or nulls
statement ok
CREATE TABLE list_of_lists_simple AS SELECT * FROM (VALUES
([[1, 2, 3], [4, 5]]),
([[6, 7]]),
([[8, 9, 10], [11, 12]])
) tbl(i);
statement ok
COPY list_of_lists_simple TO '__TEST_DIR__/complex_list.parquet' (FORMAT 'parquet');
query I
SELECT i FROM parquet_scan('__TEST_DIR__/complex_list.parquet');
----
[[1, 2, 3], [4, 5]]
[[6, 7]]
[[8, 9, 10], [11, 12]]
# list of lists with nulls and empty lists
statement ok
CREATE TABLE list_of_lists AS SELECT * FROM (VALUES
([[1, 2, 3], [4, 5], [], [6, 7]]),
([[8, NULL, 10], NULL, []]),
([]),
(NULL),
([[11, 12, 13, 14], [], NULL, [], [], [15], [NULL, NULL, NULL]])
) tbl(i);
statement ok
COPY list_of_lists TO '__TEST_DIR__/complex_list.parquet' (FORMAT 'parquet');
query I
SELECT i FROM parquet_scan('__TEST_DIR__/complex_list.parquet');
----
[[1, 2, 3], [4, 5], [], [6, 7]]
[[8, NULL, 10], NULL, []]
[]
NULL
[[11, 12, 13, 14], [], NULL, [], [], [15], [NULL, NULL, NULL]]
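
# UNNEST should peel off exactly one level of nesting from the written file;
# a sketch counting the inner lists (4 + 3 + 0 + 0 + 7, where NULL and empty
# outer lists contribute no rows)
query I
SELECT COUNT(*) FROM (SELECT UNNEST(i) FROM parquet_scan('__TEST_DIR__/complex_list.parquet')) t(x)
----
14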
# list of lists of lists of lists
statement ok
CREATE TABLE list_of_lists_of_lists_of_lists AS
SELECT [LIST(i)] i FROM list_of_lists
UNION ALL
SELECT NULL
UNION ALL
SELECT [NULL]
UNION ALL
SELECT [[], NULL, [], []]
UNION ALL
SELECT [[[NULL, NULL, [NULL]], NULL, [[], [7, 8, 9], [NULL], NULL, []]], [], [NULL]]
statement ok
COPY list_of_lists_of_lists_of_lists TO '__TEST_DIR__/complex_list.parquet' (FORMAT 'parquet');
query I
SELECT i FROM parquet_scan('__TEST_DIR__/complex_list.parquet');
----
[[[[1, 2, 3], [4, 5], [], [6, 7]], [[8, NULL, 10], NULL, []], [], NULL, [[11, 12, 13, 14], [], NULL, [], [], [15], [NULL, NULL, NULL]]]]
NULL
[NULL]
[[], NULL, [], []]
[[[NULL, NULL, [NULL]], NULL, [[], [7, 8, 9], [NULL], NULL, []]], [], [NULL]]
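
# the four-level list type itself should survive; a hypothetical check,
# assuming the usual INTEGER[][]... rendering of nested list types
query I
SELECT DISTINCT typeof(i) FROM parquet_scan('__TEST_DIR__/complex_list.parquet');
----
INTEGER[][][][]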

View File

@@ -0,0 +1,75 @@
# name: test/sql/copy/parquet/writer/write_list.test
# description: Parquet write list
# group: [writer]
require parquet
# standard list
statement ok
CREATE TABLE list AS SELECT * FROM (VALUES
([1, 2, 3]),
([4, 5]),
([6, 7]),
([8, 9, 10, 11])
) tbl(i);
statement ok
COPY list TO '__TEST_DIR__/test_list.parquet' (FORMAT 'parquet');
query I
SELECT i FROM parquet_scan('__TEST_DIR__/test_list.parquet');
----
[1, 2, 3]
[4, 5]
[6, 7]
[8, 9, 10, 11]
# empty and NULL lists
statement ok
CREATE TABLE null_empty_list AS SELECT * FROM (VALUES
([1, 2, 3]),
([4, 5]),
([6, 7]),
([NULL]),
([]),
([]),
([]),
([]),
([8, NULL, 10, 11]),
(NULL)
) tbl(i);
statement ok
COPY null_empty_list TO '__TEST_DIR__/test_list.parquet' (FORMAT 'parquet');
query I
SELECT * FROM parquet_scan('__TEST_DIR__/test_list.parquet');
----
[1, 2, 3]
[4, 5]
[6, 7]
[NULL]
[]
[]
[]
[]
[8, NULL, 10, 11]
NULL
# empty list
statement ok
COPY (SELECT []::INT[]) TO '__TEST_DIR__/test_empty_list.parquet' (FORMAT 'parquet');
query I
SELECT * FROM '__TEST_DIR__/test_empty_list.parquet'
----
[]
# null list
statement ok
COPY (SELECT NULL::INT[]) TO '__TEST_DIR__/test_null_list.parquet' (FORMAT 'parquet');
query I
SELECT * FROM '__TEST_DIR__/test_null_list.parquet'
----
NULL
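
# the column should stay a typed list even though its only value is NULL;
# sketched with typeof, assuming the usual INTEGER[] rendering
query I
SELECT typeof(i) FROM '__TEST_DIR__/test_null_list.parquet' t(i)
----
INTEGER[]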

View File

@@ -0,0 +1,102 @@
# name: test/sql/copy/parquet/writer/write_map.test
# description: Write maps
# group: [writer]
require parquet
# int -> int map
statement ok
CREATE TABLE int_maps(m MAP(INTEGER,INTEGER));
statement ok
INSERT INTO int_maps VALUES
(MAP([42, 84], [1, 2])),
(MAP([101, 201, 301], [3, NULL, 5])),
(MAP([55, 66, 77], [6, 7, NULL]))
;
statement ok
COPY int_maps TO '__TEST_DIR__/int_map.parquet' (FORMAT PARQUET)
query I
SELECT * FROM '__TEST_DIR__/int_map.parquet'
----
{42=1, 84=2}
{101=3, 201=NULL, 301=5}
{55=6, 66=7, 77=NULL}
statement error
INSERT INTO int_maps VALUES
(MAP([NULL], [NULL]))
;
----
<REGEX>:.*Invalid Input Error: Map keys can not be NULL.*
# copying a table that does not exist yet fails with a catalog error (string_map is created below)
statement error
COPY string_map TO '__TEST_DIR__/int_maps.parquet' (FORMAT PARQUET)
----
<REGEX>:.*Catalog Error.*does not exist!.*
# string -> string map
statement ok
CREATE TABLE string_map(m MAP(VARCHAR,VARCHAR));
statement ok
INSERT INTO string_map VALUES
(MAP(['key1', 'key2'], ['value1', 'value2'])),
(MAP(['best band', 'best boyband', 'richest person'], ['Tenacious D', 'Backstreet Boys', 'Jon Lajoie'])),
(MAP([], [])),
(NULL),
(MAP(['option'], [NULL]))
;
statement ok
COPY string_map TO '__TEST_DIR__/string_map.parquet' (FORMAT PARQUET)
query I
SELECT * FROM '__TEST_DIR__/string_map.parquet'
----
{key1=value1, key2=value2}
{best band=Tenacious D, best boyband=Backstreet Boys, richest person=Jon Lajoie}
{}
NULL
{option=NULL}
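
# entry counts should survive the round trip; a sketch assuming cardinality()
# is defined for MAP values, as in current DuckDB
query I
SELECT cardinality(m) FROM '__TEST_DIR__/string_map.parquet'
----
2
3
0
NULL
1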
statement error
INSERT INTO string_map VALUES
(MAP([NULL], [NULL]))
;
----
<REGEX>:.*Invalid Input Error: Map keys can not be NULL.*
# list -> list map
statement ok
CREATE TABLE list_map(m MAP(INT[],INT[]));
statement ok
INSERT INTO list_map VALUES
(MAP([[1, 2, 3], [], [4, 5]], [[6, 7, 8], NULL, [NULL]])),
(MAP([], [])),
(MAP([[1]], [NULL])),
(MAP([[10, 12, 14, 16, 18, 20], []], [[1], [2]]))
;
statement ok
COPY list_map TO '__TEST_DIR__/list_map.parquet' (FORMAT PARQUET)
query I
SELECT * FROM '__TEST_DIR__/list_map.parquet'
----
{[1, 2, 3]=[6, 7, 8], []=NULL, [4, 5]=[NULL]}
{}
{[1]=NULL}
{[10, 12, 14, 16, 18, 20]=[1], []=[2]}
# map keys cannot be NULL
statement error
INSERT INTO list_map VALUES
(MAP([NULL], [NULL]))
;
----
<REGEX>:.*Invalid Input Error: Map keys can not be NULL.*

View File

@@ -0,0 +1,35 @@
# name: test/sql/copy/parquet/writer/write_stats_big_string.test
# description: We avoid writing min/max stats of large strings
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification;
statement ok
CREATE TABLE varchar(v VARCHAR);
statement ok
INSERT INTO varchar VALUES (NULL), ('hello'), (NULL), ('world'), (NULL)
# we write stats when there are only small strings
statement ok
COPY varchar TO '__TEST_DIR__/bigvarchar.parquet'
query IIIIII
SELECT stats_min_value, stats_max_value, stats_min, stats_max, min_is_exact, max_is_exact FROM parquet_metadata('__TEST_DIR__/bigvarchar.parquet')
----
hello world hello world true true
# we truncate stats of large strings
statement ok
INSERT INTO varchar SELECT repeat('A', 100000) v
statement ok
COPY varchar TO '__TEST_DIR__/bigvarchar.parquet'
query IIIIII
SELECT stats_min_value, stats_max_value, stats_min, stats_max, min_is_exact, max_is_exact FROM parquet_metadata('__TEST_DIR__/bigvarchar.parquet')
----
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA world AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA world false true
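
# the truncated minimum should be far shorter than the 100000-character
# original; a rough check (the exact truncation length is an implementation
# detail, so we only assert an upper bound)
query I
SELECT strlen(stats_min) < 1000 FROM parquet_metadata('__TEST_DIR__/bigvarchar.parquet')
----
true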

View File

@@ -0,0 +1,226 @@
# name: test/sql/copy/parquet/writer/write_stats_min_max.test_slow
# description: Write min/max stats to Parquet files
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification;
statement ok
PRAGMA explain_output = OPTIMIZED_ONLY;
statement ok
CREATE TABLE boolean_limits AS SELECT (false)::BOOLEAN min, true::BOOLEAN max
statement ok
CREATE TABLE tinyint_limits AS SELECT (-128)::TINYINT min, 127::TINYINT max
statement ok
CREATE TABLE smallint_limits AS SELECT (-32768)::SMALLINT min, 32767::SMALLINT max
statement ok
CREATE TABLE integer_limits AS SELECT (-2147483648)::INTEGER min, 2147483647::INTEGER max
statement ok
CREATE TABLE bigint_limits AS SELECT (-9223372036854775808)::BIGINT min, 9223372036854775807::BIGINT max
statement ok
CREATE TABLE float_limits AS SELECT (-0.5)::FLOAT min, 0.5::FLOAT max
statement ok
CREATE TABLE double_limits AS SELECT (-0.5)::DOUBLE min, 0.5::DOUBLE max
statement ok
CREATE TABLE varchar_limits AS SELECT 'hello world 👤🏠📕' min, 'look at my ducks 🦆🦆🦆' max;
statement ok
CREATE TABLE blob_limits AS SELECT blob '\x00hello\x00world\x00' min, blob '\x00look\x00at\x00my\x00nullbytes\x00' max;
statement ok
CREATE TABLE date_limits AS SELECT date '1900-01-01' min, date '2030-12-31' max;
statement ok
CREATE TABLE time_limits AS SELECT time '00:00:00' min, time '23:59:59' max;
statement ok
CREATE TABLE timestamp_limits AS SELECT timestamp '1900-01-01 00:00:00' min, timestamp '2030-12-31 23:59:59' max;
statement ok
CREATE TABLE timestamp_s_limits AS SELECT '1900-01-01 00:00:00'::timestamp_s min, '2030-12-31 23:59:59'::timestamp_s max;
statement ok
CREATE TABLE timestamp_ms_limits AS SELECT '1900-01-01 00:00:00'::timestamp_ms min, '2030-12-31 23:59:59'::timestamp_ms max;
statement ok
CREATE TABLE timestamp_ns_limits AS SELECT '1900-01-01 00:00:00'::timestamp_ns min, '2030-12-31 23:59:59'::timestamp_ns max;
# min/max/min_value/max_value for signed tables
foreach type date time timestamp timestamp_s timestamp_ms timestamp_ns varchar blob boolean tinyint smallint integer bigint float double
statement ok
CREATE TABLE tbl(i ${type});
# empty stats (all values are NULL)
statement ok
INSERT INTO tbl SELECT NULL
statement ok
COPY tbl TO '__TEST_DIR__/${type}_stats.parquet' (FORMAT PARQUET);
query IIII
SELECT stats_min_value::${type}, stats_max_value::${type}, stats_min::${type}, stats_max::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet')
----
NULL NULL NULL NULL
# min/max stats
statement ok
INSERT INTO tbl SELECT min FROM ${type}_limits
statement ok
INSERT INTO tbl SELECT max FROM ${type}_limits
statement ok
COPY tbl TO '__TEST_DIR__/${type}_stats.parquet' (FORMAT PARQUET);
query I
SELECT stats_min_value::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet') EXCEPT SELECT min FROM ${type}_limits
----
query I
SELECT stats_max_value::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet') EXCEPT SELECT max FROM ${type}_limits
----
query I
SELECT stats_min::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet') EXCEPT SELECT min FROM ${type}_limits
----
query I
SELECT stats_max::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet') EXCEPT SELECT max FROM ${type}_limits
----
statement ok
DROP TABLE tbl
endloop
statement ok
CREATE TABLE utinyint_limits AS SELECT (0)::UTINYINT min, 255::UTINYINT max
statement ok
CREATE TABLE usmallint_limits AS SELECT (0)::USMALLINT min, 65535::USMALLINT max
statement ok
CREATE TABLE uinteger_limits AS SELECT 0::UINTEGER min, 4294967295::UINTEGER max
statement ok
CREATE TABLE ubigint_limits AS SELECT 0::UBIGINT min, 18446744073709551615::UBIGINT max
# unsigned types only define min_value/max_value
foreach type utinyint usmallint uinteger ubigint
statement ok
CREATE TABLE tbl(i ${type});
# empty stats (all values are NULL)
statement ok
INSERT INTO tbl SELECT NULL
statement ok
COPY tbl TO '__TEST_DIR__/${type}_stats.parquet' (FORMAT PARQUET);
query I
SELECT stats_min_value::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet')
----
NULL
query I
SELECT stats_max_value::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet')
----
NULL
query I
SELECT stats_min::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet')
----
NULL
query I
SELECT stats_max::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet')
----
NULL
# min/max stats
statement ok
INSERT INTO tbl SELECT min FROM ${type}_limits
statement ok
INSERT INTO tbl SELECT max FROM ${type}_limits
statement ok
COPY tbl TO '__TEST_DIR__/${type}_stats.parquet' (FORMAT PARQUET);
query I
SELECT stats_min_value::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet') EXCEPT SELECT min FROM ${type}_limits
----
query I
SELECT stats_max_value::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet') EXCEPT SELECT max FROM ${type}_limits
----
query I
SELECT stats_min::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet')
----
NULL
query I
SELECT stats_max::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet')
----
NULL
statement ok
DROP TABLE tbl
endloop
# no stats for these types
statement ok
CREATE TABLE hugeint_limits AS SELECT (-170141183460469231731687303715884105728)::HUGEINT min, 170141183460469231731687303715884105727::HUGEINT max
foreach type hugeint
statement ok
CREATE TABLE tbl(i ${type});
statement ok
INSERT INTO tbl SELECT min FROM ${type}_limits
statement ok
INSERT INTO tbl SELECT max FROM ${type}_limits
statement ok
COPY tbl TO '__TEST_DIR__/${type}_stats.parquet' (FORMAT PARQUET);
query I
SELECT stats_min_value::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet')
----
NULL
query I
SELECT stats_max_value::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet')
----
NULL
query I
SELECT stats_min::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet')
----
NULL
query I
SELECT stats_max::${type} FROM parquet_metadata('__TEST_DIR__/${type}_stats.parquet')
----
NULL
statement ok
DROP TABLE tbl
endloop

View File

@@ -0,0 +1,96 @@
# name: test/sql/copy/parquet/writer/write_stats_null_count.test
# description: Write null_count stats to Parquet files
# group: [writer]
require parquet
statement ok
PRAGMA enable_verification;
statement ok
PRAGMA explain_output = OPTIMIZED_ONLY;
# null count
statement ok
COPY (SELECT 42 i) TO '__TEST_DIR__/stats.parquet' (FORMAT PARQUET);
query I
SELECT stats_null_count FROM parquet_metadata('__TEST_DIR__/stats.parquet')
----
0
# we can filter out the IS NULL clause if there are no NULL values
query II
EXPLAIN SELECT COUNT(*) FROM '__TEST_DIR__/stats.parquet' WHERE i IS NULL
----
logical_opt <!REGEX>:.*IS.*NULL.*
query I
SELECT COUNT(*) FROM '__TEST_DIR__/stats.parquet' WHERE i IS NULL
----
0
statement ok
COPY (SELECT NULL i) TO '__TEST_DIR__/stats.parquet' (FORMAT PARQUET);
query I
SELECT stats_null_count FROM parquet_metadata('__TEST_DIR__/stats.parquet')
----
1
# we can also filter out the IS NULL clause when everything is NULL
query II
EXPLAIN SELECT COUNT(*) FROM '__TEST_DIR__/stats.parquet' WHERE i IS NULL
----
logical_opt <!REGEX>:.*IS.*NULL.*
query I
SELECT COUNT(*) FROM '__TEST_DIR__/stats.parquet' WHERE i IS NULL
----
1
statement ok
COPY (SELECT * FROM VALUES (42), (NULL) tbl(i)) TO '__TEST_DIR__/stats.parquet' (FORMAT PARQUET);
# we cannot filter out the IS NULL clause when there are mixed NULL/valid
query II
EXPLAIN SELECT COUNT(*) FROM '__TEST_DIR__/stats.parquet' WHERE i IS NULL
----
logical_opt <REGEX>:.*IS.*NULL.*
query I
SELECT COUNT(*) FROM '__TEST_DIR__/stats.parquet' WHERE i IS NULL
----
1
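
# the complementary IS NOT NULL count, as an extra sanity check
query I
SELECT COUNT(*) FROM '__TEST_DIR__/stats.parquet' WHERE i IS NOT NULL
----
1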
# list null count not supported (i.e. we don't write the null count in this case)
statement ok
COPY (SELECT [42, NULL, 43] i) TO '__TEST_DIR__/stats.parquet' (FORMAT PARQUET);
query I
SELECT stats_null_count FROM parquet_metadata('__TEST_DIR__/stats.parquet')
----
NULL
statement ok
COPY (SELECT {'a': NULL, 'b': 42} i) TO '__TEST_DIR__/stats.parquet' (FORMAT PARQUET);
query I
SELECT stats_null_count FROM parquet_metadata('__TEST_DIR__/stats.parquet')
----
1
0
# struct null count is propagated to the children
# i.e. if a struct itself is null, this counts as NULL for the children
statement ok
CREATE TABLE structs AS SELECT {'a': NULL, 'b': 'hello'} i UNION ALL SELECT NULL UNION ALL SELECT {'a': 84, 'b': 'world'};
statement ok
COPY structs TO '__TEST_DIR__/stats.parquet' (FORMAT PARQUET);
query I
SELECT stats_null_count FROM parquet_metadata('__TEST_DIR__/stats.parquet')
----
2
1
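
# which leaf column each count belongs to can be read from parquet_metadata
# too; a hypothetical look (the rendering of nested paths may vary by version,
# so we only assert that the query succeeds)
statement ok
SELECT path_in_schema, stats_null_count FROM parquet_metadata('__TEST_DIR__/stats.parquet');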

View File

@@ -0,0 +1,139 @@
# name: test/sql/copy/parquet/writer/write_struct.test
# description: Parquet write struct
# group: [writer]
require parquet
# standard struct
statement ok
CREATE TABLE struct AS SELECT * FROM (VALUES
({'a': 42, 'b': 84}),
({'a': 33, 'b': 32}),
({'a': 42, 'b': 27})
) tbl(i);
statement ok
COPY struct TO '__TEST_DIR__/test_struct.parquet' (FORMAT 'parquet');
query I
SELECT * FROM parquet_scan('__TEST_DIR__/test_struct.parquet');
----
{'a': 42, 'b': 84}
{'a': 33, 'b': 32}
{'a': 42, 'b': 27}
# struct with nulls
statement ok
CREATE TABLE struct_nulls AS SELECT * FROM (VALUES
({'a': 42, 'b': 84}),
({'a': NULL, 'b': 32}),
(NULL),
({'a': 42, 'b': NULL})
) tbl(i);
statement ok
COPY struct_nulls TO '__TEST_DIR__/test_struct_nulls.parquet' (FORMAT 'parquet');
query I
SELECT * FROM parquet_scan('__TEST_DIR__/test_struct_nulls.parquet');
----
{'a': 42, 'b': 84}
{'a': NULL, 'b': 32}
NULL
{'a': 42, 'b': NULL}
# nested structs
statement ok
CREATE TABLE struct_nested AS SELECT * FROM (VALUES
({'a': {'x': 3, 'x1': 22}, 'b': {'y': 27, 'y1': 44}}),
({'a': {'x': 9, 'x1': 26}, 'b': {'y': 1, 'y1': 999}}),
({'a': {'x': 17, 'x1': 23}, 'b': {'y': 3, 'y1': 9999}})
) tbl(i);
statement ok
COPY struct_nested TO '__TEST_DIR__/struct_nested.parquet' (FORMAT 'parquet');
query I
SELECT * FROM parquet_scan('__TEST_DIR__/struct_nested.parquet');
----
{'a': {'x': 3, 'x1': 22}, 'b': {'y': 27, 'y1': 44}}
{'a': {'x': 9, 'x1': 26}, 'b': {'y': 1, 'y1': 999}}
{'a': {'x': 17, 'x1': 23}, 'b': {'y': 3, 'y1': 9999}}
# nested structs with NULLs
statement ok
CREATE TABLE struct_nested_null AS SELECT * FROM (VALUES
({'a': {'x': 3, 'x1': 22}, 'b': {'y': NULL, 'y1': 44}}),
({'a': {'x': NULL, 'x1': 26}, 'b': {'y': 1, 'y1': NULL}}),
({'a': {'x': 17, 'x1': NULL}, 'b': {'y': 3, 'y1': 9999}}),
(NULL),
({'a': NULL, 'b': NULL})
) tbl(i);
statement ok
COPY struct_nested_null TO '__TEST_DIR__/struct_nested_null.parquet' (FORMAT 'parquet');
query I
SELECT * FROM parquet_scan('__TEST_DIR__/struct_nested_null.parquet');
----
{'a': {'x': 3, 'x1': 22}, 'b': {'y': NULL, 'y1': 44}}
{'a': {'x': NULL, 'x1': 26}, 'b': {'y': 1, 'y1': NULL}}
{'a': {'x': 17, 'x1': NULL}, 'b': {'y': 3, 'y1': 9999}}
NULL
{'a': NULL, 'b': NULL}
# single struct
statement ok
CREATE TABLE single_struct AS SELECT * FROM (VALUES
({'a': 42}),
({'a': 33}),
({'a': 42})
) tbl(i);
statement ok
COPY single_struct TO '__TEST_DIR__/single_struct.parquet' (FORMAT 'parquet');
query I
SELECT * FROM parquet_scan('__TEST_DIR__/single_struct.parquet');
----
{'a': 42}
{'a': 33}
{'a': 42}
# single struct nulls
statement ok
CREATE TABLE single_struct_null AS SELECT * FROM (VALUES
({'a': 42}),
({'a': NULL}),
(NULL)
) tbl(i);
statement ok
COPY single_struct_null TO '__TEST_DIR__/single_struct_null.parquet' (FORMAT 'parquet');
query I
SELECT * FROM parquet_scan('__TEST_DIR__/single_struct_null.parquet');
----
{'a': 42}
{'a': NULL}
NULL
# nested single struct
statement ok
CREATE TABLE nested_single_struct AS SELECT * FROM (VALUES
({'a': {'b': 42}}),
({'a': {'b': NULL}}),
({'a': NULL}),
(NULL)
) tbl(i);
statement ok
COPY nested_single_struct TO '__TEST_DIR__/nested_single_struct.parquet' (FORMAT 'parquet');
query I
SELECT * FROM parquet_scan('__TEST_DIR__/nested_single_struct.parquet');
----
{'a': {'b': 42}}
{'a': {'b': NULL}}
{'a': NULL}
NULL
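
# nested field extraction on the round-tripped file, sketched with bracket
# notation; NULL propagates through each missing level
query I
SELECT i['a']['b'] FROM parquet_scan('__TEST_DIR__/nested_single_struct.parquet');
----
42
NULL
NULL
NULL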

View File

@@ -0,0 +1,35 @@
# name: test/sql/copy/parquet/writer/writer_round_trip.test_slow
# description: Parquet read and re-write various files
# group: [writer]
require parquet
foreach parquet_file data/parquet-testing/manyrowgroups.parquet data/parquet-testing/map.parquet data/parquet-testing/arrow/int32_decimal.parquet data/parquet-testing/arrow/nonnullable.impala.parquet data/parquet-testing/bug687_nulls.parquet data/parquet-testing/bug1554.parquet data/parquet-testing/apkwan.parquet data/parquet-testing/arrow/nested_lists.snappy.parquet data/parquet-testing/arrow/nulls.snappy.parquet data/parquet-testing/nan-float.parquet data/parquet-testing/manyrowgroups2.parquet data/parquet-testing/struct.parquet data/parquet-testing/arrow/list_columns.parquet data/parquet-testing/timestamp-ms.parquet data/parquet-testing/arrow/alltypes_dictionary.parquet data/parquet-testing/arrow/binary.parquet data/parquet-testing/arrow/nation.dict-malformed.parquet data/parquet-testing/lineitem-top10000.gzip.parquet data/parquet-testing/arrow/nested_maps.snappy.parquet data/parquet-testing/arrow/dict-page-offset-zero.parquet data/parquet-testing/silly-names.parquet data/parquet-testing/zstd.parquet data/parquet-testing/bug1618_struct_strings.parquet data/parquet-testing/arrow/single_nan.parquet data/parquet-testing/arrow/int64_decimal.parquet data/parquet-testing/filter_bug1391.parquet data/parquet-testing/arrow/fixed_length_decimal_legacy.parquet data/parquet-testing/timestamp.parquet data/parquet-testing/arrow/fixed_length_decimal.parquet data/parquet-testing/leftdate3_192_loop_1.parquet data/parquet-testing/blob.parquet data/parquet-testing/bug1588.parquet data/parquet-testing/bug1589.parquet data/parquet-testing/arrow/alltypes_plain.parquet data/parquet-testing/arrow/repeated_no_annotation.parquet data/parquet-testing/data-types.parquet data/parquet-testing/unsigned.parquet data/parquet-testing/pandas-date.parquet data/parquet-testing/date.parquet data/parquet-testing/arrow/nullable.impala.parquet data/parquet-testing/fixed.parquet data/parquet-testing/arrow/alltypes_plain.snappy.parquet data/parquet-testing/decimal/int32_decimal.parquet data/parquet-testing/decimal/pandas_decimal.parquet data/parquet-testing/decimal/decimal_dc.parquet data/parquet-testing/decimal/int64_decimal.parquet data/parquet-testing/decimal/fixed_length_decimal_legacy.parquet data/parquet-testing/decimal/fixed_length_decimal.parquet data/parquet-testing/glob2/t1.parquet data/parquet-testing/cache/cache1.parquet data/parquet-testing/cache/cache2.parquet data/parquet-testing/glob/t2.parquet data/parquet-testing/glob/t1.parquet data/parquet-testing/bug2557.parquet
statement ok
CREATE TABLE parquet_read AS SELECT * FROM parquet_scan('${parquet_file}');
statement ok
COPY parquet_read TO '__TEST_DIR__/test_round_trip.parquet'
statement ok
CREATE TABLE parquet_write AS SELECT * FROM parquet_scan('__TEST_DIR__/test_round_trip.parquet');
# verify that the count is the same
query I
SELECT COUNT(*) FROM parquet_read EXCEPT SELECT COUNT(*) FROM parquet_write
----
# verify that the data is the same
query I
SELECT COUNT(*) FROM (SELECT * FROM parquet_read EXCEPT SELECT * FROM parquet_write)
----
0
statement ok
DROP TABLE parquet_read
statement ok
DROP TABLE parquet_write
endloop