should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion


@@ -0,0 +1,30 @@
# name: test/sql/copy/parquet/afl.test
# description: Read afl-generated parquet files
# group: [parquet]
mode skip
require parquet
statement ok
PRAGMA enable_verification
foreach i 1 2 6
statement error
select * from parquet_scan('data/parquet-testing/afl/${i}.parquet')
----
Invalid dictionary page header
endloop
foreach i 3 4 5 7
statement error
select * from parquet_scan('data/parquet-testing/afl/${i}.parquet')
----
Invalid Error: Parquet file is likely corrupted, missing dictionary
endloop


@@ -0,0 +1,32 @@
# name: test/sql/copy/parquet/alltypes-dictionaries.test
# group: [parquet]
require parquet
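# for each type: write a dictionary-friendly column, then check the encoding, a lookup, and the bloom filter metadata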
foreach type TINYINT SMALLINT INTEGER BIGINT HUGEINT UTINYINT USMALLINT UINTEGER UBIGINT UHUGEINT FLOAT DOUBLE VARCHAR
statement ok
copy (select (r1.range * 10)::${type} r from range(10) r1, range(1000) r2) to '__TEST_DIR__/dict-${type}.parquet' (row_group_size 2048);
query I
select first(encodings) from parquet_metadata('__TEST_DIR__/dict-${type}.parquet') group by encodings;
----
RLE_DICTIONARY
query I
SELECT COUNT(*) from '__TEST_DIR__/dict-${type}.parquet' WHERE r='20'
----
1000
query III
select column_id, BOOL_AND(bloom_filter_offset > 4), BOOL_AND(bloom_filter_length > 1) from parquet_metadata('__TEST_DIR__/dict-${type}.parquet') group by column_id order by column_id;
----
0 true true
#query I
#SELECT bloom_filter_excludes FROM parquet_bloom_probe('__TEST_DIR__/dict-${type}.parquet', 'r', '11');
#----
#true
endloop


@@ -0,0 +1,31 @@
# name: test/sql/copy/parquet/attach_parquet.test
# description: Attach a Parquet file
# group: [parquet]
require parquet
require skip_reload
statement ok
COPY (SELECT 42 val) TO '__TEST_DIR__/file.parquet';
statement ok
ATTACH '__TEST_DIR__/file.parquet' AS attached_parquet
statement ok
USE attached_parquet
query I
SELECT * FROM file
----
42
query I
SELECT * FROM attached_parquet
----
42
statement error
ATTACH 'duckdb:__TEST_DIR__/file.parquet' AS duck_attach
----
not a valid DuckDB database file


@@ -0,0 +1,14 @@
# name: test/sql/copy/parquet/auto_glob_directory.test
# description: Test auto globbing a directory
# group: [parquet]
require parquet
statement ok
COPY (SELECT i%2 AS grp, i FROM range(1000) t(i)) TO '__TEST_DIR__/partitioned_glob.parquet' (PARTITION_BY (grp));
query II
SELECT grp, COUNT(*) FROM '__TEST_DIR__/partitioned_glob.parquet' GROUP BY ALL ORDER BY ALL
----
0 500
1 500


@@ -0,0 +1,13 @@
# name: test/sql/copy/parquet/aws2.test
# description: Read a file created by AWS (#3981)
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
query I
SELECT * FROM 'data/parquet-testing/aws2.parquet'
----
READY


@@ -0,0 +1,22 @@
# name: test/sql/copy/parquet/aws_kinesis.test
# description: Read a file created by AWS Kinesis Firehose DeliveryStreams (#3981)
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
query IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
SELECT * FROM 'data/parquet-testing/aws_kinesis.parquet'
----
2022 11 22 2022-11-22 00:01:00.871 2022-11-22 00:01:01 -129 Hamburg NULL Germany 53.6304 9.98823 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL http://localhost:3000/ / t=tr&ts=1669075260871&u=http%253A%252F%252Flocalhost%253A3000%252F&hn=localhost&pa=%252F&en=tabVisible&pr=%257B%257D 495 200 Hit 0 3320 NULL tabVisible NULL NULL NULL de3bc04229406da23ee45e234a42a66cc542b335517ab585ca43f55cd2dcf781 3bc2c0a60f9f2dd212db07ed80e817f9dd43aa999d16b0b2b8db91ab092a8102 ab40d20596d7595049399578929ffc598abdb8f539bdfa7637cb509f8613dcc7 track
2022 11 22 2022-11-22 00:01:07.67 2022-11-22 00:01:10 -2330 Hamburg NULL Germany 53.6304 9.98823 NULL NULL Chrome 107.0.0.0 Mac OS 10.15.7 Europe/Berlin de-DE NULL NULL 3440x1440 1356x902 24.0 MacIntel 8.0 8.0 NULL NULL NULL NULL NULL http://localhost:3000/azure /azure t=pv&ts=1669075267670&u=http%253A%252F%252Flocalhost%253A3000%252Fazure&hn=localhost&pa=%252Fazure&ua=Mozilla%252F5.0%2520(Macintosh%253B%2520Intel%2520Mac%2520OS%2520X%252010_15_7)%2520AppleWebKit%252F537.36%2520(KHTML%252C%2520like%2520Gecko)%2520Chrome%252F107.0.0.0%2520Safari%252F537.36&iw=1356&ih=902&ti=Map%2520the%2520Cloud&w=3440&h=1440&d=24&l=de-DE&p=MacIntel&m=8&c=8&tz=Europe%252FBerlin 291 200 Hit 1 3320 NULL NULL NULL NULL NULL cb7736f1c3ce9b9a21bb9a7b17edfd9507e7c6261ded2a8ebc7f19189d6de8c6 4e5aea60a9b73aa0f2aa69967e9de9a58b3341d5bc342f94d2eef07859b658da ab40d20596d7595049399578929ffc598abdb8f539bdfa7637cb509f8613dcc7 pageview
2022 11 22 2022-11-22 00:01:13.175 2022-11-22 00:01:16 -2825 Hamburg NULL Germany 53.6304 9.98823 NULL NULL Chrome 107.0.0.0 Mac OS 10.15.7 Europe/Berlin de-DE NULL NULL 3440x1440 1356x902 24.0 MacIntel 8.0 8.0 NULL NULL NULL NULL NULL http://localhost:3000/aws /aws t=pv&ts=1669075273175&u=http%253A%252F%252Flocalhost%253A3000%252Faws&hn=localhost&pa=%252Faws&ua=Mozilla%252F5.0%2520(Macintosh%253B%2520Intel%2520Mac%2520OS%2520X%252010_15_7)%2520AppleWebKit%252F537.36%2520(KHTML%252C%2520like%2520Gecko)%2520Chrome%252F107.0.0.0%2520Safari%252F537.36&iw=1356&ih=902&ti=Map%2520the%2520Cloud%2520-%2520Azure%2520Services%2520%2526%2520Regions&w=3440&h=1440&d=24&l=de-DE&p=MacIntel&m=8&c=8&tz=Europe%252FBerlin 315 200 Hit 0 3320 NULL NULL NULL NULL NULL 4326dbc4bbfbef6aec0584b3d6437625551ab22323ed0f81ff79ab54bcfb97db cf7ee5dae81cbe75b0e78aaf200b8f1fb93349dc33fe65d3623df79ff31c53fd ab40d20596d7595049399578929ffc598abdb8f539bdfa7637cb509f8613dcc7 pageview
2022 11 22 2022-11-22 00:01:20.2 2022-11-22 00:01:21 -800 Hamburg NULL Germany 53.6304 9.98823 NULL NULL Chrome 107.0.0.0 Mac OS 10.15.7 Europe/Berlin de-DE NULL NULL 3440x1440 1356x902 24.0 MacIntel 8.0 8.0 NULL NULL NULL NULL NULL http://localhost:3000/googlecloud /googlecloud t=pv&ts=1669075280200&u=http%253A%252F%252Flocalhost%253A3000%252Fgooglecloud&hn=localhost&pa=%252Fgooglecloud&ua=Mozilla%252F5.0%2520(Macintosh%253B%2520Intel%2520Mac%2520OS%2520X%252010_15_7)%2520AppleWebKit%252F537.36%2520(KHTML%252C%2520like%2520Gecko)%2520Chrome%252F107.0.0.0%2520Safari%252F537.36&iw=1356&ih=902&ti=Map%2520the%2520Cloud%2520-%2520AWS%2520Services%2520%2526%2520Regions&w=3440&h=1440&d=24&l=de-DE&p=MacIntel&m=8&c=8&tz=Europe%252FBerlin 325 200 Hit 0 3320 NULL NULL NULL NULL NULL 949aa1b5dc5869b315686bb8ab1e211469af10ad8c5cc23c9fec3a43b3f39e5d ee17f38d66e51fa2647ad07e21c847d82221baad23a587a51868d9ee5bf642f4 ab40d20596d7595049399578929ffc598abdb8f539bdfa7637cb509f8613dcc7 pageview
query IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
SELECT * FROM 'data/parquet-testing/aws_kinesis.parquet' WHERE event_timestamp=TIMESTAMP '2022-11-22 00:01:13.175';
----
2022 11 22 2022-11-22 00:01:13.175 2022-11-22 00:01:16 -2825 Hamburg NULL Germany 53.6304 9.98823 NULL NULL Chrome 107.0.0.0 Mac OS 10.15.7 Europe/Berlin de-DE NULL NULL 3440x1440 1356x902 24.0 MacIntel 8.0 8.0 NULL NULL NULL NULL NULL http://localhost:3000/aws /aws t=pv&ts=1669075273175&u=http%253A%252F%252Flocalhost%253A3000%252Faws&hn=localhost&pa=%252Faws&ua=Mozilla%252F5.0%2520(Macintosh%253B%2520Intel%2520Mac%2520OS%2520X%252010_15_7)%2520AppleWebKit%252F537.36%2520(KHTML%252C%2520like%2520Gecko)%2520Chrome%252F107.0.0.0%2520Safari%252F537.36&iw=1356&ih=902&ti=Map%2520the%2520Cloud%2520-%2520Azure%2520Services%2520%2526%2520Regions&w=3440&h=1440&d=24&l=de-DE&p=MacIntel&m=8&c=8&tz=Europe%252FBerlin 315 200 Hit 0 3320 NULL NULL NULL NULL NULL 4326dbc4bbfbef6aec0584b3d6437625551ab22323ed0f81ff79ab54bcfb97db cf7ee5dae81cbe75b0e78aaf200b8f1fb93349dc33fe65d3623df79ff31c53fd ab40d20596d7595049399578929ffc598abdb8f539bdfa7637cb509f8613dcc7 pageview


@@ -0,0 +1,28 @@
# name: test/sql/copy/parquet/batched_write/batch_memory_usage.test_slow
# description: Batched Parquet write memory usage
# group: [batched_write]
require parquet
set seed 0.72
statement ok
COPY (SELECT uuid()::VARCHAR as varchar, uuid() AS uuid FROM range(10000000) t(i)) TO '__TEST_DIR__/random_uuids.parquet'
# copy from one parquet file to another in a memory constrained environment
statement ok
SET memory_limit='650MB'
statement ok
COPY '__TEST_DIR__/random_uuids.parquet' TO '__TEST_DIR__/random_uuids_copy.parquet';
# reset the memory limit before comparing the files
statement ok
SET memory_limit='2GB';
# ensure the parquet files hold the same content in the same order
query III
SELECT *, row_number() OVER () as rownum FROM '__TEST_DIR__/random_uuids.parquet'
EXCEPT
SELECT *, row_number() OVER () as rownum FROM '__TEST_DIR__/random_uuids_copy.parquet';
----


@@ -0,0 +1,40 @@
# name: test/sql/copy/parquet/batched_write/batch_memory_usage_mixed_batches.test_slow
# description: Batched Parquet write memory usage with mixed batches
# group: [batched_write]
require parquet
statement ok
COPY (FROM range(100000) tbl(i)) TO '__TEST_DIR__/mem_usage_mix_batches_small.parquet' (ROW_GROUP_SIZE 5000)
statement ok
COPY (FROM range(100000, 400000) tbl(i)) TO '__TEST_DIR__/mem_usage_mix_batches_large.parquet' (ROW_GROUP_SIZE 200000)
statement ok
COPY (FROM range(400000, 700000) tbl(i)) TO '__TEST_DIR__/mem_usage_mix_batches_odd.parquet' (ROW_GROUP_SIZE 999)
statement ok
COPY (FROM range(700000, 1000000) tbl(i)) TO '__TEST_DIR__/mem_usage_mix_batches_odd_again.parquet' (ROW_GROUP_SIZE 99979)
statement ok
CREATE VIEW v1 AS SELECT * FROM parquet_scan([
'__TEST_DIR__/mem_usage_mix_batches_small.parquet',
'__TEST_DIR__/mem_usage_mix_batches_large.parquet',
'__TEST_DIR__/mem_usage_mix_batches_odd.parquet',
'__TEST_DIR__/mem_usage_mix_batches_odd_again.parquet'])
statement ok
SET memory_limit='500MB'
statement ok
COPY v1 TO '__TEST_DIR__/mem_usage_mix_result.parquet'
# ensure the parquet files hold the same content in the same order
statement ok
SET memory_limit='2GB';
query II
SELECT *, row_number() OVER () as rownum FROM v1
EXCEPT
SELECT *, row_number() OVER () as rownum FROM '__TEST_DIR__/mem_usage_mix_result.parquet';
----


@@ -0,0 +1,16 @@
# name: test/sql/copy/parquet/batched_write/batch_memory_usage_small.test_slow
# description: Batched Parquet write memory usage
# group: [batched_write]
require parquet
set seed 0.72
statement ok
COPY (SELECT uuid()::VARCHAR as varchar, uuid() AS uuid FROM range(10000000) t(i)) TO '__TEST_DIR__/random_uuids.parquet'
statement ok
SET memory_limit='750MB'
statement ok
COPY '__TEST_DIR__/random_uuids.parquet' TO '__TEST_DIR__/random_uuids_copy.parquet';


@@ -0,0 +1,88 @@
# name: test/sql/copy/parquet/batched_write/batched_parquet_write.test_slow
# description: Batched copy to file
# group: [batched_write]
require parquet
statement ok
CREATE TABLE integers AS SELECT i, i // 5 AS j FROM range(1000000) t(i);
statement ok
COPY integers TO '__TEST_DIR__/batched_integers.parquet';
statement ok
CREATE TABLE integers_copied AS FROM '__TEST_DIR__/batched_integers.parquet'
query IIIII
SELECT SUM(i), SUM(j), COUNT(*), COUNT(i), COUNT(j) FROM integers_copied
----
499999500000 99999500000 1000000 1000000 1000000
query II
SELECT * FROM integers_copied ORDER BY i LIMIT 5
----
0 0
1 0
2 0
3 0
4 0
query II
SELECT * FROM integers_copied ORDER BY i LIMIT 5 OFFSET 99997
----
99997 19999
99998 19999
99999 19999
100000 20000
100001 20000
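# verify the copied rows are in strictly increasing order (no row is <= its predecessor)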
query II
SELECT * FROM integers_copied QUALIFY i<=lag(i) over ()
----
# now with filters
statement ok
CREATE VIEW v1 AS SELECT * FROM integers WHERE (i%2=0 AND i<300000) OR (i BETWEEN 500000 AND 700000)
statement ok
COPY v1 TO '__TEST_DIR__/batched_integers_filters.parquet';
statement ok
CREATE TABLE integers_filtered AS FROM '__TEST_DIR__/batched_integers_filters.parquet'
foreach table v1 integers_filtered
query IIIII
SELECT SUM(i), SUM(j), COUNT(*), COUNT(i), COUNT(j) FROM ${table}
----
142500450000 28499950000 350001 350001 350001
query II
SELECT * FROM ${table} ORDER BY i LIMIT 5
----
0 0
2 0
4 0
6 1
8 1
query II
SELECT * FROM ${table} ORDER BY i LIMIT 5 OFFSET 99997
----
199994 39998
199996 39999
199998 39999
200000 40000
200002 40000
query II
SELECT * FROM ${table} ORDER BY i LIMIT 5 OFFSET 300000
----
650000 130000
650001 130000
650002 130000
650003 130000
650004 130000
endloop


@@ -0,0 +1,34 @@
# name: test/sql/copy/parquet/batched_write/lineitem_memory_usage.test_slow
# description: Batched lineitem write memory usage
# group: [batched_write]
require parquet
require tpch
load __TEST_DIR__/lineitem_memory_test.db
statement ok
CALL dbgen(sf=1)
statement ok
COPY lineitem TO '__TEST_DIR__/lineitem_memory_usage.parquet'
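# restart so the generated tables are no longer buffered in memory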
restart
# copy from one parquet file to another in a memory constrained environment
statement ok
SET memory_limit='500MB'
statement ok
COPY '__TEST_DIR__/lineitem_memory_usage.parquet' TO '__TEST_DIR__/lineitem_memory_usage_copy.parquet';
# ensure the parquet files hold the same content in the same order
statement ok
SET memory_limit='2GB';
query IIIIIIIIIIIIIIIII
SELECT *, row_number() OVER () as rownum FROM '__TEST_DIR__/lineitem_memory_usage.parquet'
EXCEPT
SELECT *, row_number() OVER () as rownum FROM '__TEST_DIR__/lineitem_memory_usage_copy.parquet';
----


@@ -0,0 +1,33 @@
# name: test/sql/copy/parquet/batched_write/parquet_verify_row_group_size.test_slow
# description: Verify row group size is respected
# group: [batched_write]
require parquet
loop i 0 2
foreach row_group_size 777 9999 83838 143431 333333
statement ok
CREATE TABLE integers AS SELECT i, i // 5 AS j FROM range(1000000) t(i);
statement ok
COPY integers TO '__TEST_DIR__/row_group_size.parquet' (ROW_GROUP_SIZE ${row_group_size});
statement ok
select row_group_num_rows from parquet_metadata('__TEST_DIR__/row_group_size.parquet');
query I
select abs(median(row_group_num_rows)-${row_group_size})<2048 from parquet_metadata('__TEST_DIR__/row_group_size.parquet');
----
true
statement ok
DROP TABLE integers
endloop
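# run the second iteration of the outer loop single-threaded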
statement ok
SET threads=1
endloop


@@ -0,0 +1,181 @@
# name: test/sql/copy/parquet/batched_write/parquet_write_mixed_batches.test_slow
# description: Test batch Parquet write with mixed batches
# group: [batched_write]
require parquet
statement ok
COPY (FROM range(100000) tbl(i)) TO '__TEST_DIR__/mix_batches_small.parquet' (ROW_GROUP_SIZE 5000)
statement ok
COPY (FROM range(100000, 400000) tbl(i)) TO '__TEST_DIR__/mix_batches_large.parquet' (ROW_GROUP_SIZE 200000)
statement ok
COPY (FROM range(400000, 700000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd.parquet' (ROW_GROUP_SIZE 999)
statement ok
COPY (FROM range(700000, 1000000) tbl(i)) TO '__TEST_DIR__/mix_batches_odd_again.parquet' (ROW_GROUP_SIZE 99979)
# create views that read the batches
statement ok
CREATE VIEW v1 AS SELECT * FROM parquet_scan(['__TEST_DIR__/mix_batches_small.parquet', '__TEST_DIR__/mix_batches_large.parquet', '__TEST_DIR__/mix_batches_odd.parquet', '__TEST_DIR__/mix_batches_odd_again.parquet'])
statement ok
CREATE VIEW v2 AS FROM v1 WHERE (i//10000)%2=0;
statement ok
CREATE VIEW v3 AS FROM v1 WHERE (i//10000)%2=0 OR (i>200000 AND i < 400000) OR (i>600000 AND i < 800000);
# empty table
statement ok
CREATE VIEW v4 AS FROM v1 WHERE i>998 AND i<1000 AND i%2=0
loop i 0 2
query I
COPY v1 TO '__TEST_DIR__/mixed_batches_v1.parquet'
----
1000000
query I
CREATE TABLE mixed_batches_v1 AS FROM '__TEST_DIR__/mixed_batches_v1.parquet'
----
1000000
foreach table v1 mixed_batches_v1
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
499999500000 0 999999 1000000 1000000
query I
SELECT * FROM ${table} LIMIT 5 OFFSET 99998
----
99998
99999
100000
100001
100002
endloop
# now do the same, but filter out half of the values
query I
COPY v2 TO '__TEST_DIR__/mixed_batches_v2.parquet'
----
500000
query I
CREATE TABLE mixed_batches_v2 AS FROM '__TEST_DIR__/mixed_batches_v2.parquet'
----
500000
foreach table v2 mixed_batches_v2
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
247499750000 0 989999 500000 500000
query I
SELECT * FROM ${table} LIMIT 5 OFFSET 99998
----
189998
189999
200000
200001
200002
endloop
# do it again, but this time only filter out SOME small batches
query I
COPY v3 TO '__TEST_DIR__/mixed_batches_v3.parquet'
----
700000
query I
CREATE TABLE mixed_batches_v3 AS FROM '__TEST_DIR__/mixed_batches_v3.parquet'
----
700000
foreach table v3 mixed_batches_v3
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
348499650000 0 989999 700000 700000
query I
SELECT * FROM ${table} LIMIT 5 OFFSET 9999
----
9999
20000
20001
20002
20003
endloop
# now with an empty table
query I
COPY v4 TO '__TEST_DIR__/mixed_batches_v4.parquet'
----
0
query I
CREATE TABLE mixed_batches_v4 AS FROM '__TEST_DIR__/mixed_batches_v4.parquet'
----
0
foreach table v4 mixed_batches_v4
query IIIII
SELECT SUM(i), MIN(i), MAX(i), COUNT(i), COUNT(*) FROM ${table}
----
NULL NULL NULL 0 0
query I
SELECT * FROM ${table} LIMIT 5
----
endloop
statement ok
DROP TABLE mixed_batches_v1
statement ok
DROP TABLE mixed_batches_v2
statement ok
DROP TABLE mixed_batches_v3
statement ok
DROP TABLE mixed_batches_v4
# Drop the VIEWs that depend on V1
statement ok
DROP VIEW IF EXISTS v2
statement ok
DROP VIEW IF EXISTS v3
statement ok
DROP VIEW IF EXISTS v4
# create views that read the batches using unions
statement ok
CREATE OR REPLACE VIEW v1 AS FROM '__TEST_DIR__/mix_batches_small.parquet' UNION ALL FROM '__TEST_DIR__/mix_batches_large.parquet' UNION ALL FROM '__TEST_DIR__/mix_batches_odd.parquet' UNION ALL FROM '__TEST_DIR__/mix_batches_odd_again.parquet'
statement ok
CREATE OR REPLACE VIEW v2 AS FROM v1 WHERE (i//10000)%2=0;
statement ok
CREATE OR REPLACE VIEW v3 AS FROM v1 WHERE (i//10000)%2=0 OR (i>200000 AND i < 400000) OR (i>600000 AND i < 800000);
statement ok
CREATE OR REPLACE VIEW v4 AS FROM v1 WHERE i>998 AND i<1000 AND i%2=0
endloop


@@ -0,0 +1,55 @@
# name: test/sql/copy/parquet/batched_write/tpch_sf1_parquet.test_slow
# description: Test TPC-H SF1 with Parquet
# group: [batched_write]
require tpch
require parquet
statement ok
CALL dbgen(sf=1, suffix='_original');
foreach tpch_tbl orders customer lineitem nation part partsupp region supplier
statement ok
COPY ${tpch_tbl}_original TO '__TEST_DIR__/${tpch_tbl}.parquet';
statement ok
CREATE VIEW ${tpch_tbl} AS FROM read_parquet('__TEST_DIR__/${tpch_tbl}.parquet');
endloop
# verify the data was written/read in the correct order
query IIIIIIIIIIIIIIII
select * from lineitem qualify l_orderkey<lag(l_orderkey) over ();
----
loop i 1 9
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q0${i}.csv
endloop
loop i 10 23
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q${i}.csv
endloop
query IIIIIIIIIIIIIIII
SELECT MAX(COLUMNS(*)) FROM (FROM lineitem LIMIT 100000 OFFSET 5000000)
----
5099235 199996 10000 7 50.00 104649.50 0.10 0.08 R O 1998-11-30 1998-10-30 1998-12-22 TAKE BACK RETURN TRUCK zzle. express, bold deposits was. slyly e
query IIIIIIIIIIIIIIII
select * from lineitem order by l_extendedprice desc, l_shipdate limit 2;
----
2513090 199999 5038 4 50.00 104949.50 0.02 0.04 A F 1993-10-05 1993-10-17 1993-10-28 TAKE BACK RETURN FOB - ironic, pending pinto be
82823 199998 5037 2 50.00 104899.50 0.04 0.05 A F 1992-04-30 1992-07-05 1992-05-29 COLLECT COD SHIP orbits. bold fox


@@ -0,0 +1,51 @@
# name: test/sql/copy/parquet/batched_write/varying_source_target_row_groups.test_slow
# description: Verify source-target row group size pairs
# group: [batched_write]
require parquet
statement ok
CREATE TABLE integers AS SELECT i, i // 5 AS j FROM range(1000000) t(i);
foreach src_size 777 9999 83838 143431 333333
foreach tgt_size 777 9999 83838 143431 333333
statement ok
SET threads=1
statement ok
COPY integers TO '__TEST_DIR__/src_size.parquet' (ROW_GROUP_SIZE ${src_size});
statement ok
SET threads=4
query I
select abs(median(row_group_num_rows)-${src_size})<2048 from parquet_metadata('__TEST_DIR__/src_size.parquet');
----
true
statement ok
COPY '__TEST_DIR__/src_size.parquet' TO '__TEST_DIR__/tgt_size.parquet' (ROW_GROUP_SIZE ${tgt_size});
query I
select abs(median(row_group_num_rows)-${tgt_size})<2048 from parquet_metadata('__TEST_DIR__/tgt_size.parquet');
----
true
# verify the groups are actually written in the same order and contain the same data
query III
SELECT *, row_number() OVER () FROM integers
EXCEPT
SELECT *, row_number() OVER () FROM '__TEST_DIR__/src_size.parquet'
----
query III
SELECT *, row_number() OVER () FROM '__TEST_DIR__/src_size.parquet'
EXCEPT
SELECT *, row_number() OVER () FROM '__TEST_DIR__/tgt_size.parquet'
----
endloop
endloop


@@ -0,0 +1,18 @@
# name: test/sql/copy/parquet/bigdecimal.test
# description: Read a file created by Google BigQuery with a BIGDECIMAL column (i.e. DECIMAL(77,38))
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
query I
FROM 'data/parquet-testing/bigdecimal.parquet'
----
0.5
-0.5
1.2345678912345679e+26
-1.2345678912345679e+26
5.7896044618658096e+38
-5.7896044618658096e+38


@@ -0,0 +1,293 @@
# name: test/sql/copy/parquet/bloom_filters.test
# group: [parquet]
require parquet
statement ok
copy (select
(r1.range*10)::BIGINT r,
r::smallint r_int16,
r::integer r_int32,
r::double r_double,
r::float r_float,
'string_' || r::VARCHAR r_string,
('blob_' || r::VARCHAR)::BLOB r_blob
from range(100) r1, range(1000) order by r) to '__TEST_DIR__/bloom1.parquet' (format parquet, ROW_GROUP_SIZE 10000, dictionary_size_limit 1000);
# we don't check the actual offsets since they might change due to filters being moved around
query III
select column_id, BOOL_AND(bloom_filter_offset > 4), BOOL_AND(bloom_filter_length > 1) from parquet_metadata('__TEST_DIR__/bloom1.parquet') group by column_id order by column_id;
----
0 true true
1 true true
2 true true
3 true true
4 true true
5 true true
6 true true
# this value is not in the domain but within min/max
query I
SELECT BOOL_AND(bloom_filter_excludes) FROM parquet_bloom_probe('__TEST_DIR__/bloom1.parquet', 'r', '201');
----
true
# this value is outside min/max
query I
SELECT BOOL_AND(bloom_filter_excludes) FROM parquet_bloom_probe('__TEST_DIR__/bloom1.parquet', 'r', '112121212');
----
true
statement ok
CREATE MACRO assert_bloom_filter_hit(file, col, val) AS TABLE
SELECT COUNT(*) > 0 AND COUNT(*) < MAX(row_group_id+1) FROM parquet_bloom_probe(file, col, val) WHERE NOT bloom_filter_excludes;
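# the macro passes if the value hits some row groups but is excluded from the rest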
# this in-domain value should only be in a subset of row groups since the data is ordered
query I
FROM assert_bloom_filter_hit('__TEST_DIR__/bloom1.parquet', 'r', '200');
----
true
# same check, but with a non-string probe value
query I
FROM assert_bloom_filter_hit('__TEST_DIR__/bloom1.parquet', 'r', 200);
----
true
# non-existent file
statement error
FROM assert_bloom_filter_hit('__TEST_DIR__/bloom10000.parquet', 'r', '200');
----
No files found
# non-existent column
statement error
FROM assert_bloom_filter_hit('__TEST_DIR__/bloom1.parquet', 'r2', '200');
----
Column r2 not found
# NULL colname
statement error
FROM assert_bloom_filter_hit('__TEST_DIR__/bloom1.parquet', NULL, '200');
----
Can't have NULL parameters
# NULL probe
statement error
FROM assert_bloom_filter_hit('__TEST_DIR__/bloom1.parquet', 'r', NULL);
----
Can't have NULL parameters
statement error
FROM assert_bloom_filter_hit('__TEST_DIR__/bloom1.parquet', 'r', 'a');
----
Failed to cast value
# more types
query I
FROM assert_bloom_filter_hit('__TEST_DIR__/bloom1.parquet', 'r_int16', 200);
----
true
query I
FROM assert_bloom_filter_hit('__TEST_DIR__/bloom1.parquet', 'r_int32', 200);
----
true
query I
FROM assert_bloom_filter_hit('__TEST_DIR__/bloom1.parquet', 'r_float', 200);
----
true
query I
FROM assert_bloom_filter_hit('__TEST_DIR__/bloom1.parquet', 'r_double', 200);
----
true
query I
FROM assert_bloom_filter_hit('__TEST_DIR__/bloom1.parquet', 'r_string', 'string_200');
----
true
query I
FROM assert_bloom_filter_hit('__TEST_DIR__/bloom1.parquet', 'r_blob', 'blob_200'::BLOB);
----
true
# some tests for dictionary_size_limit
# no bloom filter, dict limit too low
statement ok
copy (select (r1.range*10)::BIGINT r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom2.parquet' (format parquet, ROW_GROUP_SIZE 10000, dictionary_size_limit 10);
query III
select row_group_id, bloom_filter_offset IS NOT NULL, bloom_filter_length IS NOT NULL from parquet_metadata('__TEST_DIR__/bloom2.parquet') order by row_group_id;
----
0 false false
# no bloom filter - disabled explicitly
statement ok
copy (select (r1.range*10)::BIGINT r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/disable_bloom_filter.parquet' (format parquet, ROW_GROUP_SIZE 10000, write_bloom_filter false);
query III
select row_group_id, bloom_filter_offset IS NOT NULL, bloom_filter_length IS NOT NULL from parquet_metadata('__TEST_DIR__/disable_bloom_filter.parquet') order by row_group_id;
----
0 false false
# still no bloom filter: the limit is one below the 100 distinct values
statement ok
copy (select (r1.range*10)::BIGINT r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom3.parquet' (format parquet, ROW_GROUP_SIZE 10000, dictionary_size_limit 99);
query III
select row_group_id, bloom_filter_offset IS NOT NULL, bloom_filter_length IS NOT NULL from parquet_metadata('__TEST_DIR__/bloom3.parquet') order by row_group_id;
----
0 false false
# should have a filter here!
statement ok
copy (select (r1.range*10)::BIGINT r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom4.parquet' (format parquet, ROW_GROUP_SIZE 10000, dictionary_size_limit 100);
query III
select row_group_id, bloom_filter_offset IS NOT NULL, bloom_filter_length IS NOT NULL from parquet_metadata('__TEST_DIR__/bloom4.parquet') order by row_group_id;
----
0 true true
# should have a filter here, too
statement ok
copy (select (r1.range*10)::BIGINT r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom5.parquet' (format parquet, ROW_GROUP_SIZE 10000, dictionary_size_limit 1000);
query III
select row_group_id, bloom_filter_offset IS NOT NULL, bloom_filter_length IS NOT NULL from parquet_metadata('__TEST_DIR__/bloom5.parquet') order by row_group_id;
----
0 true true
# let's mess with the false positive ratio and measure the bloom filter size
# the default is 0.01
statement ok
copy (select (r1.range*10)::BIGINT r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom6.parquet' (format parquet, ROW_GROUP_SIZE 10000, dictionary_size_limit 1000, bloom_filter_false_positive_ratio 0.01);
query II
select row_group_id, bloom_filter_length from parquet_metadata('__TEST_DIR__/bloom6.parquet') order by row_group_id;
----
0 144
# higher prob: 0.5 should lead to a smaller filter
statement ok
copy (select (r1.range*10)::BIGINT r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom7.parquet' (format parquet, ROW_GROUP_SIZE 10000, dictionary_size_limit 1000, bloom_filter_false_positive_ratio 0.5);
query II
select row_group_id, bloom_filter_length from parquet_metadata('__TEST_DIR__/bloom7.parquet') order by row_group_id;
----
0 80
# lower prob: 0.001 should lead to a bigger filter
statement ok
copy (select (r1.range*10)::BIGINT r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom8.parquet' (format parquet, ROW_GROUP_SIZE 10000, dictionary_size_limit 1000, bloom_filter_false_positive_ratio 0.001);
query II
select row_group_id, bloom_filter_length from parquet_metadata('__TEST_DIR__/bloom8.parquet') order by row_group_id;
----
0 272
# even lower prob: 0.0001 should lead to an even bigger filter
statement ok
copy (select (r1.range*10)::BIGINT r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom8.parquet' (format parquet, ROW_GROUP_SIZE 10000, dictionary_size_limit 1000, bloom_filter_false_positive_ratio 0.0001);
query II
select row_group_id, bloom_filter_length from parquet_metadata('__TEST_DIR__/bloom8.parquet') order by row_group_id;
----
0 528
# some error cases for the new parameters
statement error
copy (select (r1.range*10)::BIGINT r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom8.parquet' (format parquet, ROW_GROUP_SIZE 10000, dictionary_size_limit -1, bloom_filter_false_positive_ratio 0.0001);
----
dictionary_size_limit must be greater than 0
statement error
copy (select (r1.range*10)::BIGINT r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom8.parquet' (format parquet, ROW_GROUP_SIZE 10000, dictionary_size_limit 1000, bloom_filter_false_positive_ratio 0);
----
bloom_filter_false_positive_ratio must be greater than 0
# some tests for string_dictionary_page_size_limit
# no bloom filter, limit too low
statement ok
copy (select (r1.range*10)::VARCHAR r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom9.parquet' (format parquet, ROW_GROUP_SIZE 10000, string_dictionary_page_size_limit 10);
query III
select row_group_id, bloom_filter_offset IS NOT NULL, bloom_filter_length IS NOT NULL from parquet_metadata('__TEST_DIR__/bloom9.parquet') order by row_group_id;
----
0 false false
# big enough
statement ok
copy (select (r1.range*10)::VARCHAR r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom9.parquet' (format parquet, ROW_GROUP_SIZE 10000, string_dictionary_page_size_limit 100000);
query III
select row_group_id, bloom_filter_offset IS NOT NULL, bloom_filter_length IS NOT NULL from parquet_metadata('__TEST_DIR__/bloom9.parquet') order by row_group_id;
----
0 true true
# too big
statement error
copy (select (r1.range*10)::VARCHAR r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom9.parquet' (format parquet, ROW_GROUP_SIZE 10000, string_dictionary_page_size_limit 4294967295);
----
Binder Error
# cannot be 0
statement error
copy (select (r1.range*10)::VARCHAR r,
from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom9.parquet' (format parquet, ROW_GROUP_SIZE 10000, string_dictionary_page_size_limit 0);
----
Binder Error
# test some repeated large strings
# this should give dictionary
statement ok
copy (select repeat('abc', 500_000) || (range % 10) s from range(100)) to '__TEST_DIR__/my.parquet';
query I
select encodings from parquet_metadata('__TEST_DIR__/my.parquet');
----
RLE_DICTIONARY
# this cannot do dictionary because the strings exceed the limit
statement ok
copy (select repeat('abc', 500_000) || (range % 10) s from range(100)) to '__TEST_DIR__/my.parquet' (STRING_DICTIONARY_PAGE_SIZE_LIMIT 4_000_000);
query I
select encodings = 'RLE_DICTIONARY' from parquet_metadata('__TEST_DIR__/my.parquet');
----
false


@@ -0,0 +1 @@
PAR1

Binary file not shown.


@@ -0,0 +1 @@
PAR1<EFBFBD><EFBFBD><EFBFBD><EFBFBD>PAR1


@@ -0,0 +1 @@
PAR1iojqerwiojqwqhqrwiuhRIUHQWriuwHQRW


@@ -0,0 +1 @@
RJIOJWRIOJQWriojqrqwJRWPAR1


@@ -0,0 +1 @@
PAR1PAR1


@@ -0,0 +1,43 @@
# name: test/sql/copy/parquet/broken_parquet.test
# description: Read several broken parquet files
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
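# each file below is corrupted in a different way; all of them must fail cleanly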
statement error
select count(*) from parquet_scan('test/sql/copy/parquet/broken/missingmagicatfront.parquet')
----
statement error
select count(*) from parquet_scan('test/sql/copy/parquet/broken/missingmagicatend.parquet')
----
statement error
select count(*) from parquet_scan('test/sql/copy/parquet/broken/firstmarker.parquet')
----
statement error
select count(*) from parquet_scan('test/sql/copy/parquet/broken/twomarkers.parquet')
----
statement error
select count(*) from parquet_scan('test/sql/copy/parquet/broken/footerlengthzero.parquet')
----
statement error
select count(*) from parquet_scan('test/sql/copy/parquet/broken/hugefooter.parquet')
----
statement error
select count(*) from parquet_scan('test/sql/copy/parquet/broken/garbledfooter.parquet')
----
mode skip
statement error
from parquet_scan('test/sql/copy/parquet/broken/broken_structure.parquet')
----
Parquet file is likely corrupted


@@ -0,0 +1,13 @@
# name: test/sql/copy/parquet/byte_stream_split.test
# description: Read a Parquet file with floats and doubles encoded using the byte stream split encoding
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
query III
SELECT * FROM 'data/parquet-testing/byte_stream_split.parquet'
----
<FILE>:data/parquet-testing/byte_stream_split.csv


@@ -0,0 +1,10 @@
# name: test/sql/copy/parquet/case_insensitive_replacement.test
# description: Issue #2543: Case insensitive replacement scan
# group: [parquet]
require parquet
query I
SELECT data FROM 'data/parquet-testing/CASE_INSENSITIVE.PARQUET'
----
\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F


@@ -0,0 +1,15 @@
# name: test/sql/copy/parquet/copy_option_suggestion.test
# description: Test suggestion of unknown copy options
# group: [parquet]
require parquet
statement error
copy (select 42) to 'file.parquet' (partition_b (a));
----
partition_by
statement error
copy (select 42) to 'file.csv' (partition_b (a));
----
partition_by


@@ -0,0 +1,61 @@
# name: test/sql/copy/parquet/copy_preserve_order.test_slow
# description: Test order preservation with the PRESERVE_ORDER flag
# group: [parquet]
require parquet
load __TEST_DIR__/insert_order_preserving.db
# test the PRESERVE_ORDER option
statement ok
SET preserve_insertion_order=false
query I
CREATE TABLE integers AS SELECT * FROM range(10000000) tbl(i);
----
10000000
query I
COPY integers TO '__TEST_DIR__/force_order_preserve.parquet' (PRESERVE_ORDER);
----
10000000
statement ok
CREATE VIEW integers2 AS FROM '__TEST_DIR__/force_order_preserve.parquet'
query I
SELECT SUM(i) FROM integers
----
49999995000000
query I
SELECT SUM(i) FROM integers2
----
49999995000000
# verify the file was written in the correct order - for this we set the preserve_insertion_order back to true
statement ok
SET preserve_insertion_order=true
query I
SELECT * FROM '__TEST_DIR__/force_order_preserve.parquet' LIMIT 5
----
0
1
2
3
4
query I
SELECT * FROM '__TEST_DIR__/force_order_preserve.parquet' LIMIT 5 OFFSET 777778
----
777778
777779
777780
777781
777782
statement error
COPY integers TO '__TEST_DIR__/force_order_preserve_2.parquet' (PRESERVE_ORDER, PARTITION_BY (i), WRITE_PARTITION_COLUMNS);
----
PRESERVE_ORDER is not supported with these parameters


@@ -0,0 +1,18 @@
# name: test/sql/copy/parquet/corrupt_stats.test
# description: Issue #14430: group by a timestamp column in a parquet file can cause the process to crash
# group: [parquet]
require parquet
statement error
SELECT a FROM 'data/parquet-testing/corrupt_stats.parquet' GROUP BY a;
----
This likely means that the statistics in your data source are corrupt
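# with the optimizer disabled the corrupt statistics are not used, so the scan succeeds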
statement ok
PRAGMA disable_optimizer
query I
SELECT a FROM 'data/parquet-testing/corrupt_stats.parquet' GROUP BY a;
----
2021-01-01 12:00:00


@@ -0,0 +1,28 @@
# name: test/sql/copy/parquet/decimal_filter.test
# description: Decimal filter pushdown into Parquet
# group: [parquet]
require parquet
statement ok
pragma enable_verification
query IIII
select * from 'data/parquet-testing/decimals.parquet'
----
0.1 0.1 0.1 0.1
-0.1 -0.1 -0.1 -0.1
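# probe each decimal column with an equality filter on both values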
loop i 1 5
query IIII
select * from 'data/parquet-testing/decimals.parquet' WHERE l${i}=0.1
----
0.1 0.1 0.1 0.1
query IIII
select * from 'data/parquet-testing/decimals.parquet' WHERE l${i}=-0.1
----
-0.1 -0.1 -0.1 -0.1
endloop

File diff suppressed because it is too large.


@@ -0,0 +1,14 @@
# name: test/sql/copy/parquet/describe_parquet.test
# description: Test DESCRIBE on a parquet file
# group: [parquet]
require parquet
query IIIIII nosort describeresult
DESCRIBE 'data/parquet-testing/delta_byte_array.parquet'
query IIIIII nosort describeresult
DESCRIBE "data/parquet-testing/delta_byte_array.parquet"
query IIIIII nosort describeresult
DESCRIBE FROM read_parquet("data/parquet-testing/delta_byte_array.parquet")


@@ -0,0 +1,76 @@
# name: test/sql/copy/parquet/dictionary_compression_ratio_threshold.test
# description: Test Parquet dictionary_compression_ratio_threshold parameter
# group: [parquet]
# the setting dictionary_compression_ratio_threshold is DEPRECATED; these tests only verify that it can still be set without error
require parquet
statement ok
CREATE TABLE test AS SELECT 'thisisaverylongstringbutitrepeatsmanytimessoitshighlycompressible' || (range % 10) i FROM range(100000)
statement ok
COPY test TO '__TEST_DIR__/dictionary_compression_ratio_threshold.parquet' (dictionary_compression_ratio_threshold -2)
# default is 1.0
statement ok
COPY test TO '__TEST_DIR__/dictionary_compression_ratio_threshold.parquet'
# dictionary compression is applied so page offset is non-null
query I
SELECT dictionary_page_offset IS NULL FROM parquet_metadata('__TEST_DIR__/dictionary_compression_ratio_threshold.parquet')
----
false
# -1 to disable
statement ok
COPY test TO '__TEST_DIR__/dictionary_compression_ratio_threshold.parquet' (dictionary_compression_ratio_threshold -1)
# ignored, still dictionary compression
query I
SELECT dictionary_page_offset IS NULL FROM parquet_metadata('__TEST_DIR__/dictionary_compression_ratio_threshold.parquet')
----
false
# the data compresses more than 10x
statement ok
COPY test TO '__TEST_DIR__/dictionary_compression_ratio_threshold.parquet' (dictionary_compression_ratio_threshold 10)
# dictionary compression should be enabled
query I
SELECT dictionary_page_offset IS NULL FROM parquet_metadata('__TEST_DIR__/dictionary_compression_ratio_threshold.parquet')
----
false
# compresses less than 20x
statement ok
COPY test TO '__TEST_DIR__/dictionary_compression_ratio_threshold.parquet' (dictionary_compression_ratio_threshold 20)
# dictionary compression still enabled, setting is deprecated
query I
SELECT dictionary_page_offset IS NULL FROM parquet_metadata('__TEST_DIR__/dictionary_compression_ratio_threshold.parquet')
----
false
# create table with all uniques
statement ok
CREATE OR REPLACE TABLE test AS SELECT 'coolstring' || range i FROM range(100000)
# all values are unique, so no dictionary is written; the deprecated setting does not change this
statement ok
COPY test TO '__TEST_DIR__/dictionary_compression_ratio_threshold.parquet'
query I
SELECT dictionary_page_offset IS NULL FROM parquet_metadata('__TEST_DIR__/dictionary_compression_ratio_threshold.parquet')
----
true
# a threshold of 0 is likewise ignored: still no dictionary for all-unique data
statement ok
COPY test TO '__TEST_DIR__/dictionary_compression_ratio_threshold.parquet' (dictionary_compression_ratio_threshold 0)
query I
SELECT dictionary_page_offset IS NULL FROM parquet_metadata('__TEST_DIR__/dictionary_compression_ratio_threshold.parquet')
----
true


@@ -0,0 +1,14 @@
# name: test/sql/copy/parquet/enum_converted_type.test
# description: Test enum converted type
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
query IIIIIII
select * from 'data/parquet-testing/enum.parquet';
----
1 0 t1 test_span 1612550512340953 500000 [{'key': service_name, 'v_type': STRING, 'v_str': test_service, 'v_bool': false, 'v_int64': 0, 'v_float64': 0.0, 'v_binary': ''}, {'key': http_method, 'v_type': STRING, 'v_str': POST, 'v_bool': false, 'v_int64': 0, 'v_float64': 0.0, 'v_binary': ''}, {'key': method, 'v_type': STRING, 'v_str': callbacks.flannel, 'v_bool': false, 'v_int64': 0, 'v_float64': 0.0, 'v_binary': ''}, {'key': boolean, 'v_type': BOOL, 'v_str': '', 'v_bool': true, 'v_int64': 0, 'v_float64': 0.0, 'v_binary': ''}, {'key': int, 'v_type': INT64, 'v_str': '', 'v_bool': false, 'v_int64': 1000, 'v_float64': 1001.2, 'v_binary': ''}, {'key': float, 'v_type': FLOAT64, 'v_str': '', 'v_bool': false, 'v_int64': 1000, 'v_float64': 1001.2, 'v_binary': ''}, {'key': binary, 'v_type': BINARY, 'v_str': ignored, 'v_bool': false, 'v_int64': 0, 'v_float64': 0.0, 'v_binary': binaryTagValue}, {'key': type, 'v_type': STRING, 'v_str': msg_type, 'v_bool': false, 'v_int64': 0, 'v_float64': 0.0, 'v_binary': ''}]
2 1 t1 test_span 1612550512340954 500001 [{'key': service_name, 'v_type': STRING, 'v_str': test_service, 'v_bool': false, 'v_int64': 0, 'v_float64': 0.0, 'v_binary': ''}, {'key': http_method, 'v_type': STRING, 'v_str': POST, 'v_bool': false, 'v_int64': 0, 'v_float64': 0.0, 'v_binary': ''}, {'key': method, 'v_type': STRING, 'v_str': callbacks.flannel, 'v_bool': false, 'v_int64': 0, 'v_float64': 0.0, 'v_binary': ''}, {'key': boolean, 'v_type': BOOL, 'v_str': '', 'v_bool': true, 'v_int64': 0, 'v_float64': 0.0, 'v_binary': ''}, {'key': int, 'v_type': INT64, 'v_str': '', 'v_bool': false, 'v_int64': 1000, 'v_float64': 1001.2, 'v_binary': ''}, {'key': float, 'v_type': FLOAT64, 'v_str': '', 'v_bool': false, 'v_int64': 1000, 'v_float64': 1001.2, 'v_binary': ''}, {'key': binary, 'v_type': BINARY, 'v_str': ignored, 'v_bool': false, 'v_int64': 0, 'v_float64': 0.0, 'v_binary': binaryTagValue}, {'key': type, 'v_type': STRING, 'v_str': msg_type, 'v_bool': false, 'v_int64': 0, 'v_float64': 0.0, 'v_binary': ''}]


@@ -0,0 +1,17 @@
# name: test/sql/copy/parquet/file_metadata.test
# group: [parquet]
require parquet
statement ok
SET parquet_metadata_cache = true;
query IIIIIIIII
SELECT * FROM parquet_file_metadata('data/parquet-testing/arrow/alltypes_dictionary.parquet')
----
data/parquet-testing/arrow/alltypes_dictionary.parquet impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9) 2 1 1 NULL NULL 1698 723
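# querying again should hit the metadata cache and return the same result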
query IIIIIIIII
SELECT * FROM parquet_file_metadata('data/parquet-testing/arrow/alltypes_dictionary.parquet')
----
data/parquet-testing/arrow/alltypes_dictionary.parquet impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9) 2 1 1 NULL NULL 1698 723


@@ -0,0 +1,10 @@
# name: test/sql/copy/parquet/fixed.test
# description: Strings in fixed length binary arrays
# group: [parquet]
require parquet
query I
SELECT data FROM parquet_scan('data/parquet-testing/fixed.parquet')
----
\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F


@@ -0,0 +1,38 @@
# name: test/sql/copy/parquet/float16.test
# description: Test reading half-floats
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
query III
select type, type_length, logical_type from parquet_schema('data/parquet-testing/float16.parquet') where name = 'x'
----
FIXED_LEN_BYTE_ARRAY 2 Float16Type()
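# half-precision values are read back as 32-bit FLOAT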
query I
select typeof(x) from read_parquet('data/parquet-testing/float16.parquet') limit 1;
----
FLOAT
query I
select x from read_parquet('data/parquet-testing/float16.parquet') order by x;
----
-inf
0.0
0.5
1.0
1.5
inf
nan
-nan
query I
select x from read_parquet('data/parquet-testing/float16.parquet') where x > 1.1 order by x;
----
1.5
inf
nan
-nan


@@ -0,0 +1,37 @@
# name: test/sql/copy/parquet/hive_partitioning_struct.test
# description: Test hive partitioning and struct pushdown
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
statement ok
COPY (
SELECT
i//50 id,
{'a': i, 'b': 21} s
FROM range(100) t(i)
) TO '__TEST_DIR__/hive_partitioned_struct_col' (FORMAT PARQUET, PARTITION_BY (id))
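# filters on struct fields should work across the hive-partitioned files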
query II
SELECT * FROM read_parquet('__TEST_DIR__/hive_partitioned_struct_col/**/*.parquet', hive_partitioning=1) WHERE s.a=42
----
{'a': 42, 'b': 21} 0
query I
SELECT s.a FROM read_parquet('__TEST_DIR__/hive_partitioned_struct_col/**/*.parquet', hive_partitioning=1) WHERE s.a=42
----
42
# what if the hive types themselves are structs?
statement ok
COPY (SELECT i id, {'a': i//2} s FROM range(100) t(i)) TO '__TEST_DIR__/hive_partitioned_struct' (FORMAT PARQUET, PARTITION_BY (s))
query II
SELECT * FROM read_parquet('__TEST_DIR__/hive_partitioned_struct/**/*.parquet', hive_partitioning=1, hive_types={'s': 'STRUCT(a INT)'}) WHERE s.a=42 ORDER BY ALL
----
84 {'a': 42}
85 {'a': 42}


@@ -0,0 +1,62 @@
# name: test/sql/copy/parquet/hive_timestamps.test
# description: Prefer strict hive timestamps to dates
# group: [parquet]
require parquet
# requires notwindows for embedded spaces in the path
require notwindows
statement ok
PRAGMA enable_verification
set seed 0.8675309
statement ok
CREATE TABLE raw_data (
ts TIMESTAMP_S NOT NULL,
hits INTEGER NOT NULL
);
statement ok
INSERT INTO raw_data
SELECT *, (random() * 500)::INTEGER
FROM RANGE(TIMESTAMP '2023-11-01', TIMESTAMP '2023-11-06', INTERVAL 1 MINUTE);
statement ok
CREATE TABLE timeseries AS (
SELECT DATE_TRUNC('hour', ts) AS bucket, SUM(hits)::BIGINT AS total
FROM raw_data
GROUP BY bucket
);
query II
SELECT *
FROM timeseries
ORDER BY ALL
LIMIT 5
----
2023-11-01 00:00:00 15127
2023-11-01 01:00:00 16634
2023-11-01 02:00:00 14676
2023-11-01 03:00:00 14493
2023-11-01 04:00:00 13288
statement ok
COPY (
SELECT * FROM timeseries
) TO '__TEST_DIR__/hive' (
FORMAT 'PARQUET', COMPRESSION 'SNAPPY', PARTITION_BY (bucket), OVERWRITE_OR_IGNORE
);
query II
SELECT bucket, total
FROM read_parquet('__TEST_DIR__/hive/*/*.parquet')
ORDER BY ALL
LIMIT 5
----
2023-11-01 00:00:00 15127
2023-11-01 01:00:00 16634
2023-11-01 02:00:00 14676
2023-11-01 03:00:00 14493
2023-11-01 04:00:00 13288


@@ -0,0 +1,60 @@
# name: test/sql/copy/parquet/incorrect_converted_type.test
# description: Test parquet files with incorrect converted type annotations
# group: [parquet]
require parquet
statement error
SELECT * FROM 'data/parquet-testing/broken/broken_bigint.parquet';
----
<REGEX>:.*IO Error.*converted type.*
statement error
SELECT * FROM 'data/parquet-testing/broken/broken_date.parquet';
----
<REGEX>:.*IO Error.*converted type.*
statement error
SELECT * FROM 'data/parquet-testing/broken/broken_int.parquet';
----
<REGEX>:.*IO Error.*converted type.*
statement error
SELECT * FROM 'data/parquet-testing/broken/broken_smallint.parquet';
----
<REGEX>:.*IO Error.*converted type.*
statement error
SELECT * FROM 'data/parquet-testing/broken/broken_timestamp.parquet';
----
<REGEX>:.*IO Error.*converted type.*
statement error
SELECT * FROM 'data/parquet-testing/broken/broken_timestamp_ms.parquet';
----
<REGEX>:.*IO Error.*converted type.*
statement error
SELECT * FROM 'data/parquet-testing/broken/broken_tinyint.parquet';
----
<REGEX>:.*IO Error.*converted type.*
statement error
SELECT * FROM 'data/parquet-testing/broken/broken_ubigint.parquet';
----
<REGEX>:.*IO Error.*converted type.*
statement error
SELECT * FROM 'data/parquet-testing/broken/broken_uinteger.parquet';
----
<REGEX>:.*IO Error.*converted type.*
statement error
SELECT * FROM 'data/parquet-testing/broken/broken_usmallint.parquet';
----
<REGEX>:.*IO Error.*converted type.*
statement error
SELECT * FROM 'data/parquet-testing/broken/broken_utinyint.parquet';
----
<REGEX>:.*IO Error.*converted type.*


@@ -0,0 +1,24 @@
# name: test/sql/copy/parquet/infer_copy_format.test
# description: Infer COPY TO format test
# group: [parquet]
require parquet
statement ok
CREATE TABLE integers AS SELECT * FROM range(6) tbl(i);
statement ok
COPY integers TO '__TEST_DIR__/integers.parquet';
query I
SELECT SUM(i) FROM '__TEST_DIR__/integers.parquet';
----
15
statement ok
COPY integers TO '__TEST_DIR__/integers.csv';
query I
SELECT SUM(i) FROM '__TEST_DIR__/integers.csv' tbl(i);
----
15


@@ -0,0 +1,24 @@
# name: test/sql/copy/parquet/json_parquet.test
# description: Test JSON Parquet
# group: [parquet]
require parquet
require json
statement ok
CREATE TABLE json_tbl AS FROM 'data/parquet-testing/json_convertedtype.parquet';
query I
SELECT json_extract(TX_JSON[1], 'block_hash') FROM json_tbl
----
"0x95cc694a09424ba463e4b1b704b86a56a41521473b3b4875691383c3d5c799b3"
"0x5aa34b59d13fc0c6c199c67451c5643ecfd905ee1ac940478b1e700203c707be"
"0x987d9d3a51a630337cdbd78858676ca6237ea692306ebd3586b4c3cb79e3762c"
"0x5f90325321b570ba4ade766478df3c73a5b89336c2b57b8fa8e64d2f937639d9"
"0x48cb55cb6814a86cbba8e5e859df11bc8f01f772a3972fb5c22a31d1339c24e4"
"0x059f581a8c9b196e0d11b462be0e194eb837922e17409c94c6888b72d0001b49"
"0xa3855d62087826f46f97d6c90854c223f19bf95b2df0d2a446e5f00efbae8b80"
"0x5193e56b142dfca5873d7272ea4892a83a74b3d4faa99f8e49ff83d7d4b53e0d"
"0x099041d5e2b624f8be592db1d624998e06495f680c6134bf1d7401d919bd0af0"
"0x2e5b481bba4f1596484c65ac5c36075d83eb0f85d10ee509d808d23f2f2af8e0"


@@ -0,0 +1,54 @@
# name: test/sql/copy/parquet/kv_metadata.test
# group: [parquet]
require parquet
# Test basic roundtrip
statement ok
COPY (SELECT 1, 'foo') TO '__TEST_DIR__/kv_metadata_test.parquet' (FORMAT PARQUET, KV_METADATA {foo: 'bar', baz: 42, quz: '\xC3\xB6\xC3\xA4\xC3\xA5'::BLOB});
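# the quz value is the UTF-8 encoding of 'öäå'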
query II
SELECT key::VARCHAR, value::VARCHAR FROM parquet_kv_metadata('__TEST_DIR__/kv_metadata_test.parquet');
----
foo bar
baz 42
quz \xC3\xB6\xC3\xA4\xC3\xA5
query II
SELECT * FROM '__TEST_DIR__/kv_metadata_test.parquet'
----
1 foo
# Test decoding blobs
query II
SELECT key::VARCHAR, decode(value) FROM parquet_kv_metadata('__TEST_DIR__/kv_metadata_test.parquet') WHERE key = 'quz';
----
quz öäå
# Test invalid metadata
statement error
COPY (SELECT 1, 'foo') TO '__TEST_DIR__/kv_metadata_test_fail.parquet' (FORMAT PARQUET, KV_METADATA 'foobar');
----
Expected kv_metadata argument to be a STRUCT
# Test no kv
statement ok
COPY (SELECT 3, 'baz') TO '__TEST_DIR__/kv_metadata_test3.parquet' (FORMAT PARQUET);
query II
SELECT key::VARCHAR, value::VARCHAR FROM parquet_kv_metadata('__TEST_DIR__/kv_metadata_test3.parquet');
----
# Test globbing
statement ok
COPY (SELECT 2, 'bar') TO '__TEST_DIR__/kv_metadata_test2.parquet' (FORMAT PARQUET, KV_METADATA {a: 'b', c: 'd'});
query III
SELECT replace(replace(file_name, '\', '/'),replace('__TEST_DIR__/', '\', '/'), '') AS file_name, key::VARCHAR, value::VARCHAR FROM parquet_kv_metadata('__TEST_DIR__/kv_metadata_tes*') ORDER BY 1, 2;
----
kv_metadata_test.parquet baz 42
kv_metadata_test.parquet foo bar
kv_metadata_test.parquet quz \xC3\xB6\xC3\xA4\xC3\xA5
kv_metadata_test2.parquet a b
kv_metadata_test2.parquet c d


@@ -0,0 +1,33 @@
# name: test/sql/copy/parquet/lineitem_arrow.test
# description: Issue #2261: TPC-H Q6 fails on Parquet input
# group: [parquet]
require tpch
require parquet
statement ok
CREATE TABLE lineitem AS SELECT * FROM 'data/parquet-testing/arrow/lineitem-arrow.parquet'
query I nosort q01
PRAGMA tpch(1)
----
query I nosort q06
PRAGMA tpch(6)
----
statement ok
DROP TABLE lineitem
statement ok
CREATE VIEW lineitem AS SELECT * FROM 'data/parquet-testing/arrow/lineitem-arrow.parquet'
query I nosort q01
PRAGMA tpch(1)
----
query I nosort q06
PRAGMA tpch(6)
----


@@ -0,0 +1,68 @@
# name: test/sql/copy/parquet/multi_file/multi_file_filter_integer_types.test
# description: Test multi file filters
# group: [multi_file]
require parquet
statement ok
PRAGMA enable_verification
statement ok
COPY (FROM (VALUES ('f1', 42::INT), ('f1', 8::INT), ('f1', NULL::INT)) t(f, i)) TO '__TEST_DIR__/multi_file_filter_f1.parquet'
statement ok
COPY (FROM (VALUES (42::BIGINT, 'f2'), (124::BIGINT, 'f2'), (NULL::BIGINT, 'f2')) t(i, f)) TO '__TEST_DIR__/multi_file_filter_f2.parquet'
# the schema of the multi-file scan depends on the first file read
statement ok
CREATE VIEW integer_file_first AS FROM read_parquet(['__TEST_DIR__/multi_file_filter_f1.parquet', '__TEST_DIR__/multi_file_filter_f2.parquet'])
statement ok
CREATE VIEW bigint_file_first AS FROM read_parquet(['__TEST_DIR__/multi_file_filter_f2.parquet', '__TEST_DIR__/multi_file_filter_f1.parquet'])
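# string constants in the filters below are cast to the numeric column type, so '042' matches 42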
# equality
query II
SELECT f, i
FROM integer_file_first
WHERE i='042'
----
f1 42
f2 42
query II
SELECT f, i
FROM bigint_file_first
WHERE i='042'
ORDER BY ALL
----
f1 42
f2 42
# greater than
query II
SELECT f, i
FROM integer_file_first
WHERE i>10
ORDER BY ALL
----
f1 42
f2 42
f2 124
query II
SELECT f, i
FROM bigint_file_first
WHERE i>'10'
ORDER BY ALL
----
f1 42
f2 42
f2 124
query II
SELECT f, i
FROM integer_file_first
WHERE i IS NULL
----
f1 NULL
f2 NULL


@@ -0,0 +1,69 @@
# name: test/sql/copy/parquet/multi_file/multi_file_filter_mixed.test
# description: Test multi file filters
# group: [multi_file]
require parquet
statement ok
PRAGMA enable_verification
statement ok
COPY (FROM (VALUES ('f1', 42), ('f1', 8), ('f1', NULL)) t(f, i)) TO '__TEST_DIR__/multi_file_filter_f1.parquet'
statement ok
COPY (FROM (VALUES ('042', 'f2'), ('124', 'f2'), (NULL, 'f2')) t(i, f)) TO '__TEST_DIR__/multi_file_filter_f2.parquet'
# the schema of the multi-file scan depends on the first file read
statement ok
CREATE VIEW integer_file_first AS FROM read_parquet(['__TEST_DIR__/multi_file_filter_f1.parquet', '__TEST_DIR__/multi_file_filter_f2.parquet'])
statement ok
CREATE VIEW string_file_first AS FROM read_parquet(['__TEST_DIR__/multi_file_filter_f2.parquet', '__TEST_DIR__/multi_file_filter_f1.parquet'])
# equality
# casting to integer - this works (since '042' = 42)
query II
SELECT f, i
FROM integer_file_first
WHERE i='042'
----
f1 42
f2 42
# casting to string - we only get '042' now
query II
SELECT f, i
FROM string_file_first
WHERE i='042'
----
f2 042
# greater than
query II
SELECT f, i
FROM integer_file_first
WHERE i>10
ORDER BY ALL
----
f1 42
f2 42
f2 124
# for strings, '8' is bigger than '10' (since '8' is bigger than '1')
query II
SELECT f, i
FROM string_file_first
WHERE i>'10'
ORDER BY ALL
----
f1 42
f1 8
f2 124
query II
SELECT f, i
FROM integer_file_first
WHERE i IS NULL
----
f1 NULL
f2 NULL


@@ -0,0 +1,81 @@
# name: test/sql/copy/parquet/multi_file/multi_file_filter_struct.test
# description: Test multi file filters on structs
# group: [multi_file]
require parquet
# statement ok
# PRAGMA enable_verification
statement ok
COPY (SELECT {'f': f, 'i': i} struct_val FROM (VALUES ('f1', 42::INT), ('f1', 8::INT), ('f1', NULL::INT)) t(f, i)) TO '__TEST_DIR__/multi_file_filter_f1.parquet'
statement ok
COPY (SELECT {'i': i, 'f2': f} struct_val FROM (VALUES (42::BIGINT, 'f2'), (124::BIGINT, 'f2'), (NULL::BIGINT, 'f2')) t(i, f)) TO '__TEST_DIR__/multi_file_filter_f2.parquet'
# the schema of the multi-file scan depends on the first file read
statement ok
CREATE VIEW integer_file_first AS FROM read_parquet(['__TEST_DIR__/multi_file_filter_f1.parquet', '__TEST_DIR__/multi_file_filter_f2.parquet'])
statement ok
CREATE VIEW bigint_file_first AS FROM read_parquet(['__TEST_DIR__/multi_file_filter_f2.parquet', '__TEST_DIR__/multi_file_filter_f1.parquet'])
# projection pushdown
query I
SELECT struct_val.i
FROM integer_file_first
ORDER BY ALL
----
8
42
42
124
NULL
NULL
# equality
query II
SELECT struct_val.f, struct_val.i
FROM integer_file_first
WHERE struct_val.i='042'
----
f1 42
NULL 42
query I
SELECT struct_val.i
FROM bigint_file_first
WHERE struct_val.i='042'
ORDER BY ALL
----
42
42
# greater than
query II
SELECT struct_val.f, struct_val.i
FROM integer_file_first
WHERE struct_val.i>10
ORDER BY ALL
----
f1 42
NULL 42
NULL 124
query I
SELECT struct_val.i
FROM bigint_file_first
WHERE struct_val.i>'10'
ORDER BY ALL
----
42
42
124
query II
SELECT struct_val.f, struct_val.i
FROM integer_file_first
WHERE struct_val.i IS NULL
----
f1 NULL
NULL NULL


@@ -0,0 +1,24 @@
# name: test/sql/copy/parquet/multi_file_conversion_error.test
# description: Test multi-file conversion error
# group: [parquet]
require parquet
statement ok
copy (select 42 as a) to '__TEST_DIR__/conversion_error1.parquet';
statement ok
copy (select blob 'hello world' as a) to '__TEST_DIR__/conversion_error2.parquet';
statement error
SELECT * FROM read_parquet(['__TEST_DIR__/conversion_error1.parquet', '__TEST_DIR__/conversion_error2.parquet'])
----
failed to cast column "a" from type BLOB to INTEGER
statement ok
CREATE TABLE integers(i INT);
statement error
COPY integers FROM '__TEST_DIR__/conversion_error*.parquet'
----
column "a" has type BLOB, but we are trying to load it into column "i" with type INTEGER


@@ -0,0 +1,49 @@
# name: test/sql/copy/parquet/parallel_parquet_glob.test
# description: Test parallel reads on multiple parquet files
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
statement ok
PRAGMA threads=4
query I
select count(*) from parquet_scan('data/parquet-testing/glob/t?.parquet')
----
2
query I
select count(*) from parquet_scan('data/parquet-testing/glob/*')
----
2
query I
select count(*) from parquet_scan('data/parquet-testing/glob/*.parquet')
----
2
query I
select count(*) from parquet_scan('data/parquet-testing/g*/*.parquet')
----
3
query I
select count(*) from parquet_scan('data/parquet-testing/g*/t1.parquet')
----
2
statement ok
SET parquet_metadata_cache=true
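# repeated globbing scans should now be served from the metadata cache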
query I
select count(*) from parquet_scan('data/parquet-testing/g*/t1.parquet')
----
2
query I
select count(*) from parquet_scan('data/parquet-testing/g*/t1.parquet')
----
2


@@ -0,0 +1,285 @@
# name: test/sql/copy/parquet/parquet2.test
# description: Issue #2261: TPC-H Q6 fails on Parquet input
# group: [parquet]
# Here's how we generate this
# from pyspark.sql import SparkSession
# from pyspark.sql.types import *
#
# spark = SparkSession.builder.master("local").config("spark.hadoop.parquet.writer.version", "v2").getOrCreate()
# sc = spark.sparkContext
#
# ref = spark.range(42, 10000, 2).toDF("id").orderBy(rand())
# ref.show(10)
#
# ref.write.parquet("p2.parquet")
require parquet
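# the file holds 4979 rows, so this offset returns the last 11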
query I
SELECT id FROM 'data/parquet-testing/p2.parquet' offset 4968;
----
1436
2596
4774
4402
5378
5372
8658
808
5876
7214
9816
query I
SELECT id FROM 'data/parquet-testing/p2.parquet' limit 10;
----
2644
8534
3276
5264
5766
6018
2080
576
1350
9312
query I
SELECT id FROM 'data/parquet-testing/p2.parquet' limit 100;
----
2644
8534
3276
5264
5766
6018
2080
576
1350
9312
8898
1126
6704
2836
390
4440
7582
4386
4482
6866
7814
7246
8998
8454
2004
7770
7590
9092
7586
4762
5672
6782
3968
8102
726
3384
3232
9628
4460
556
1368
560
4116
4294
988
1404
8380
862
9172
3964
5728
8018
8052
8786
8828
8140
4044
324
7102
5898
6848
174
5240
4834
1354
5080
2386
7402
8508
2006
1270
4936
4682
436
6056
7772
2792
982
7028
8964
6632
4062
8260
9494
6260
8850
9238
7968
9430
8156
9388
478
4478
3400
370
130
552
7614
1234
5302
query I
SELECT id_with_null FROM 'data/parquet-testing/p2.parquet' limit 100;
----
2644
8534
3276
5264
5766
6018
NULL
576
NULL
9312
8898
1126
6704
2836
NULL
NULL
7582
4386
4482
6866
7814
7246
8998
8454
2004
NULL
NULL
9092
7586
4762
5672
6782
3968
8102
726
3384
3232
9628
NULL
556
1368
NULL
4116
4294
988
1404
NULL
862
9172
3964
5728
8018
8052
8786
8828
NULL
4044
324
7102
5898
6848
174
NULL
4834
1354
NULL
2386
7402
8508
2006
NULL
4936
4682
436
6056
7772
2792
982
7028
8964
6632
4062
NULL
9494
NULL
NULL
9238
7968
NULL
8156
9388
478
4478
NULL
NULL
NULL
552
7614
1234
5302
query IIIIIIII
select min(id), max(id), sum(id), count(id), min(id_with_null), max(id_with_null), sum(id_with_null), count(id_with_null) from 'data/parquet-testing/p2.parquet'
----
42 9998 24994580 4979 42 9998 19999680 3984
query IIII
select min(id_int), max(id_int), sum(id_int), count(id_int) from 'data/parquet-testing/p2.parquet'
----
42 9998 19999680 3984
# from bug 2882
query I
select * from 'data/parquet-testing/7-set.snappy.arrow2.parquet';
----
0
1
2
3
4
5
6

View File

@@ -0,0 +1,27 @@
# name: test/sql/copy/parquet/parquet2strings.test
# description: Issue #2261: TPC-H Q6 fails on Parquet input
# group: [parquet]
# Here's how we generate this
# from pyspark.sql import SparkSession
# from pyspark.sql.types import *
# from pyspark.sql.functions import *
# spark = SparkSession.builder.master("local").config("spark.hadoop.parquet.writer.version", "v2").getOrCreate()
# sc = spark.sparkContext
# ref = spark.range(42, 10000, 2).toDF("id").orderBy(rand())
# ref2 = ref.selectExpr("*", "repeat('XYZ', id%5) || cast(id as string) id_string")
# ref2.show(10)
# ref2.write.parquet("p2strings.parquet")
# ref2.write.csv("p2strings.csv")
# skipped for now
mode skip
require parquet
query I
SELECT id_string FROM 'data/parquet-testing/p2strings.parquet' limit 10;
----

View File

@@ -0,0 +1,19 @@
# name: test/sql/copy/parquet/parquet_10148.test
# description: Issue #10148: Wide decimal values in stats
# group: [parquet]
require parquet
query I
SELECT CDCONO FROM 'data/parquet-testing/bug10148-wide-decimal-stats.parquet'
----
0
0
0
0
0
0
0
0
0
0

View File

@@ -0,0 +1,11 @@
# name: test/sql/copy/parquet/parquet_10279.test
# description: Issue #10279: Data loss with parquet INT64 and DELTA encoding
# group: [parquet]
require parquet
query IIIIIIII
SELECT * FROM 'data/parquet-testing/issue10279_delta_encoding.parquet'
----
MIN_VALUE false -128 -32768 -2147483648 -9223372036854775808 1e-45 5e-324
MAX_VALUE true 127 32767 2147483647 9223372036854775807 3.4028235e+38 1.7976931348623157e+308

View File

@@ -0,0 +1,12 @@
# name: test/sql/copy/parquet/parquet_12621.test
# description: Issue #12621: Parquet read : Invalid decimal encoding in Parquet file
# group: [parquet]
require parquet
query I
select *
from read_parquet('data/parquet-testing/issue12621.parquet')
limit 1;
----
0.0000

View File

@@ -0,0 +1,36 @@
# name: test/sql/copy/parquet/parquet_13053_duplicate_column_names.test
# description: Issue #13053: Parquet reader can't deal with duplicate column names
# group: [parquet]
require parquet
# original names
query I
select name from parquet_schema( 'data/parquet-testing/bug13053.parquet') offset 1;
----
column
COLUMN
Column
# renamed names
query I
SELECT column_name FROM (DESCRIBE FROM 'data/parquet-testing/bug13053.parquet')
----
column
COLUMN_1
Column_2
# case where _1 is already a column; maybe a bit ugly, but fine and consistent with the CSV reader
query I
select name from parquet_schema( 'data/parquet-testing/bug13053-2.parquet') offset 1;
----
column
column_1
column
query I
SELECT column_name FROM (DESCRIBE FROM 'data/parquet-testing/bug13053-2.parquet')
----
column
column_1
column_1_1
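# the deduplicated names can be referenced directly (a sketch, not executed here):
#
#query III
#SELECT "column", "column_1", "column_1_1" FROM 'data/parquet-testing/bug13053-2.parquet'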

View File

@@ -0,0 +1,16 @@
# name: test/sql/copy/parquet/parquet_1554.test
# description: Unclear what went wrong here in the past, but it's fixed; let's make sure it never happens again
# group: [parquet]
require parquet
query I
SELECT COUNT(backlink_count) FROM parquet_scan('data/parquet-testing/bug1554.parquet') WHERE http_status_code=200
----
0
query II
SELECT http_status_code, COUNT(backlink_count) FROM parquet_scan('data/parquet-testing/bug1554.parquet') GROUP BY http_status_code ORDER BY http_status_code
----
200 0
301 0

View File

@@ -0,0 +1,54 @@
# name: test/sql/copy/parquet/parquet_1588.test
# description: Test boolean filters
# group: [parquet]
require parquet
statement ok
pragma enable_verification
# pandas equivalent:
# df = pandas.read_parquet('data/parquet-testing/bug1588.parquet')
# df[(df.has_image_link == 1) & ((df.has_image_alt_text == 1) | (df.is_image_alt_text_empty == 1))]
statement ok
create table some_bools (val boolean);
statement ok
insert into some_bools values (TRUE)
query I
select count(*) from some_bools where val = 1;
----
1
query I
select count(*) from some_bools where val = '1'::bool;
----
1
query I
SELECT has_image_link FROM parquet_scan('data/parquet-testing/bug1588.parquet') where has_image_link = 1
----
1
1
1
query I
SELECT COUNT(*) FROM parquet_scan('data/parquet-testing/bug1588.parquet') WHERE has_image_link = 1
----
3
query I
SELECT COUNT(*) FROM parquet_scan('data/parquet-testing/bug1588.parquet') WHERE has_image_link = '1'::bool
----
3
# original query for the lolz
query I
SELECT COUNT(*) FROM parquet_scan('data/parquet-testing/bug1588.parquet') WHERE (has_image_link = 1 AND (has_image_alt_text = 0 OR is_image_alt_text_empty = 1))
----
2

View File

@@ -0,0 +1,16 @@
# name: test/sql/copy/parquet/parquet_1589.test
# description: Test boolean filters
# group: [parquet]
require parquet
statement ok
pragma enable_verification
query I
SELECT backlink_count FROM parquet_scan('data/parquet-testing/bug1589.parquet') LIMIT 1
----
NULL
statement ok
SELECT * FROM parquet_scan('data/parquet-testing/bug1589.parquet')

View File

@@ -0,0 +1,23 @@
# name: test/sql/copy/parquet/parquet_1618_struct_strings.test
# description: Unclear what went wrong here in the past, but it's fixed; let's make sure it never happens again
# group: [parquet]
require parquet
query I
SELECT "inner"['str_field'] FROM parquet_scan('data/parquet-testing/bug1618_struct_strings.parquet')
----
hello
NULL
query I
SELECT "inner"['f64_field'] FROM parquet_scan('data/parquet-testing/bug1618_struct_strings.parquet')
----
NULL
1.23
query I
SELECT "inner" FROM parquet_scan('data/parquet-testing/bug1618_struct_strings.parquet')
----
{'str_field': hello, 'f64_field': NULL}
{'str_field': NULL, 'f64_field': 1.23}

View File

@@ -0,0 +1,23 @@
# name: test/sql/copy/parquet/parquet_1619.test
# description: Error: Not implemented Error: Expr of type 347 not implemented
# group: [parquet]
require parquet
query I
select struct_extract("inner", 'f64_field') from parquet_scan('data/parquet-testing/struct.parquet');
----
NULL
1.23
query I
select ("inner")."f64_field" from parquet_scan('data/parquet-testing/struct.parquet');
----
NULL
1.23
query I
select "inner"['f64_field'] from parquet_scan('data/parquet-testing/struct.parquet');
----
NULL
1.23
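# all three forms above (struct_extract, dot notation, and bracket indexing) are
# equivalent ways of reading a struct field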

View File

@@ -0,0 +1,14 @@
# name: test/sql/copy/parquet/parquet_1723.test_slow
# description: CREATE TABLE from parquet crashes latest bleeding edge
# group: [parquet]
require parquet
query I nosort query
select * from 'data/parquet-testing/leftdate3_192_loop_1.parquet'
statement ok
create table test as select * from 'data/parquet-testing/leftdate3_192_loop_1.parquet'
query I nosort query
select * from test

View File

@@ -0,0 +1,15 @@
# name: test/sql/copy/parquet/parquet_2267.test
# description: Issue #2267: "Struct child row count mismatch" on parquet read
# group: [parquet]
require parquet
query I
SELECT * FROM parquet_scan('data/parquet-testing/bug2267.parquet')
----
[{'disabledPlans': [bea4c11e-220a-4e6d-8eb8-8ea15d019f90], 'skuId': c7df2760-2c81-4ef7-b578-5b5392b571df}, {'disabledPlans': [8a256a2b-b617-496d-b51b-e76466e88db0, 41781fb2-bc02-4b7c-bd55-b576c07bb09d, eec0eb4f-6444-4f95-aba0-50c24d67f998], 'skuId': 84a661c4-e949-4bd2-a560-ed7766fcaf2b}, {'disabledPlans': [], 'skuId': b05e124f-c7cc-45a0-a6aa-8cf78c946968}, {'disabledPlans': [], 'skuId': f30db892-07e9-47e9-837c-80727f46fd3d}]
query I
SELECT assignedLicenses[1] FROM parquet_scan('data/parquet-testing/bug2267.parquet')
----
{'disabledPlans': [bea4c11e-220a-4e6d-8eb8-8ea15d019f90], 'skuId': c7df2760-2c81-4ef7-b578-5b5392b571df}

View File

@@ -0,0 +1,78 @@
# name: test/sql/copy/parquet/parquet_3896.test
# description: Issue #3896: Error reading parquet file: Struct child row count mismatch
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
# single struct with map and scalar key
statement ok
CREATE VIEW v1 AS
SELECT map([2], [{'key1': map([3,4],[1,2]), 'key2':2}]) AS x
query I nosort mapres1
SELECT * FROM v1;
----
statement ok
COPY v1
TO '__TEST_DIR__/map.parquet' (FORMAT 'parquet');
query I nosort mapres1
SELECT * FROM '__TEST_DIR__/map.parquet';
----
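# note on the pattern used throughout this file: 'query I nosort mapres1' compares
# results under the shared label 'mapres1', so the query on the view and the query
# on the written file must produce identical output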
# multiple structs with map and scalar key
statement ok
CREATE VIEW v2 AS
SELECT map([2], [{'key1': map([3,4],[1,2]), 'key2':2}]) AS x
UNION ALL
SELECT map([2], [{'key1': map([3,4],[1,2]), 'key2':2}])
query I nosort mapres2
SELECT * FROM v2;
----
statement ok
COPY v2
TO '__TEST_DIR__/map.parquet' (FORMAT 'parquet');
query I nosort mapres2
SELECT * FROM '__TEST_DIR__/map.parquet';
----
# struct with struct of lists and scalar key
statement ok
CREATE VIEW v3 AS
SELECT {'key': [2], 'val': [{'key1': {'key': [3,4], 'val': [1,2]}, 'key2':2}]} AS x
query I nosort structres1
SELECT * FROM v3;
----
statement ok
COPY v3
TO '__TEST_DIR__/map.parquet' (FORMAT 'parquet');
query I nosort structres1
SELECT * FROM '__TEST_DIR__/map.parquet';
----
# struct with struct of lists and scalar list key
statement ok
CREATE VIEW v4 AS
SELECT {'key': [2], 'val': [{'key1': {'key': [3,4], 'val': [1,2]}, 'key2':[2]}]} AS x
query I nosort structres2
SELECT * FROM v4;
----
statement ok
COPY v4
TO '__TEST_DIR__/map.parquet' (FORMAT 'parquet');
query I nosort structres2
SELECT * FROM '__TEST_DIR__/map.parquet';
----

View File

@@ -0,0 +1,16 @@
# name: test/sql/copy/parquet/parquet_3989.test
# description: Issue #3989: Skipping more than 1024 values on list column fails
# group: [parquet]
require parquet
statement ok
CREATE TABLE lists as SELECT i as id, [i] as list from range(0,10000) tbl(i);
statement ok
COPY lists to '__TEST_DIR__/list_bug_test.parquet';
query I
SELECT list from '__TEST_DIR__/list_bug_test.parquet' where id = 5000;
----
[5000]

View File

@@ -0,0 +1,10 @@
# name: test/sql/copy/parquet/parquet_4442.test
# description: Issue #4442: Parquet reader converts timestamp to i64 *sometimes*
# group: [parquet]
require parquet
query IIIIIIIIIIIIIIIII
SELECT * FROM 'data/parquet-testing/bug4442.parquet'
----
12 5184 1 22 2011-10-06 22:21:49.58+00 outbound 323020033 {} 2100 33 0 7 10 0 1317427200000 1317939709580 11

View File

@@ -0,0 +1,8 @@
# name: test/sql/copy/parquet/parquet_4859.test
# description: Issue #4859: Structs in structs lost type info in recursive call to TypeHasExactRowCount
# group: [parquet]
require parquet
statement ok
select "repositoryTopics.edges" from "data/parquet-testing/bug4859.parquet"

View File

@@ -0,0 +1,11 @@
# name: test/sql/copy/parquet/parquet_4903.test
# description: Issue #4903: reading a corrupt Parquet file should error cleanly
# group: [parquet]
require parquet
# file is corrupt
statement error
SELECT type_param_constraints FROM 'data/parquet-testing/bug4903.parquet' limit 10
----
<REGEX>:.*Binder Error.*not found in FROM clause.*

View File

@@ -0,0 +1,24 @@
# name: test/sql/copy/parquet/parquet_5209.test
# description: Issue #5209: Parquet writer did not set total_uncompressed_size in column chunk statistics
# group: [parquet]
require parquet
require vector_size 2048
statement ok
SET threads=1;
statement ok
CREATE TABLE test_5209 AS SELECT range FROM range(10000);
statement ok
COPY test_5209 TO '__TEST_DIR__/test_5209.parquet' (ROW_GROUP_SIZE 1000, PARQUET_VERSION 'V1');
query III
SELECT SUM(total_compressed_size) > 10000,
SUM(total_uncompressed_size) > 10000,
SUM(total_uncompressed_size) > SUM(total_compressed_size)
FROM parquet_metadata('__TEST_DIR__/test_5209.parquet');
----
1 1 1

View File

@@ -0,0 +1,13 @@
# name: test/sql/copy/parquet/parquet_6044.test
# description: Issue #6044: node: assertion failure when calling parquet_metadata
# group: [parquet]
require parquet
statement ok
copy (select 0.9 AS a) to '__TEST_DIR__/tiny_decimal.parquet' (format 'parquet', codec 'zstd');
query I
SELECT * FROM '__TEST_DIR__/tiny_decimal.parquet'
----
0.9

View File

@@ -0,0 +1,22 @@
# name: test/sql/copy/parquet/parquet_6580.test
# description: Issue #6580: Error when reading Parquet INT96 timestamps that are far in the past
# group: [parquet]
require parquet
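# dt2 stores seconds since the epoch; make_timestamp() takes microseconds, hence
# the *1000*1000 scaling below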
query IIII
select *, make_timestamp(dt2*1000*1000)
from read_parquet('data/parquet-testing/bug4903.parquet')
where dt2 <= -9214570800
limit 10
----
1678-01-01 00:00:00 -9214570800 1677-12-31 21:00:00 1677-12-31 21:00:00
1677-01-01 00:00:00 -9246106800 1676-12-31 21:00:00 1676-12-31 21:00:00
1676-01-01 00:00:00 -9277729200 1675-12-31 21:00:00 1675-12-31 21:00:00
1675-01-01 00:00:00 -9309265200 1674-12-31 21:00:00 1674-12-31 21:00:00
1674-01-01 00:00:00 -9340801200 1673-12-31 21:00:00 1673-12-31 21:00:00
1673-01-01 00:00:00 -9372337200 1672-12-31 21:00:00 1672-12-31 21:00:00
1672-01-01 00:00:00 -9403959600 1671-12-31 21:00:00 1671-12-31 21:00:00
1671-01-01 00:00:00 -9435495600 1670-12-31 21:00:00 1670-12-31 21:00:00
1670-01-01 00:00:00 -9467031600 1669-12-31 21:00:00 1669-12-31 21:00:00
1669-01-01 00:00:00 -9498567600 1668-12-31 21:00:00 1668-12-31 21:00:00

View File

@@ -0,0 +1,25 @@
# name: test/sql/copy/parquet/parquet_6630_union_by_name.test
# description: Issue #6630: Segmentation Fault when using union_by_name with read_parquet
# group: [parquet]
require parquet
query II
select
distinct name,
true as is_suspended_or_cancelled
from read_parquet('data/parquet-testing/issue6630_*.parquet',union_by_name=True)
where "timestamp" between '2023-01-26 20:00:00' and '2023-01-28 04:00:00'
and (suspended = true or cancelled <> '' or state='SUSPENDED')
and actual_time is null;
----
query II
select
distinct name,
true as is_suspended_or_cancelled
from read_parquet('data/parquet-testing/issue6630_*.parquet', union_by_name=False)
where "timestamp" between '2023-01-26 20:00:00' and '2023-01-28 04:00:00'
and (suspended = true or cancelled <> '' or state='SUSPENDED')
and actual_time is null;
----

View File

@@ -0,0 +1,48 @@
# name: test/sql/copy/parquet/parquet_6933.test
# description: Issue #6933: Segfault when using parquet_metadata_cache alongside union_by_name for parquet files
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE table1 (
name VARCHAR,
);
statement ok
INSERT INTO table1 VALUES ('Test value 1!');
statement ok
INSERT INTO table1 VALUES ('Test value 2!');
statement ok
COPY table1 TO '__TEST_DIR__/output1.parquet' (FORMAT PARQUET);
statement ok
CREATE TABLE table2 (
name VARCHAR,
number INTEGER,
);
statement ok
INSERT INTO table2 VALUES ('Other test value', 1);
statement ok
INSERT INTO table2 VALUES ('Other test value', 2);
statement ok
COPY table2 TO '__TEST_DIR__/output2.parquet' (FORMAT PARQUET);
statement ok
set parquet_metadata_cache=true;
query II
SELECT name, number FROM read_parquet(['__TEST_DIR__/output*.parquet'], union_by_name=True) ORDER BY name, number
----
Other test value 1
Other test value 2
Test value 1! NULL
Test value 2! NULL

View File

@@ -0,0 +1,11 @@
# name: test/sql/copy/parquet/parquet_6990.test_slow
# description: Issue #6990: Reading parquet file causes a segfault
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
statement ok
SELECT * FROM 'data/parquet-testing/issue6990.parquet';

View File

@@ -0,0 +1,15 @@
# name: test/sql/copy/parquet/parquet_arrow_timestamp.test
# description: Test loading a timestamp column from an arrow-parquet generated file
# group: [parquet]
require parquet
query T
select * from parquet_scan('data/parquet-testing/timestamp.parquet') order by 1
----
2020-10-05 17:21:49.48844
query T
select * from parquet_scan('data/parquet-testing/timestamp-ms.parquet') order by 1
----
2020-10-05 17:21:49

View File

@@ -0,0 +1,15 @@
# name: test/sql/copy/parquet/parquet_blob.test
# description: Test parquet file with blob content
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
query III
select * from parquet_scan('data/parquet-testing/blob.parquet')
----
1 \x04\x00 str1
2 \x04\x00\x80 str2
3 \x03\xFF\x00\xFF str3

View File

@@ -0,0 +1,90 @@
# name: test/sql/copy/parquet/parquet_blob_string.test
# description: Test binary_as_string BLOB Function
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
query I
SELECT typeof(#1) FROM parquet_scan('data/parquet-testing/binary_string.parquet',binary_as_string=False) limit 1
----
BLOB
query I
SELECT * FROM parquet_scan('data/parquet-testing/binary_string.parquet',binary_as_string=False)
----
foo
bar
baz
query I
SELECT typeof(#1) FROM parquet_scan('data/parquet-testing/binary_string.parquet',binary_as_string=True) limit 1
----
VARCHAR
query I
SELECT * FROM parquet_scan('data/parquet-testing/binary_string.parquet',binary_as_string=True)
----
foo
bar
baz
query I
SELECT converted_type FROM parquet_schema('data/parquet-testing/binary_string.parquet')
----
NULL
NULL
statement error
SET binary_as_sting=true
----
statement ok
SET binary_as_string=true
query I
SELECT typeof(#1) FROM parquet_scan('data/parquet-testing/binary_string.parquet') limit 1
----
VARCHAR
query I
SELECT * FROM parquet_scan('data/parquet-testing/binary_string.parquet')
----
foo
bar
baz
statement ok
SET binary_as_string=false
query I
SELECT typeof(#1) FROM parquet_scan('data/parquet-testing/binary_string.parquet') limit 1
----
BLOB
query I
SELECT * FROM parquet_scan('data/parquet-testing/binary_string.parquet')
----
foo
bar
baz
# The option set in the scan takes precedence over the global setting
statement ok
PRAGMA binary_as_string=1
query I
SELECT typeof(#1) FROM parquet_scan('data/parquet-testing/binary_string.parquet' ,binary_as_string=False) limit 1
----
BLOB
query I
SELECT * FROM parquet_scan('data/parquet-testing/binary_string.parquet')
----
foo
bar
baz

View File

@@ -0,0 +1,43 @@
# name: test/sql/copy/parquet/parquet_copy_type_mismatch.test
# description: Test error message when COPY FROM finds a type mismatch
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
statement ok
SET storage_compatibility_version='v1.1.0'
statement ok
CREATE TABLE integers(i INTEGER);
statement ok
COPY (SELECT DATE '1992-01-01' d) TO '__TEST_DIR__/single_date.parquet' (FORMAT parquet);
statement error
COPY integers FROM '__TEST_DIR__/single_date.parquet'
----
the column "d" has type DATE, but we are trying to load it into column "i" with type INTEGER
statement ok
COPY (SELECT DATE '1992-01-01' d, 42 k) TO '__TEST_DIR__/too_many_columns.parquet' (FORMAT parquet);
statement error
COPY integers FROM '__TEST_DIR__/too_many_columns.parquet'
----
Table schema: i INTEGER
# multiple files with different schema
statement ok
COPY (SELECT 42 i) TO '__TEST_DIR__/f2.parquet' (FORMAT parquet);
statement ok
COPY (SELECT date '1992-01-01' d, 84 i) TO '__TEST_DIR__/f1.parquet' (FORMAT parquet);
# result here depends on globbing order
statement maybe
COPY integers FROM '__TEST_DIR__/f*.parquet' (FORMAT parquet);
----
column count mismatch: expected 1 columns but found 2

View File

@@ -0,0 +1,10 @@
# name: test/sql/copy/parquet/parquet_corrupt_stats.test
# description: Test reading a Parquet file with stats that are out-of-range of the type
# group: [parquet]
require parquet
query I
FROM 'data/parquet-testing/out_of_range_stats.parquet'
----
255

View File

@@ -0,0 +1,15 @@
# name: test/sql/copy/parquet/parquet_count_star.test
# description: Test count star
# group: [parquet]
require parquet
query I
SELECT COUNT(*) FROM 'data/parquet-testing/out_of_range_stats.parquet'
----
1
query I
select COUNT(*) from parquet_scan('data/parquet-testing/glob*/t?.parquet')
----
3

View File

@@ -0,0 +1,33 @@
# name: test/sql/copy/parquet/parquet_encoding_skip.test
# description: Test skipping of various encodings
# group: [parquet]
require parquet
foreach parquet_version v1 v2
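# the filters below (id>2995) select only the last few of 3000 rows, so the reader
# has to skip almost an entire row group, exercising each encoding's skip path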
# primitives
statement ok
COPY (SELECT i id, i AS bigint, i::SMALLINT AS tinyint, i::DOUBLE dbl, 'prefix_' || i::VARCHAR str, 'constant' const_str FROM range(3000) t(i)) TO '__TEST_DIR__/skip.parquet' (PARQUET_VERSION '${parquet_version}');
query IIIIII
SELECT * FROM '__TEST_DIR__/skip.parquet' WHERE id>2995
----
2996 2996 2996 2996.0 prefix_2996 constant
2997 2997 2997 2997.0 prefix_2997 constant
2998 2998 2998 2998.0 prefix_2998 constant
2999 2999 2999 2999.0 prefix_2999 constant
# nested types
statement ok
COPY (SELECT i id, [i, i + 1, i + 2] l, {'a': i, 'l': [i, i + 1, i + 2]} struct_1, [{'a': i}, {'a': i + 1}, {'a': i + 2}] struct_2 FROM range(3000) t(i)) TO '__TEST_DIR__/skip_nested.parquet' (PARQUET_VERSION '${parquet_version}');
query IIII
SELECT * FROM '__TEST_DIR__/skip_nested.parquet' WHERE id>2995
----
2996 [2996, 2997, 2998] {'a': 2996, 'l': [2996, 2997, 2998]} [{'a': 2996}, {'a': 2997}, {'a': 2998}]
2997 [2997, 2998, 2999] {'a': 2997, 'l': [2997, 2998, 2999]} [{'a': 2997}, {'a': 2998}, {'a': 2999}]
2998 [2998, 2999, 3000] {'a': 2998, 'l': [2998, 2999, 3000]} [{'a': 2998}, {'a': 2999}, {'a': 3000}]
2999 [2999, 3000, 3001] {'a': 2999, 'l': [2999, 3000, 3001]} [{'a': 2999}, {'a': 3000}, {'a': 3001}]
endloop

View File

@@ -0,0 +1,94 @@
# name: test/sql/copy/parquet/parquet_encrypted_tpch_httpfs.test_slow
# description: Test Parquet encryption with OpenSSL for TPC-H
# group: [parquet]
require parquet
require httpfs
require tpch
statement ok
CALL dbgen(sf=1)
statement ok
PRAGMA add_parquet_key('key128', '0123456789112345')
statement ok
EXPORT DATABASE '__TEST_DIR__/tpch_encrypted' (FORMAT 'parquet', ENCRYPTION_CONFIG {footer_key: 'key128'})
load :memory:
# re-add key upon loading the DB again
statement ok
PRAGMA add_parquet_key('key128', '0123456789112345')
statement ok
IMPORT DATABASE '__TEST_DIR__/tpch_encrypted'
loop i 1 9
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q0${i}.csv
endloop
loop i 10 23
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q${i}.csv
endloop
# now again without importing the DB, just with views, so we can test projection/filter pushdown
load :memory:
# re-add key upon loading the DB again
statement ok
PRAGMA add_parquet_key('key128', '0123456789112345')
statement ok
CREATE VIEW lineitem AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/lineitem.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW orders AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/orders.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW partsupp AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/partsupp.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW part AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/part.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW customer AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/customer.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW supplier AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/supplier.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW nation AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/nation.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW region AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/region.parquet', encryption_config={footer_key: 'key128'});
loop i 1 9
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q0${i}.csv
endloop
loop i 10 23
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q${i}.csv
endloop

View File

@@ -0,0 +1,98 @@
# name: test/sql/copy/parquet/parquet_encryption.test
# description: Test Parquet encryption
# group: [parquet]
require parquet
# parquet keys are not persisted across restarts
require noforcestorage
statement ok
PRAGMA enable_verification
# AES key must have one of the three specified lengths or be valid Base64
statement error
PRAGMA add_parquet_key('my_cool_key', '42')
----
Invalid Input Error: Invalid AES key. Not a plain AES key NOR a base64 encoded string
# Valid Base64 AES key must have one of the three specified lengths
statement error
PRAGMA add_parquet_key('my_invalid_duck_key', 'ZHVjaw==')
----
Invalid Input Error: Invalid AES key. Must have a length of 128, 192, or 256 bits (16, 24, or 32 bytes)
# we don't support this yet
statement error
COPY (SELECT 42 i) to '__TEST_DIR__/encrypted.parquet' (ENCRYPTION_CONFIG {column_keys: {key_name: ['col0', 'col1']}})
----
Not implemented Error: Parquet encryption_config column_keys not yet implemented
statement error
COPY (SELECT 42 i) to '__TEST_DIR__/encrypted.parquet' (ENCRYPTION_CONFIG {footer_key: 'nonexistant'})
----
Binder Error: No key with name "nonexistant" exists. Add it with PRAGMA add_parquet_key('<key_name>','<key>');
# add keys of 3 different lengths
statement ok
PRAGMA add_parquet_key('key128', '0123456789112345')
statement ok
PRAGMA add_parquet_key('key192', '012345678911234501234567')
statement ok
PRAGMA add_parquet_key('key256', '01234567891123450123456789112345')
# test all valid AES key lengths
foreach key_len 128 192 256
statement ok
COPY (SELECT 42 i) to '__TEST_DIR__/encrypted${key_len}.parquet' (ENCRYPTION_CONFIG {footer_key: 'key${key_len}'})
query I
SELECT * FROM read_parquet('__TEST_DIR__/encrypted${key_len}.parquet', encryption_config={footer_key: 'key${key_len}'})
----
42
statement ok
CREATE OR REPLACE TABLE test (i INTEGER)
statement ok
COPY test FROM '__TEST_DIR__/encrypted${key_len}.parquet' (ENCRYPTION_CONFIG {footer_key: 'key${key_len}'})
query I
SELECT * FROM test
----
42
endloop
# what happens if we don't try to decrypt even if the file is encrypted?
statement error
SELECT * FROM read_parquet('__TEST_DIR__/encrypted128.parquet')
----
Invalid Input Error
# what if we try to decrypt with the wrong key?
statement error
SELECT * FROM read_parquet('__TEST_DIR__/encrypted128.parquet', encryption_config={footer_key: 'key192'})
----
Invalid Input Error: Computed AES tag differs from read AES tag, are you using the right key?
# what if we don't encrypt, but try to decrypt?
statement ok
COPY (SELECT 42 i) to '__TEST_DIR__/unencrypted.parquet'
statement error
SELECT * FROM read_parquet('__TEST_DIR__/unencrypted.parquet', encryption_config={footer_key: 'key256'})
----
Invalid Input Error
# Use Base64 encoded key
statement ok
PRAGMA add_parquet_key('key256base64', 'MDEyMzQ1Njc4OTExMjM0NTAxMjM0NTY3ODkxMTIzNDU=')
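# this Base64 string decodes to '01234567891123450123456789112345', i.e. the same
# 256-bit key as 'key256', which is why it can decrypt the file written above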
query I
SELECT * FROM read_parquet('__TEST_DIR__/encrypted256.parquet', encryption_config={footer_key: 'key256base64'})
----
42

View File

@@ -0,0 +1,92 @@
# name: test/sql/copy/parquet/parquet_encryption_tpch.test_slow
# description: Test Parquet encryption for TPC-H
# group: [parquet]
require parquet
require tpch
statement ok
CALL dbgen(sf=1)
statement ok
PRAGMA add_parquet_key('key128', '0123456789112345')
statement ok
EXPORT DATABASE '__TEST_DIR__/tpch_encrypted' (FORMAT 'parquet', ENCRYPTION_CONFIG {footer_key: 'key128'})
load :memory:
# re-add key upon loading the DB again
statement ok
PRAGMA add_parquet_key('key128', '0123456789112345')
statement ok
IMPORT DATABASE '__TEST_DIR__/tpch_encrypted'
loop i 1 9
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q0${i}.csv
endloop
loop i 10 23
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q${i}.csv
endloop
# now again without importing the DB, just with views, so we can test projection/filter pushdown
load :memory:
# re-add key upon loading the DB again
statement ok
PRAGMA add_parquet_key('key128', '0123456789112345')
statement ok
CREATE VIEW lineitem AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/lineitem.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW orders AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/orders.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW partsupp AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/partsupp.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW part AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/part.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW customer AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/customer.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW supplier AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/supplier.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW nation AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/nation.parquet', encryption_config={footer_key: 'key128'});
statement ok
CREATE VIEW region AS SELECT * FROM read_parquet('__TEST_DIR__/tpch_encrypted/region.parquet', encryption_config={footer_key: 'key128'});
loop i 1 9
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q0${i}.csv
endloop
loop i 10 23
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q${i}.csv
endloop

View File

@@ -0,0 +1,13 @@
# name: test/sql/copy/parquet/parquet_enum_test.test
# description: Test parquet file with enum content
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
query IIIIIIIIIIIIIIIIIIIIII
SELECT * FROM parquet_scan('data/parquet-testing/adam_genotypes.parquet')
----
{'referenceName': NULL, 'start': NULL, 'end': NULL, 'names': [name], 'splitFromMultiAllelic': false, 'referenceAllele': NULL, 'alternateAllele': NULL, 'quality': NULL, 'filtersApplied': NULL, 'filtersPassed': NULL, 'filtersFailed': [], 'annotation': NULL} NULL NULL NULL NULL NULL NULL NULL [] NULL NULL NULL NULL NULL NULL [] [] [] false false NULL NULL

View File

@@ -0,0 +1,34 @@
# name: test/sql/copy/parquet/parquet_expression_filter.test
# description: Test expression filters on Parquet
# group: [parquet]
require parquet
statement ok
CREATE TABLE tbl AS
SELECT i, 'thisisalongstring'||(i%5000)::VARCHAR AS str
FROM range(100000) t(i);
statement ok
COPY tbl TO '__TEST_DIR__/parquet_expr.parquet'
statement ok
CREATE VIEW parq AS FROM '__TEST_DIR__/parquet_expr.parquet'
query I
SELECT COUNT(*) FROM parq
WHERE least(str, 'thisisalongstring50') = str
----
88940
query I
SELECT COUNT(*) FROM parq
WHERE least(str, 'thisisalongstring50') = str AND str >= 'this'
----
88940
query I
SELECT COUNT(*) FROM parq
WHERE least(str, 'thisisalongstring50') = str AND str >= 'thisisalongstring2000' AND str <= 'thisisalongstring4000'
----
44460
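# to check that such an expression filter is actually pushed into the scan, one
# could inspect the plan; a sketch (the regex has not been verified against a run):
#
#query II
#EXPLAIN SELECT COUNT(*) FROM parq WHERE least(str, 'thisisalongstring50') = str
#----
#physical_plan <REGEX>:.*PARQUET_SCAN.*Filters:.*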

View File

@@ -0,0 +1,58 @@
# name: test/sql/copy/parquet/parquet_external_access.test
# description: Test that enable_external_access blocks Parquet reads
# group: [parquet]
require parquet
# first create a table from the parquet file while external access is still enabled
statement ok
CREATE TABLE lineitem AS SELECT * FROM 'data/parquet-testing/arrow/lineitem-arrow.parquet'
statement ok
SET enable_external_access=false;
# we cannot read parquet files
statement error
SELECT * FROM 'data/parquet-testing/arrow/lineitem-arrow.parquet'
----
<REGEX>:.*Permission Error: Cannot access file.*
# or their metadata
statement error
SELECT * FROM parquet_metadata('data/parquet-testing/arrow/lineitem-arrow.parquet')
----
<REGEX>:.*Permission Error: Cannot access file.*
statement error
SELECT * FROM parquet_schema('data/parquet-testing/arrow/lineitem-arrow.parquet')
----
<REGEX>:.*Permission Error: Cannot access file.*
# also not in a list
statement error
SELECT * FROM parquet_scan(['data/parquet-testing/arrow/lineitem-arrow.parquet', 'data/parquet-testing/arrow/lineitem-arrow.parquet'])
----
<REGEX>:.*Permission Error: Cannot access file.*
# neither can we glob
statement error
SELECT * FROM glob('data/parquet-testing/arrow/lineitem-arrow.parquet')
----
<REGEX>:.*Permission Error: Cannot access file.*
# or copy to/from...
statement error
COPY lineitem FROM 'data/parquet-testing/arrow/lineitem-arrow.parquet'
----
<REGEX>:.*Permission Error: Cannot access file.*
statement error
COPY lineitem TO '__TEST_DIR__/lineitem.parquet'
----
<REGEX>:.*Permission Error: Cannot access file.*
# we also can't just enable external access again
statement error
SET enable_external_access=true;
----
<REGEX>:.*Invalid Input Error: Cannot change.*while database is running.*

View File

@@ -0,0 +1,107 @@
# name: test/sql/copy/parquet/parquet_filename.test
# description: Test the filename option of the parquet reader
# group: [parquet]
require parquet
# Simple glob with filenames; note that we replace \ with / to make the tests pass on Windows
query III
select i, j, replace(filename, '\', '/') from parquet_scan('data/parquet-testing/glob*/t?.parquet', FILENAME=1) order by i;
----
1 a data/parquet-testing/glob/t1.parquet
2 b data/parquet-testing/glob/t2.parquet
3 c data/parquet-testing/glob2/t1.parquet
# Filter on filename col
query III
select i, j, replace(filename, '\', '/') as file from parquet_scan('data/parquet-testing/glob*/t?.parquet', FILENAME=1) where file='data/parquet-testing/glob2/t1.parquet';
----
3 c data/parquet-testing/glob2/t1.parquet
# filter spanning more than one vector's worth of rows
query I
SELECT count(filename) FROM parquet_scan('data/parquet-testing/p2.parquet', FILENAME=1) where id < 1000;
----
479
# filter pushdown on filename
query I
SELECT count(id) FROM parquet_scan('data/parquet-testing/p2.parquet', FILENAME=1) where filename >= 'data';
----
4979
# Filter on non-filename col
query I
select replace(filename, '\', '/') from parquet_scan('data/parquet-testing/glob*/t?.parquet', FILENAME=1) where i=2;
----
data/parquet-testing/glob/t2.parquet
statement ok
CREATE TABLE test_csv AS SELECT 1 as id, 'test_csv_content' as filename;
statement ok
COPY test_csv TO '__TEST_DIR__/filename_as_column.csv' WITH HEADER;
# This currently fails with a binder error
statement error
SELECT id, filename FROM read_csv_auto('__TEST_DIR__/filename_as_column.csv', FILENAME=1);
----
# Parquet 'filename' column-name conflict
statement ok
CREATE TABLE test AS SELECT 1 as id, 'test' as filename;
statement ok
COPY test TO '__TEST_DIR__/filename_as_column.parquet';
# we currently don't support filename as a column name when using the filename option
statement error
SELECT * FROM parquet_scan('__TEST_DIR__/filename_as_column.parquet', FILENAME=1);
----
# Now also test copy
statement ok
CREATE TABLE test_copy (i INT, j VARCHAR, filename VARCHAR);
statement ok
INSERT INTO test_copy FROM read_parquet('data/parquet-testing/glob/t1.parquet', filename=1, binary_as_string=1);
query III
SELECT i, j, replace(filename, '\', '/') FROM test_copy
----
1 a data/parquet-testing/glob/t1.parquet
statement ok
INSERT INTO test_copy FROM read_parquet('data/parquet-testing/glob/t1.parquet', filename=1);
query III
SELECT i, j, replace(filename, '\', '/') FROM test_copy
----
1 a data/parquet-testing/glob/t1.parquet
1 a data/parquet-testing/glob/t1.parquet
statement error
COPY test_copy FROM 'data/parquet-testing/glob/t1.parquet';
----
column count mismatch
# Multiple row groups in same file
statement ok
CREATE TABLE test_table_large AS SELECT * FROM range(0,10000) tbl(i);
statement ok
COPY test_table_large TO '__TEST_DIR__/test_table_large.parquet' (ROW_GROUP_SIZE 1000);
query II
SELECT sum(i), max(regexp_replace(filename, '^.*/', '')) FROM parquet_scan('__TEST_DIR__/test_table_large.parquet', FILENAME=1) where i>5000;
----
37492500 test_table_large.parquet
# Same file twice
query III
SELECT i, j, replace(filename, '\', '/') as file FROM parquet_scan(['data/parquet-testing/glob/t1.parquet', 'data/parquet-testing/glob/t1.parquet', 'data/parquet-testing/glob/t2.parquet'], FILENAME=1) where file like '%t1%'
----
1 a data/parquet-testing/glob/t1.parquet
1 a data/parquet-testing/glob/t1.parquet

View File

@@ -0,0 +1,102 @@
# name: test/sql/copy/parquet/parquet_filename_filter.test
# description: Test the filename filter pushdown
# group: [parquet]
require parquet
query III
select id, value as f, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where filename='value1';
----
# requires notwindows because Windows-style paths use backslashes
require notwindows
query III
select i, j, filename from parquet_scan('data/parquet-testing/glob*/t?.parquet', FILENAME=1) order by i;
----
1 a data/parquet-testing/glob/t1.parquet
2 b data/parquet-testing/glob/t2.parquet
3 c data/parquet-testing/glob2/t1.parquet
query III
select i, j, filename as file from parquet_scan('data/parquet-testing/glob*/t?.parquet', FILENAME=1) where file='data/parquet-testing/glob2/t1.parquet' or file='data/parquet-testing/glob/t2.parquet' order by i;
----
2 b data/parquet-testing/glob/t2.parquet
3 c data/parquet-testing/glob2/t1.parquet
query III
select i, j, filename as file from parquet_scan('data/parquet-testing/glob*/t?.parquet', FILENAME=1) where file='data/parquet-testing/glob2/t1.parquet' and i=3 order by i;
----
3 c data/parquet-testing/glob2/t1.parquet
query III
select i, j, filename as file from parquet_scan('data/parquet-testing/glob*/t?.parquet', FILENAME=1) where file='data/parquet-testing/glob2/t1.parquet' and i=2 order by i;
----
# This query should trigger the file skipping mechanism, which prevents reading metadata for files that are not scanned
query IIII
select id, value, date, filename from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, FILENAME=1) order by id;
----
1 value1 2012-01-01 data/parquet-testing/hive-partitioning/different_order/date=2012-01-01/part=a/test.parquet
2 value2 2013-01-01 data/parquet-testing/hive-partitioning/different_order/part=b/date=2013-01-01/test.parquet
# These queries test that the file skipping mechanism works even for complex filters on multiple filename-based filters
query IIII
select id, value, date, filename from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, FILENAME=1) where concat(date,filename)='2013-01-01data/parquet-testing/hive-partitioning/different_order/part=b/date=2013-01-01/test.parquet';
----
2 value2 2013-01-01 data/parquet-testing/hive-partitioning/different_order/part=b/date=2013-01-01/test.parquet
query IIII
select id, value, date, filename from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, FILENAME=1) where concat(date,filename)='2012-01-01data/parquet-testing/hive-partitioning/different_order/date=2012-01-01/part=a/test.parquet';
----
1 value1 2012-01-01 data/parquet-testing/hive-partitioning/different_order/date=2012-01-01/part=a/test.parquet
# Ensure we don't somehow end up mixing things up
query III
select id, value as f, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where f='value2';
----
2 value2 2013-01-01
query III
select id, value as f, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where f='value1';
----
1 value1 2012-01-01
query III
select id, value as f, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where filename='value1';
----
# These tests confirm that the ParquetScanStats will properly handle the pruned files list
statement ok
SET parquet_metadata_cache=true;
query II
select id, value from parquet_scan('data/parquet-testing/hive-partitioning/*/*/*/test.parquet', FILENAME=1) where filename like '%mismatching_count%' and id > 1;
----
2 value2
query II
select id, value from parquet_scan('data/parquet-testing/hive-partitioning/*/*/*/test.parquet', FILENAME=1) where filename like '%mismatching_count%' and id > 1;
----
2 value2
query II
select id, value from parquet_scan('data/parquet-testing/hive-partitioning/*/*/*/test.parquet', FILENAME=1) where filename like '%mismatching_count%' and value = 'value1';
----
1 value1
query II
select id, value from parquet_scan('data/parquet-testing/hive-partitioning/*/*/*/test.parquet', FILENAME=1) where filename like '%mismatching_count%' and value = 'value2';
----
2 value2
query II
select id, value from parquet_scan('data/parquet-testing/hive-partitioning/*/*/*/test.parquet', FILENAME=1) where filename like '%simple%' and value = 'value1';
----
1 value1
query II
select id, value from parquet_scan('data/parquet-testing/hive-partitioning/*/*/*/test.parquet', FILENAME=1) where filename like '%simple%' and value = 'value2';
----
2 value2

View File

@@ -0,0 +1,42 @@
# name: test/sql/copy/parquet/parquet_filter_bug1391.test
# description: Test basic parquet reading
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
statement ok
CREATE VIEW tbl AS SELECT * FROM PARQUET_SCAN('data/parquet-testing/filter_bug1391.parquet');
query I
SELECT ORGUNITID FROM tbl LIMIT 10
----
98
13
175
200
262
206
204
131
181
269
query I
SELECT COUNT(*) FROM tbl;
----
9789
query I
SELECT COUNT(*) FROM tbl
WHERE Namevalidfrom <= '2017-03-01'
AND Namevalidto >= '2017-03-01'
AND Parentnamevalidfrom <= '2017-03-01'
AND Parentnamevalidto >= '2017-03-01'
AND CustomerCode = 'CODE';
----
8722

View File

@@ -0,0 +1,28 @@
# name: test/sql/copy/parquet/parquet_fixed_length_blob_dict.test
# description: Parquet file with dictionary of fixed length byte arrays
# group: [parquet]
require parquet
query IIIIIIIIIIII
SELECT
MIN(sfc_key), MAX(sfc_key),
MIN(gps_time), MAX(gps_time),
MIN(intensity), MAX(intensity),
MIN(classification), MAX(classification),
MIN(return_number), MAX(return_number),
MIN(number_of_returns), MAX(number_of_returns)
FROM parquet_scan('data/parquet-testing/sorted.zstd_18_131072_small.parquet')
----
\x00\xA0e\xFB\xF8|\xF0\xA8_t\x16\x9A
\x03,\xDF$)\xF5\x13\x11\x9B\x11k\x10
205949378.92443183
205949634.3036811
4
1035
1
9
1
5
1
7

View File

@@ -0,0 +1,100 @@
# name: test/sql/copy/parquet/parquet_glob.test
# description: Test basic globbing of parquet files
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
query II
select * from parquet_scan('data/parquet-testing/glob*/t?.parquet') order by i
----
1 a
2 b
3 c
query II
select * from parquet_scan('data/parquet-testing/glob/t[0-9].parquet') order by i
----
1 a
2 b
query II
select * from parquet_scan('data/parquet-testing/glob/*') order by i
----
1 a
2 b
query II
select * from parquet_scan('data/parquet-testing/glob/*.parquet') order by i
----
1 a
2 b
query II
select * from parquet_scan('data/parquet-testing/g*/*.parquet') order by i
----
1 a
2 b
3 c
query II
select * from parquet_scan('data/parquet-testing/g*/t1.parquet') order by i
----
1 a
3 c
# abs path
query II
select * from parquet_scan('__WORKING_DIRECTORY__/data/parquet-testing/g*/t1.parquet') order by i
----
1 a
3 c
# backslashes
query II
select * from parquet_scan('data\parquet-testing\g*\t1.parquet') order by i
----
1 a
3 c
# Double partial matches
query II rowsort
FROM parquet_scan('data/parquet-testing/glob3/*/dir/*.parquet');
----
1 a
3 c
statement error
select count(*) from parquet_scan('')
----
<REGEX>:.*IO Error.*can only be set for.*
# schema mismatch in parquet glob
statement error
select * from parquet_scan('data/parquet-testing/*.parquet')
----
<REGEX>:.*Invalid Input Error: Failed to read file.*
# parquet glob with COPY FROM
statement ok
CREATE TABLE vals (i INTEGER, j BLOB)
statement ok
COPY vals FROM 'data/parquet-testing/glob/t?.parquet' (FORMAT PARQUET);
query II
SELECT * FROM vals ORDER BY 1
----
1 a
2 b
# failed to copy: the glob pattern matches no files
statement ok
CREATE TABLE vals2 (i INTEGER, j INTEGER)
statement error
COPY vals2 FROM '*/sql/*/parquet/*/glob/t?.parquet' (FORMAT PARQUET);
----
<REGEX>:.*IO Error: No files found that match the pattern.*

View File

@@ -0,0 +1,17 @@
# name: test/sql/copy/parquet/parquet_go.test
# description: Issue #5744: Fail to import .parquet file created with parquet-go
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
query II
SELECT * FROM 'data/parquet-testing/parquet_go.parquet'
----
John Hello World
John Hello World
John Hello World
John Hello World
John Hello World

View File

@@ -0,0 +1,196 @@
# name: test/sql/copy/parquet/parquet_hive.test
# description: Test the automatic parsing of the hive partitioning scheme
# group: [parquet]
require parquet
# test parsing hive partitioning scheme
query IIII
select id, value, part, date from parquet_scan('data/parquet-testing/hive-partitioning/simple/*/*/test.parquet', HIVE_PARTITIONING=1) order by id
----
1 value1 a 2012-01-01
2 value2 b 2013-01-01
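# the partition values come from key=value directory levels, e.g. a file stored at
# .../simple/part=a/date=2012-01-01/test.parquet yields part='a' and date='2012-01-01'
# (the exact level order in 'simple' is assumed here; 'different_order' below mixes both orders)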
# As long as the names match, we don't really mind since everything is a string anyway
query IIII
select id, value, part, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) order by id
----
1 value1 a 2012-01-01
2 value2 b 2013-01-01
# Filter should work too
query II
select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2013-01-01';
----
2 2013-01-01
query II
select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2012-01-01';
----
1 2012-01-01
query II
select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2018-01-01';
----
query IIII
select id, value, part, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where part='a' OR part='b' order by id;
----
1 value1 a 2012-01-01
2 value2 b 2013-01-01
query II
select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2013-01-01' and id = 2;
----
2 2013-01-01
query II
select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2013-01-01' and id = 1;
----
# This query should trigger the file skipping mechanism, which prevents reading metadata for files that are not scanned
query III
select id, value, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2012-01-01' and id = 1;
----
1 value1 2012-01-01
query III
select id, value, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2012-01-01' or id <= 2 order by id;
----
1 value1 2012-01-01
2 value2 2013-01-01
# If the key names don't add up, there's nothing we can do
statement error
select * from parquet_scan('data/parquet-testing/hive-partitioning/mismatching_names/*/*/test.parquet', HIVE_PARTITIONING=1)
----
Hive partition mismatch
statement error
select * from parquet_scan('data/parquet-testing/hive-partitioning/mismatching_count/*/*/test.parquet', HIVE_PARTITIONING=1) WHERE part=b
----
Hive partition mismatch
statement error
select * from parquet_scan('data/parquet-testing/hive-partitioning/mismatching_names/*/*/test.parquet', HIVE_PARTITIONING=1, UNION_BY_NAME=1)
----
Hive partition mismatch
# Verify that filters are pushed down into the parquet scan (only files that match the filter are read)
query II
EXPLAIN select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where date = '2013-01-01';
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(date = '2013.*-01.*-01'\).*
query II
EXPLAIN select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where date = '2018-01-01';
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(date = '2018.*-01.*-01'\).*
# No Parquet Scan Filters should be applied here
query II
EXPLAIN select id, value, part, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where part='a' OR part='b' order by id;
----
physical_plan <!REGEX>:.*PARQUET_SCAN.*File Filters:.*
query II
EXPLAIN select id, date from parquet_scan('data/parquet-testing/hive-partitioning/simple/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where date = '2012-01-01' and id < 10;
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(date = '2012.*-01.*-01'\).*
query II
EXPLAIN select id, date from parquet_scan('data/parquet-testing/hive-partitioning/simple/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where date = '2013-01-01' and id < 10;
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(date = '2013.*-01.*-01'\).*
# Complex filter filtering first file
query IIII
select id, value, part, CAST(date AS DATE) as date_cast from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where concat(date_cast::VARCHAR, part) == '2013-01-01b';
----
2 value2 b 2013-01-01
# Complex filter filtering first file, filter should be pruned completely
query II
explain select id, value, part, CAST(date AS DATE) as date_cast from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where concat(date_cast::VARCHAR, part) == '2013-01-01b';
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(concat\(CAST.*\(CAST.*\(date AS.*DATE\) AS VARCHAR\), part\).*= '2013-01-01b'\).*
# Complex filter filtering second file
query IIII
select id, value, part, CAST(date AS DATE) as date_cast from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where concat(date_cast::VARCHAR, part) == '2012-01-01a';
----
1 value1 a 2012-01-01
# Complex filter filtering second file, filter should be pruned completely
query II
explain select id, value, part, CAST(date AS DATE) as date_cast from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where concat(date_cast::VARCHAR, part) == '2012-01-01a';
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(concat\(CAST.*\(CAST.*\(date AS.*DATE\) AS VARCHAR\), part\).*= '2012-01-01a'\).*
# Currently, complex filters combining hive columns and regular columns can prevent filter pushdown in some situations
# TODO: we want to support filter pushdown here too
query II
explain select id, value, part, CAST(date AS DATE) as date_cast from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where (date_cast=CAST('2013-01-01' as DATE) AND (value='value1' OR concat(date_cast::VARCHAR, part) == '2013-01-01b'));
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(CAST\(date AS DATE\) =.*'2013.*-01-01'::DATE\).*
# Idem
query II
explain select id, value, part, CAST(date AS DATE) as date_cast from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where (date_cast=CAST('2012-01-01' as DATE) AND (value='value2' OR concat(date_cast::VARCHAR, part) == '2012-01-01a'));
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(CAST\(date AS DATE\) =.*'2012.*-01-01'::DATE\).*
# Confirm that hive partitions override existing columns
# Without hive partitioning we just read the files; note the mismatch between the hive partition in the path and the column in the file
query III
SELECT a, b, replace(filename, '\', '/') filename FROM parquet_scan('data/parquet-testing/hive-partitioning/hive_col_also_in_file/*/test.parquet', HIVE_PARTITIONING=0, FILENAME=1) order by filename;
----
1 2 data/parquet-testing/hive-partitioning/hive_col_also_in_file/a=5/test.parquet
3 4 data/parquet-testing/hive-partitioning/hive_col_also_in_file/a=6/test.parquet
# Hive col from path overrides col in file
query III
SELECT a, b, replace(filename, '\', '/') filename FROM parquet_scan('data/parquet-testing/hive-partitioning/hive_col_also_in_file/*/test.parquet', HIVE_PARTITIONING=1, FILENAME=1) order by filename;
----
5 2 data/parquet-testing/hive-partitioning/hive_col_also_in_file/a=5/test.parquet
6 4 data/parquet-testing/hive-partitioning/hive_col_also_in_file/a=6/test.parquet
# Test handling missing files
query IIII
select id, value, part, date
from parquet_scan('data/parquet-testing/hive-partitioning/missing/*/*/test.parquet', HIVE_PARTITIONING=1)
order by id
----
3 value3 c 2014-01-01
4 value4 d 2015-01-01
# check cases where there are file filters AND table filters
statement ok
Create table t1 (a int, b int, c int);
foreach i 0 1 2 3 4 5 6 7 8 9
statement ok
insert into t1 (select range, ${i}*10, ${i}*100 from range(0,10));
endloop
statement ok
COPY (SELECT * FROM t1) TO '__TEST_DIR__/hive_filters' (FORMAT PARQUET, PARTITION_BY c);
statement ok
COPY (SELECT * FROM t1) TO '__TEST_DIR__/hive_filters_2' (FORMAT PARQUET, PARTITION_BY (c, b));
# There should be Table Filters (id < 50) and regular filters
query II
EXPLAIN select a from parquet_scan('__TEST_DIR__/hive_filters/*/*.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where c::INT=500 and a::INT < 4;
----
physical_plan <REGEX>:.*PARQUET_SCAN.*Filters:.*a<4.*File Filters:.*\(CAST\(c AS.*INTEGER\) = 500\).*
# unsatisfiable file filters also show up
query II
EXPLAIN select a from parquet_scan('__TEST_DIR__/hive_filters_2/*/*/*.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where c::INT > 500 and c::INT < 500;
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(CAST\(c AS.*INTEGER\).*BETWEEN.*500 AND 500\).*

View File

@@ -0,0 +1,46 @@
# name: test/sql/copy/parquet/parquet_hive2.test
# description: Test generating hive partitioning scheme
# group: [parquet]
require parquet
# See https://github.com/duckdb/duckdb/pull/9473#issuecomment-1786231577
statement ok
create or replace table orders(m int,v int,j int);
statement ok
insert into orders select i%12+1,i,j from range(360)t(i),range(1000)s(j);
statement ok
copy (select 2000+(v//12)y,m,v,j from orders) TO '__TEST_DIR__/orders_m' (FORMAT PARQUET, PARTITION_BY (m));
query IIII
SELECT AVG(y), AVG(m), AVG(v), AVG(j) FROM '__TEST_DIR__/orders_m/**/*.parquet'
----
2014.5 6.5 179.5 499.5
statement ok
copy (select 2000+(v//12)y,m,v,j from orders) TO '__TEST_DIR__/orders_y' (FORMAT PARQUET, PARTITION_BY (y));
query IIII
SELECT AVG(y), AVG(m), AVG(v), AVG(j) FROM '__TEST_DIR__/orders_y/**/*.parquet'
----
2014.5 6.5 179.5 499.5
statement ok
copy (select 2000+(v//12)y,m,v,j from orders) TO '__TEST_DIR__/orders_ym' (FORMAT PARQUET,PARTITION_BY (y,m));
query IIII
SELECT AVG(y), AVG(m), AVG(v), AVG(j) FROM '__TEST_DIR__/orders_ym/**/*.parquet'
----
2014.5 6.5 179.5 499.5
# random shuffle
statement ok
copy (select 2000+(v//12)y,m,v,j from orders order by random()) TO '__TEST_DIR__/orders_ym_rand' (FORMAT PARQUET,PARTITION_BY (y,m));
query IIII
SELECT AVG(y), AVG(m), AVG(v), AVG(j) FROM '__TEST_DIR__/orders_ym_rand/**/*.parquet'
----
2014.5 6.5 179.5 499.5

View File

@@ -0,0 +1,37 @@
# name: test/sql/copy/parquet/parquet_hive_empty.test
# description: Test empty partitioning values
# group: [parquet]
require parquet
query II
select *
from parquet_scan('data/parquet-testing/hive-partitioning/empty_string/*/*.parquet')
ORDER BY ALL
----
a a
b (empty)
c NULL
# filter on hive partitioning with NULL values
query II
select *
from parquet_scan('data/parquet-testing/hive-partitioning/empty_string/*/*.parquet')
WHERE key IS NULL
----
c NULL
query II
select *
from parquet_scan('data/parquet-testing/hive-partitioning/empty_string/*/*.parquet')
WHERE key='a'
----
a a
query II
select *
from parquet_scan('data/parquet-testing/hive-partitioning/empty_string/*/*.parquet')
WHERE key=''
----
b (empty)

View File

@@ -0,0 +1,51 @@
# name: test/sql/copy/parquet/parquet_hive_null.test
# description: Test NULL partitioning values
# group: [parquet]
require parquet
statement ok
create table test as select i%5 as a, i%2 as b from range(0,10) tbl(i);
statement ok
copy (FROM test UNION ALL select 'NULL' as a, 'NULL' as b) to '__TEST_DIR__/null-parquet' (PARTITION_BY (a,b), FORMAT 'parquet', WRITE_PARTITION_COLUMNS);
query II
select *
from parquet_scan('__TEST_DIR__/null-parquet/**/*.parquet', hive_partitioning=1, hive_types={'a': INT})
ORDER BY ALL
----
0 0
0 1
1 0
1 1
2 0
2 1
3 0
3 1
4 0
4 1
NULL NULL
statement ok
create table test2 as select i%5 as a, i%2 as b, i as c from range(0,10) tbl(i);
statement ok
copy (FROM test2 UNION ALL select 'NULL' as a, 'NULL' as b, 'NULL' as c) to '__TEST_DIR__/null-parquet' (PARTITION_BY (a,b), FORMAT 'parquet', OVERWRITE);
query III
select *
from parquet_scan('__TEST_DIR__/null-parquet/**/*.parquet', hive_partitioning=1, hive_types={'a': INT})
ORDER BY ALL
----
0 0 0
1 1 1
2 2 0
3 3 1
4 4 0
5 0 1
6 1 0
7 2 1
8 3 0
9 4 1
NULL NULL NULL
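# illustrative sketch (commented out, untested): with hive_types={'a': INT} the
# literal 'NULL' directory value should read back as a SQL NULL
#query III
#select *
#from parquet_scan('__TEST_DIR__/null-parquet/**/*.parquet', hive_partitioning=1, hive_types={'a': INT})
#WHERE a IS NULL
#----
#NULL NULL NULL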

View File

@@ -0,0 +1,139 @@
# name: test/sql/copy/parquet/parquet_late_materialization.test
# description: Test Top N Optimization for Parquet
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
statement ok
COPY (SELECT i, i + 1 AS j, i + 2 AS k, -i AS l FROM range(10) t(i)) TO '__TEST_DIR__/late_materialization.parquet';
statement ok
CREATE VIEW test AS FROM '__TEST_DIR__/late_materialization.parquet';
statement ok
SET explain_output='optimized_only'
# Top N optimization
# this gets turned into a row-id join
query II
explain SELECT * FROM test ORDER BY j DESC LIMIT 2;
----
logical_opt <REGEX>:.*COMPARISON_JOIN.*
query IIII
SELECT * FROM test ORDER BY j DESC LIMIT 2;
----
9 10 11 -9
8 9 10 -8
query II
explain SELECT * FROM test ORDER BY j, i LIMIT 2;
----
logical_opt <REGEX>:.*COMPARISON_JOIN.*
query IIII
SELECT * FROM test ORDER BY j, i LIMIT 2;
----
0 1 2 0
1 2 3 -1
# this one does not: only i is read, so a row-id join cannot remove any columns
query II
explain SELECT i FROM test ORDER BY i LIMIT 2;
----
logical_opt <!REGEX>:.*COMPARISON_JOIN.*
# we cannot do this with volatile expressions
query II
explain SELECT * FROM (SELECT i + random() AS i, j, k, l FROM test) ORDER BY i LIMIT 2;
----
logical_opt <!REGEX>:.*COMPARISON_JOIN.*
# top-n with expressions
query IIII
SELECT * FROM (SELECT -i i, -j j, -k k, -l l FROM test) ORDER BY -j DESC LIMIT 2
----
-9 -10 -11 9
-8 -9 -10 8
# multiple layers
query IIII
SELECT * FROM (SELECT 100 + i i, 1000 + j j, 10000 + k k, 100000 + l l FROM (SELECT -i i, -j j, -k k, -l l FROM test)) ORDER BY j DESC LIMIT 2
----
100 999 9998 100000
99 998 9997 100001
# limit + offset
query II
explain SELECT * FROM test LIMIT 2 OFFSET 2;
----
logical_opt <REGEX>:.*COMPARISON_JOIN.*
query IIII
SELECT * FROM test LIMIT 2 OFFSET 2;
----
2 3 4 -2
3 4 5 -3
# sample
query II
explain SELECT * FROM test USING SAMPLE 2 ROWS
----
logical_opt <REGEX>:.*COMPARISON_JOIN.*
# we can only use row-id joins when sampling a fixed number of rows, not a percentage
query II
explain SELECT * FROM test USING SAMPLE 10%
----
logical_opt <!REGEX>:.*COMPARISON_JOIN.*
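# illustrative sketch (commented out, untested): row-based sampling should
# still return exactly two rows, so only the count is stable across runs
#query I
#SELECT COUNT(*) FROM (SELECT * FROM test USING SAMPLE 2 ROWS)
#----
#2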
# order on expression
query IIII
SELECT * FROM test ORDER BY -j DESC LIMIT 2
----
0 1 2 0
1 2 3 -1
# projection in subquery
query IIII
SELECT * FROM (SELECT -i i, -j j, -k k, -l l FROM test) ORDER BY -j DESC LIMIT 2
----
-9 -10 -11 9
-8 -9 -10 8
# filter after top-n
query IIII
SELECT * FROM (
SELECT * FROM test ORDER BY j DESC LIMIT 2
) WHERE i=8
----
8 9 10 -8
query I
SELECT l FROM (
SELECT * FROM test ORDER BY j DESC LIMIT 2
) WHERE k=10
----
-8
# now with varchar columns
statement ok
COPY (SELECT i, printf('%02d', i + 1) AS j, printf('%02d', i + 2) AS k, -i AS l FROM range(10) t(i)) TO '__TEST_DIR__/late_materialization_varchar.parquet';
statement ok
CREATE OR REPLACE VIEW test AS FROM '__TEST_DIR__/late_materialization_varchar.parquet';
query IIII
SELECT * FROM test ORDER BY j DESC LIMIT 2;
----
9 10 11 -9
8 09 10 -8
query IIII
SELECT j, k, l, i FROM test WHERE i > 5 ORDER BY j DESC LIMIT 2;
----
10 11 -9 9
09 10 -8 8
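# illustrative sketch (commented out, untested): the %02d padding keeps
# lexicographic and numeric order aligned, so the ascending Top-N matches i = 0, 1
#query IIII
#SELECT * FROM test ORDER BY j LIMIT 2;
#----
#0 01 02 0
#1 02 03 -1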

View File

@@ -0,0 +1,55 @@
# name: test/sql/copy/parquet/parquet_list.test
# description: Test list syntax for reading multiple files
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
# standard list syntax
query I
select count(*) from parquet_scan(['data/parquet-testing/glob/t1.parquet', 'data/parquet-testing/glob/t2.parquet']);
----
2
# glob inside a list
query I
select count(*) from parquet_scan(['data/parquet-testing/glob/*.parquet', 'data/parquet-testing/glob/t2.parquet']);
----
3
# read the same file multiple times
query I
select count(*) from parquet_scan(['data/parquet-testing/glob/t1.parquet', 'data/parquet-testing/glob/t1.parquet', 'data/parquet-testing/glob/t1.parquet', 'data/parquet-testing/glob/t1.parquet', 'data/parquet-testing/glob/t1.parquet']);
----
5
# file does not exist
statement error
select count(*) from parquet_scan(['data/parquet-testing/glob/t1.parquet', 'data/parquet-testing/glob/t2.parquet', 'this/file/doesnot/exist/hopefully.parquet']);
----
# empty list
statement error
select count(*) from parquet_scan([]::varchar[]);
----
at least one file
# null inside a list
statement error
select count(*) from parquet_scan([NULL]);
----
NULL
# null list
statement error
select count(*) from parquet_scan(NULL::VARCHAR[]);
----
NULL
# null varchar
statement error
select count(*) from parquet_scan(NULL::VARCHAR);
----
NULL

Some files were not shown because too many files have changed in this diff.