# name: test/sql/copy/parquet/parquet_hive.test
# description: Test the automatic parsing of the hive partitioning scheme
# group: [parquet]

require parquet

# test parsing hive partitioning scheme
query IIII
select id, value, part, date from parquet_scan('data/parquet-testing/hive-partitioning/simple/*/*/test.parquet', HIVE_PARTITIONING=1) order by id
----
1 value1 a 2012-01-01
2 value2 b 2013-01-01
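
# The 'part' and 'date' values above are parsed from the key=value directory names in the path,
# which is what the hive partitioning scheme encodes.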

# As long as the names match, we don't really mind since everything is a string anyway
query IIII
select id, value, part, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) order by id
----
1 value1 a 2012-01-01
2 value2 b 2013-01-01

# Filters should work too
query II
select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2013-01-01';
----
2 2013-01-01

query II
select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2012-01-01';
----
1 2012-01-01

query II
select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2018-01-01';
----

query IIII
select id, value, part, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where part='a' OR part='b' order by id;
----
1 value1 a 2012-01-01
2 value2 b 2013-01-01

query II
select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2013-01-01' and id = 2;
----
2 2013-01-01

query II
select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2013-01-01' and id = 1;
----

# This query should trigger the file skipping mechanism, which prevents reading metadata for files that are not scanned
query III
select id, value, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2012-01-01' and id = 1;
----
1 value1 2012-01-01

query III
select id, value, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where date = '2012-01-01' or id <= 2 order by id;
----
1 value1 2012-01-01
2 value2 2013-01-01

# If the partition key names don't match, there's nothing we can do
statement error
select * from parquet_scan('data/parquet-testing/hive-partitioning/mismatching_names/*/*/test.parquet', HIVE_PARTITIONING=1)
----
Hive partition mismatch

statement error
select * from parquet_scan('data/parquet-testing/hive-partitioning/mismatching_count/*/*/test.parquet', HIVE_PARTITIONING=1) WHERE part=b
----
Hive partition mismatch

statement error
select * from parquet_scan('data/parquet-testing/hive-partitioning/mismatching_names/*/*/test.parquet', HIVE_PARTITIONING=1, UNION_BY_NAME=1)
----
Hive partition mismatch

# Verify that filters are pushed down into the parquet scan (only files matching the filter are read)
query II
EXPLAIN select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where date = '2013-01-01';
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(date = '2013.*-01.*-01'\).*

query II
EXPLAIN select id, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where date = '2018-01-01';
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(date = '2018.*-01.*-01'\).*
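
# Note: the "File Filters" shown in the plan are checked per file against the hive partition values
# (the file skipping mechanism above), while regular "Filters" apply to rows of the files that are read.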

# No Parquet scan file filters should be applied here (the predicate matches every file)
query II
EXPLAIN select id, value, part, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where part='a' OR part='b' order by id;
----
physical_plan <!REGEX>:.*PARQUET_SCAN.*File Filters:.*

query II
EXPLAIN select id, date from parquet_scan('data/parquet-testing/hive-partitioning/simple/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where date = '2012-01-01' and id < 10;
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(date = '2012.*-01.*-01'\).*

query II
EXPLAIN select id, date from parquet_scan('data/parquet-testing/hive-partitioning/simple/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where date = '2013-01-01' and id < 10;
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(date = '2013.*-01.*-01'\).*

# Complex filter that filters out the first file
query IIII
select id, value, part, CAST(date AS DATE) as date_cast from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where concat(date_cast::VARCHAR, part) == '2013-01-01b';
----
2 value2 b 2013-01-01

# Complex filter that filters out the first file; the filter should be pruned completely and applied as a file filter
query II
explain select id, value, part, CAST(date AS DATE) as date_cast from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where concat(date_cast::VARCHAR, part) == '2013-01-01b';
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(concat\(CAST.*\(CAST.*\(date AS.*DATE\) AS VARCHAR\), part\).*= '2013-01-01b'\).*

# Complex filter that filters out the second file
query IIII
select id, value, part, CAST(date AS DATE) as date_cast from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where concat(date_cast::VARCHAR, part) == '2012-01-01a';
----
1 value1 a 2012-01-01

# Complex filter that filters out the second file; the filter should be pruned completely and applied as a file filter
query II
explain select id, value, part, CAST(date AS DATE) as date_cast from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where concat(date_cast::VARCHAR, part) == '2012-01-01a';
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(concat\(CAST.*\(CAST.*\(date AS.*DATE\) AS VARCHAR\), part\).*= '2012-01-01a'\).*

# Currently, complex filters combining hive columns and regular columns can prevent filter pushdown in some situations
# TODO: we want to support filter pushdown here too
query II
explain select id, value, part, CAST(date AS DATE) as date_cast from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where (date_cast=CAST('2013-01-01' as DATE) AND (value='value1' OR concat(date_cast::VARCHAR, part) == '2013-01-01b'));
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(CAST\(date AS DATE\) =.*'2013.*-01-01'::DATE\).*

# Same as above, for the other file
query II
explain select id, value, part, CAST(date AS DATE) as date_cast from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where (date_cast=CAST('2012-01-01' as DATE) AND (value='value2' OR concat(date_cast::VARCHAR, part) == '2012-01-01a'));
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(CAST\(date AS DATE\) =.*'2012.*-01-01'::DATE\).*

# Confirm that hive partitions override existing columns

# Without hive partitioning we just read the files; note the mismatch between the hive partition value in the path and the 'a' column stored in the file
query III
SELECT a, b, replace(filename, '\', '/') filename FROM parquet_scan('data/parquet-testing/hive-partitioning/hive_col_also_in_file/*/test.parquet', HIVE_PARTITIONING=0, FILENAME=1) order by filename;
----
1 2 data/parquet-testing/hive-partitioning/hive_col_also_in_file/a=5/test.parquet
3 4 data/parquet-testing/hive-partitioning/hive_col_also_in_file/a=6/test.parquet

# The hive column from the path overrides the column in the file
query III
SELECT a, b, replace(filename, '\', '/') filename FROM parquet_scan('data/parquet-testing/hive-partitioning/hive_col_also_in_file/*/test.parquet', HIVE_PARTITIONING=1, FILENAME=1) order by filename;
----
5 2 data/parquet-testing/hive-partitioning/hive_col_also_in_file/a=5/test.parquet
6 4 data/parquet-testing/hive-partitioning/hive_col_also_in_file/a=6/test.parquet
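
# With HIVE_PARTITIONING=1 the values a=5 and a=6 come from the directory names, overriding the
# values 1 and 3 that are stored in the files themselves.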

# Test handling of missing files
query IIII
select id, value, part, date
from parquet_scan('data/parquet-testing/hive-partitioning/missing/*/*/test.parquet', HIVE_PARTITIONING=1)
order by id
----
3 value3 c 2014-01-01
4 value4 d 2015-01-01

# Check cases where there are file filters AND table filters
statement ok
Create table t1 (a int, b int, c int);

foreach i 0 1 2 3 4 5 6 7 8 9

statement ok
insert into t1 (select range, ${i}*10, ${i}*100 from range(0,10));

endloop
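
# t1 now holds 100 rows: a cycles through 0..9, b takes the values 0,10,...,90 and c the values 0,100,...,900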

statement ok
COPY (SELECT * FROM t1) TO '__TEST_DIR__/hive_filters' (FORMAT PARQUET, PARTITION_BY c);

statement ok
COPY (SELECT * FROM t1) TO '__TEST_DIR__/hive_filters_2' (FORMAT PARQUET, PARTITION_BY (c, b));
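
# PARTITION_BY writes hive-style key=value directories, which is why the globs below read
# '__TEST_DIR__/hive_filters/c=<val>/*.parquet' and '__TEST_DIR__/hive_filters_2/c=<val>/b=<val>/*.parquet'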

# There should be both table filters (a < 4) and file filters (c = 500)
query II
EXPLAIN select a from parquet_scan('__TEST_DIR__/hive_filters/*/*.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where c::INT=500 and a::INT < 4;
----
physical_plan <REGEX>:.*PARQUET_SCAN.*Filters:.*a<4.*File Filters:.*\(CAST\(c AS.*INTEGER\) = 500\).*

# Unsatisfiable file filters also show up in the plan
query II
EXPLAIN select a from parquet_scan('__TEST_DIR__/hive_filters_2/*/*/*.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where c::INT > 500 and c::INT < 500;
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(CAST\(c AS.*INTEGER\).*BETWEEN.*500 AND 500\).*