Files
email-tracker/external/duckdb/test/sql/copy/csv/csv_hive.test
2025-10-24 19:21:19 -05:00

152 lines
7.3 KiB
SQL

# name: test/sql/copy/csv/csv_hive.test
# description: Test the automatic parsing of the hive partitioning scheme
# group: [csv]
statement ok
PRAGMA enable_verification
statement ok
CREATE TABLE test AS SELECT 1 as id, 'value1' as value;
CREATE TABLE test2 AS SELECT 2 as id, 'value2' as value;
# filenames could allow you to parse hive partitions manually using SQL
query III
select id, value, replace(filename, '\', '/') from read_csv_auto('data/csv/hive-partitioning/simple/*/*/test.csv', FILENAME=1) order by id
----
1 value1 data/csv/hive-partitioning/simple/part=a/date=2012-01-01/test.csv
2 value2 data/csv/hive-partitioning/simple/part=b/date=2013-01-01/test.csv
# however this is just a lot nicer
query IIII
select id, value, part, date from read_csv_auto('data/csv/hive-partitioning/simple/*/*/test.csv', HIVE_PARTITIONING=1) order by id
----
1 value1 a 2012-01-01
2 value2 b 2013-01-01
# As long as the names match, we don't really mind since everything is a string anyway
query IIII
select id, value, part, date from read_csv_auto('data/csv/hive-partitioning/different_order/*/*/test.csv', HIVE_PARTITIONING=1) order by id
----
1 value1 a 2012-01-01
2 value2 b 2013-01-01
# If the key names don't add up, we throw
statement error
select * from read_csv_auto('data/csv/hive-partitioning/mismatching_names/*/*/test.csv', HIVE_PARTITIONING=1)
----
Hive partition mismatch
# If the key names don't add up, we throw
statement error
select * from read_csv_auto('data/csv/hive-partitioning/mismatching_count/*/*/test.csv', HIVE_PARTITIONING=1)
----
Hive partition mismatch
# Now we do a bunch of filtering on the partitions, to test the file skipping mechanism
query IIII
select id, value, part, date from read_csv_auto('data/csv/hive-partitioning/different_order/*/*/test.csv', HIVE_PARTITIONING=1) where part='a'
----
1 value1 a 2012-01-01
query IIII
select id, value, part, date from read_csv_auto('data/csv/hive-partitioning/different_order/*/*/test.csv', HIVE_PARTITIONING=1) where part='b'
----
2 value2 b 2013-01-01
query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('data/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where part_cast > 0 and part_cast < 5000;
----
1 value1 1000 2012-01-01
query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('data/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where part_cast > 5000;
----
2 value2 9000 2013-01-01
query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('data/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where date_cast > CAST('2000-01-01' as DATE) and date_cast < CAST('2012-12-12' as DATE);
----
1 value1 1000 2012-01-01
query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('data/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where date_cast > CAST('2000-01-01' as DATE) order by date_cast;
----
1 value1 1000 2012-01-01
2 value2 9000 2013-01-01
query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('data/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where date_cast=CAST('2012-01-01' as DATE) OR part_cast=9000 ORDER BY date_cast;
----
1 value1 1000 2012-01-01
2 value2 9000 2013-01-01
## Filter expressions we can calculate during pushdown using filenames/hive partitions should be pruned
# Filtering out 0/2 files
query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('data/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == '2012-01-011000') OR (part_cast=9000) ORDER BY date_cast;
----
1 value1 1000 2012-01-01
2 value2 9000 2013-01-01
# There should not be any filter operation remaining since it can be handled completely during pushdown by pruning file list
query II
EXPLAIN select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('data/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == '2012-01-011000') OR (part_cast=9000) ORDER BY date_cast;
----
physical_plan <!REGEX>:.*FILTER.*
# Query filtering out first file
query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('data/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == 'foobar') OR (part_cast=9000) ORDER BY date_cast;
----
2 value2 9000 2013-01-01
# Again, we should not have a filter operator here
query II
explain select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('data/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == 'foobar') OR (part_cast=9000) ORDER BY date_cast;
----
physical_plan <!REGEX>:.*FILTER.*
# Query filtering out second file
query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('data/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == '2012-01-011000') OR (part_cast=1337) ORDER BY date_cast;
----
1 value1 1000 2012-01-01
# Again, we should not have a filter operator here
query II
explain select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('data/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == '2012-01-011000') OR (part_cast=1337) ORDER BY date_cast;
----
physical_plan <!REGEX>:.*FILTER.*
# Filtering out both files
query IIII
select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('data/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == 'foobar') OR (part_cast=1337) ORDER BY date_cast;
----
# Again, we should not have a filter operator here
query II
EXPLAIN select id, value, CAST(part AS INT) as part_cast, CAST(date AS DATE) as date_cast from read_csv_auto('data/csv/hive-partitioning/types/*/*/test.csv', HIVE_PARTITIONING=1) where (date_cast=CAST('2012-01-01' as DATE) AND concat(date_cast::VARCHAR, part_cast::VARCHAR) == 'foobar') OR (part_cast=1337) ORDER BY date_cast;
----
physical_plan <!REGEX>:.*FILTER.*
# projection pushdown
query I
select value from read_csv_auto('data/csv/hive-partitioning/different_order/*/*/test.csv', HIVE_PARTITIONING=1) order by 1
----
value1
value2
query I
select part from read_csv_auto('data/csv/hive-partitioning/different_order/*/*/test.csv', HIVE_PARTITIONING=1) order by 1
----
a
b
# project only some columns from a hive partition
query I
select date from read_csv_auto('data/csv/hive-partitioning/different_order/*/*/test.csv', HIVE_PARTITIONING=1) order by 1
----
2012-01-01
2013-01-01