should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion

@@ -0,0 +1,68 @@
# name: test/sql/copy/partitioned/hive_filter_pushdown.test
# description: confirm that hive-specific filter pushdown does not interfere with the remaining filters
# group: [partitioned]
require parquet
# Create a hive-partitioned dataset to test against
statement ok
COPY (SELECT i::VARCHAR as a, (i*10)::VARCHAR as b, (i*100)::VARCHAR as c from range(0,10) tbl(i)) TO '__TEST_DIR__/hive_pushdown_bug' (FORMAT PARQUET, PARTITION_BY c);
query II
explain SELECT * FROM parquet_scan('__TEST_DIR__/hive_pushdown_bug/*/*.parquet', HIVE_PARTITIONING=1) where a > b;
----
physical_plan <!REGEX>:.*AND.*
query II nosort q1
explain SELECT * FROM parquet_scan('__TEST_DIR__/hive_pushdown_bug/*/*.parquet', HIVE_PARTITIONING=1) where a > b;
----
query II nosort q1
explain SELECT * FROM parquet_scan('__TEST_DIR__/hive_pushdown_bug/*/*.parquet') where a > b;
----
physical_plan <!REGEX>:.*PARQUET_SCAN.*File Filters:.*
query II
explain SELECT * FROM parquet_scan('__TEST_DIR__/hive_pushdown_bug/*/*.parquet', HIVE_PARTITIONING=1) where a::VARCHAR > c::VARCHAR;
----
physical_plan <!REGEX>:.*AND.*
# no file filters here
query II nosort q2
explain SELECT * FROM parquet_scan('__TEST_DIR__/hive_pushdown_bug/*/*.parquet', HIVE_PARTITIONING=1) where a::VARCHAR > c::VARCHAR;
----
physical_plan <!REGEX>:.*PARQUET_SCAN.*File Filters:.*
query II nosort q2
explain SELECT * FROM parquet_scan('__TEST_DIR__/hive_pushdown_bug/*/*.parquet') where a::VARCHAR > c::VARCHAR;
----
physical_plan <!REGEX>:.*PARQUET_SCAN.*File Filters:.*
# Check that hive-specific filters are pushed down into the explain plan regardless of format type
query II
explain SELECT * FROM parquet_scan('__TEST_DIR__/hive_pushdown_bug/*/*.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=1) where c=500;
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(c = 500\).*Scanning Files:.*1\/10.*
query II
explain SELECT * FROM parquet_scan('__TEST_DIR__/hive_pushdown_bug/*/*.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=1) where c=500 and b='20';
----
physical_plan <REGEX>:.*PARQUET_SCAN.*File Filters:.*\(c = 500\).*Scanning Files:.*1\/10.*
# File Filters show up in read_csv_auto for hive-partitioned CSV files.
statement ok
COPY (SELECT i::VARCHAR as a, (i*10)::VARCHAR as b, (i*100)::VARCHAR as c from range(0,10) tbl(i)) TO '__TEST_DIR__/hive_pushdown_bug_csv' (FORMAT CSV, PARTITION_BY c);
query II
explain SELECT * FROM read_csv_auto('__TEST_DIR__/hive_pushdown_bug_csv/*/*.csv', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=1, names=['a','b','c']) where c=500;
----
physical_plan <REGEX>:.*READ_CSV_AUTO.*File Filters:.*\(c = 500\).*Scanning Files:.*1\/10.*
# same for json partitioned files
#statement ok
#COPY (SELECT i::VARCHAR as a, (i*10)::VARCHAR as b, (i*100)::VARCHAR as c from range(0,10) tbl(i)) TO '__TEST_DIR__/hive_pushdown_bug_csv' (PARTITION_BY c);
#
#query II
#explain SELECT * FROM read_csv_auto('__TEST_DIR__/hive_pushdown_bug/*/*.parquet', HIVE_PARTITIONING=1, names=['a','b','c']) where c=500;
#----
#physical_plan <REGEX>:.*Filters:\s*c=500.*
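
For context, the pruning these tests assert can be reproduced in a plain DuckDB shell. A minimal sketch, with an illustrative path that is not part of the test suite:

-- write one file per distinct value of c
COPY (SELECT i AS a, i*100 AS c FROM range(10) t(i))
  TO '/tmp/hive_pushdown_demo' (FORMAT PARQUET, PARTITION_BY c);
-- the partition filter becomes a file filter, so only 1 of the 10 files should be scanned
EXPLAIN SELECT *
FROM read_parquet('/tmp/hive_pushdown_demo/*/*.parquet', hive_partitioning=1)
WHERE c = 500;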

@@ -0,0 +1,63 @@
# name: test/sql/copy/partitioned/hive_partition_append.test
# description: test APPEND mode for hive partitioned write
# group: [partitioned]
require parquet
statement ok
CREATE TABLE sensor_data(ts TIMESTAMP, value INT);
statement ok
INSERT INTO sensor_data VALUES
(TIMESTAMP '2000-01-01 01:02:03', 42),
(TIMESTAMP '2000-02-01 01:02:03', 100),
(TIMESTAMP '2000-03-01 12:11:10', 1000)
;
statement ok
COPY (SELECT YEAR(ts) AS year, MONTH(ts) AS month, * FROM sensor_data)
TO '__TEST_DIR__/partitioned_append' (FORMAT PARQUET, PARTITION_BY (year, month), APPEND);
query III
SELECT year, month, SUM(value) FROM '__TEST_DIR__/partitioned_append/**/*.parquet' GROUP BY ALL ORDER BY ALL
----
2000 1 42
2000 2 100
2000 3 1000
statement ok
DELETE FROM sensor_data;
statement ok
INSERT INTO sensor_data VALUES
(TIMESTAMP '2000-01-01 02:02:03', 62),
(TIMESTAMP '2000-03-01 13:11:10', 50)
;
statement ok
COPY (SELECT YEAR(ts) AS year, MONTH(ts) AS month, * FROM sensor_data)
TO '__TEST_DIR__/partitioned_append' (FORMAT PARQUET, PARTITION_BY (year, month), APPEND, FILENAME_PATTERN 'my_pattern_{uuid}');
query III
SELECT year, month, SUM(value) FROM '__TEST_DIR__/partitioned_append/**/*.parquet' GROUP BY ALL ORDER BY ALL
----
2000 1 104
2000 2 100
2000 3 1050
statement ok
COPY (SELECT YEAR(ts) AS year, MONTH(ts) AS month, * FROM sensor_data)
TO '__TEST_DIR__/partitioned_append' (FORMAT PARQUET, PARTITION_BY (year, month), FILENAME_PATTERN 'my_pattern_{uuid}', APPEND);
query III
SELECT year, month, SUM(value) FROM '__TEST_DIR__/partitioned_append/**/*.parquet' GROUP BY ALL ORDER BY ALL
----
2000 1 166
2000 2 100
2000 3 1100
statement error
COPY (SELECT YEAR(ts) AS year, MONTH(ts) AS month, * FROM sensor_data)
TO '__TEST_DIR__/partitioned_append' (FORMAT PARQUET, PARTITION_BY (year, month), APPEND, FILENAME_PATTERN 'my_pattern_without_uuid');
----
APPEND mode requires a {uuid} label in filename_pattern
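
As a usage note, a minimal sketch of APPEND mode outside the test harness (the path is illustrative, and a DuckDB version with APPEND support for partitioned COPY is assumed):

COPY (SELECT 2000 AS year, 1 AS month, 42 AS value)
  TO '/tmp/append_demo' (FORMAT PARQUET, PARTITION_BY (year, month), APPEND);
-- a second APPEND adds files next to the existing ones; {uuid} keeps the names unique
COPY (SELECT 2000 AS year, 1 AS month, 58 AS value)
  TO '/tmp/append_demo' (FORMAT PARQUET, PARTITION_BY (year, month), APPEND, FILENAME_PATTERN 'batch_{uuid}');
SELECT SUM(value) FROM '/tmp/append_demo/**/*.parquet';  -- 100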

@@ -0,0 +1,11 @@
# name: test/sql/copy/partitioned/hive_partition_case_insensitive_column.test
# description: Test when columns in the hive partitioned files differ only in case from the partitions themselves
# group: [partitioned]
require parquet
query II
SELECT * FROM 'data/parquet-testing/hive-partitioning/ci-column-names/**/*.parquet' ORDER BY ALL
----
Hannes 2
Mark 1

@@ -0,0 +1,25 @@
# name: test/sql/copy/partitioned/hive_partition_compression.test
# description: Test we can round-trip partitioned compressed Parquet files
# group: [partitioned]
statement ok
PRAGMA enable_verification
require parquet
statement ok
CREATE TABLE test AS VALUES ('a', 'foo', 1), ('a', 'foo', 2), ('a', 'bar', 1), ('b', 'bar', 1);
statement ok
COPY (FROM test) TO '__TEST_DIR__/hive_partition_compress' (FORMAT parquet, COMPRESSION 'gzip', PARTITION_BY ('col0', 'col1'));
# Read back the compressed files
query III
FROM read_parquet('__TEST_DIR__/hive_partition_compress/*/*/*.parquet')
ORDER BY ALL
----
1 a bar
1 a foo
1 b bar
2 a foo
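
For reference, a minimal round-trip sketch with an explicit codec (illustrative path; other codecs the Parquet writer supports, such as zstd or snappy, work the same way):

COPY (SELECT 'a' AS col0, 1 AS col1)
  TO '/tmp/compress_demo' (FORMAT PARQUET, COMPRESSION 'gzip', PARTITION_BY col0);
-- the codec is recorded in the Parquet metadata, so reading needs no extra options
SELECT * FROM read_parquet('/tmp/compress_demo/*/*.parquet');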

@@ -0,0 +1,15 @@
# name: test/sql/copy/partitioned/hive_partition_duplicate_name.test
# description: Test partitioning names with duplicate keys
# group: [partitioned]
require parquet
# we just use the first partitioning key by default
query III
select *
from parquet_scan('data/parquet-testing/hive-partitioning/duplicate_names/**/*.parquet')
ORDER BY ALL
----
1 value1 1
2 value2 2

@@ -0,0 +1,74 @@
# name: test/sql/copy/partitioned/hive_partition_escape.test
# description: Test escaping during hive partition read/write
# group: [partitioned]
require parquet
statement ok
CREATE SEQUENCE seq
statement ok
CREATE TABLE weird_tbl(id INT DEFAULT nextval('seq'), key VARCHAR)
statement ok
INSERT INTO weird_tbl (key) VALUES
('/'),
('\/\/'),
('==='),
('value with strings'),
('?:&'),
('🦆'),
('==='),
('===');
statement ok
COPY weird_tbl TO '__TEST_DIR__/escaped_partitions' (FORMAT PARQUET, PARTITION_BY(key))
query II
select key, COUNT(*)
from parquet_scan('__TEST_DIR__/escaped_partitions/**/*.parquet')
GROUP BY ALL
ORDER BY ALL
----
/ 1
=== 3
?:& 1
\/\/ 1
value with strings 1
🦆 1
# now with weird characters in the column name
statement ok
ALTER TABLE weird_tbl RENAME COLUMN key TO "=/ \\/"
# this column name won't work with automatic hive partitioning due to the '=' character
statement ok
COPY weird_tbl TO '__TEST_DIR__/escaped_partitions_names' (FORMAT PARQUET, PARTITION_BY("=/ \\/"))
statement error
select "=/ \\/", COUNT(*)
from parquet_scan('__TEST_DIR__/escaped_partitions_names/**/*.parquet')
GROUP BY ALL
ORDER BY ALL
----
Binder Error: Referenced column "=/ \\/" not found in FROM clause!
# if we write the partition column on files, it can be read
statement ok
COPY weird_tbl TO '__TEST_DIR__/escaped_partitions_names' (FORMAT PARQUET, PARTITION_BY("=/ \\/"), OVERWRITE, WRITE_PARTITION_COLUMNS)
query II
select "=/ \\/", COUNT(*)
from parquet_scan('__TEST_DIR__/escaped_partitions_names/**/*.parquet')
GROUP BY ALL
ORDER BY ALL
----
/ 1
=== 3
?:& 1
\/\/ 1
value with strings 1
🦆 1
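
The workaround exercised above can be summarized in a minimal sketch (illustrative path and column name): when a partition column name itself contains '=', write the column into the files so the reader does not have to recover it from the directory name.

COPY (SELECT 1 AS "a=b", 2 AS v)
  TO '/tmp/escape_demo' (FORMAT PARQUET, PARTITION_BY ("a=b"), WRITE_PARTITION_COLUMNS);
-- read the column from the file contents, bypassing hive partition detection
SELECT "a=b", v FROM read_parquet('/tmp/escape_demo/**/*.parquet', hive_partitioning=0);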

@@ -0,0 +1,78 @@
# name: test/sql/copy/partitioned/hive_partition_join_pushdown.test
# description: Test pruning of hive partitions through join conditions
# group: [partitioned]
require parquet
# create 10 partitions
statement ok
CREATE TABLE tbl AS SELECT i//1000 AS partition, i FROM range(10000) t(i)
statement ok
COPY tbl TO '__TEST_DIR__/partition_join_pushdown' (FORMAT parquet, PARTITION_BY (partition))
query II
EXPLAIN ANALYZE SELECT COUNT(*), MIN(partition), MAX(partition), SUM(i)
FROM '__TEST_DIR__/partition_join_pushdown/**/*.parquet'
----
analyzed_plan <REGEX>:.*Total Files Read.*10.*
query IIII
SELECT COUNT(*), MIN(partition), MAX(partition), SUM(i)
FROM '__TEST_DIR__/partition_join_pushdown/**/*.parquet'
WHERE partition=(SELECT MAX(partition) FROM tbl)
----
1000 9 9 9499500
query II
EXPLAIN ANALYZE SELECT COUNT(*), MIN(partition), MAX(partition), SUM(i)
FROM '__TEST_DIR__/partition_join_pushdown/**/*.parquet'
WHERE partition=(SELECT MAX(partition) FROM tbl)
----
analyzed_plan <REGEX>:.*Total Files Read.*1.*
query IIII
SELECT COUNT(*), MIN(partition), MAX(partition), SUM(i)
FROM '__TEST_DIR__/partition_join_pushdown/**/*.parquet'
WHERE i>=9980 AND partition=(SELECT MAX(partition) FROM tbl)
----
20 9 9 199790
# multiple filters on the same partition
query IIII
SELECT COUNT(*), MIN(partition), MAX(partition), SUM(i)
FROM '__TEST_DIR__/partition_join_pushdown/**/*.parquet'
WHERE partition>5 AND partition=(SELECT MAX(partition) FROM tbl)
----
1000 9 9 9499500
# multiple partition columns
statement ok
CREATE TABLE tbl2 AS SELECT (date '2000-01-01' + interval (i//2000) years)::DATE AS part1, i%2 AS part2, i FROM range(10000) t(i)
statement ok
COPY tbl2 TO '__TEST_DIR__/partition_join_pushdown_multi' (FORMAT parquet, PARTITION_BY (part1, part2))
# multiple join filters
query IIIIII
SELECT COUNT(*), MIN(part1), MAX(part1), MIN(part2), MAX(part2), SUM(i)
FROM '__TEST_DIR__/partition_join_pushdown_multi/**/*.parquet'
WHERE part1=(SELECT MAX(part1) FROM tbl2) AND part2=(SELECT MAX(part2) FROM tbl2)
----
1000 2004-01-01 2004-01-01 1 1 9000000
# mix of static and join filters
query IIIIII
SELECT COUNT(*), MIN(part2), MAX(part2), MIN(part1), MAX(part1), SUM(i)
FROM '__TEST_DIR__/partition_join_pushdown_multi/**/*.parquet'
WHERE part2=(SELECT MAX(part2) FROM tbl2) AND part1=date '2004-01-01'
----
1000 1 1 2004-01-01 2004-01-01 9000000
# only selecting a single column
query I
SELECT COUNT(*)
FROM '__TEST_DIR__/partition_join_pushdown_multi/**/*.parquet'
WHERE part2=(SELECT MAX(part2) FROM tbl2)
----
5000
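
For context, a minimal sketch of the pruning exercised above (illustrative path): the filter value comes from a subquery, so the file list can only be pruned at run time via a dynamic filter pushed into the scan.

COPY (SELECT i//1000 AS part, i FROM range(10000) t(i))
  TO '/tmp/join_pushdown_demo' (FORMAT PARQUET, PARTITION_BY part);
-- EXPLAIN ANALYZE should report only 1 of the 10 files actually read
EXPLAIN ANALYZE SELECT SUM(i)
FROM '/tmp/join_pushdown_demo/**/*.parquet'
WHERE part = (SELECT MAX(i)//1000 FROM range(10000) t(i));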

@@ -0,0 +1,44 @@
# name: test/sql/copy/partitioned/hive_partition_recursive_cte.test
# description: Test for hive partitioned read with recursive CTE
# group: [partitioned]
require parquet
#statement ok
#PRAGMA enable_verification
# create a table
statement ok
CREATE TABLE t AS SELECT 2000+i%10 AS year, 1+i%3 AS month, i%4 AS c, i%5 AS d FROM RANGE(0,20) tbl(i);
statement ok
COPY t TO '__TEST_DIR__/partition_rec_cte' (FORMAT PARQUET, PARTITION_BY (year, month));
statement ok
CREATE VIEW partitioned_tbl AS FROM '__TEST_DIR__/partition_rec_cte/**/*.parquet';
loop i 0 2
# this recursive CTE iterates over the years (2000...2009) and counts the number of rows in each of the years
# then at the end we add everything up
query III
WITH RECURSIVE cte AS (
SELECT 0 AS count, 1999 AS selected_year
UNION ALL
SELECT COUNT(*) AS count, MAX(partitioned_tbl.year)
FROM partitioned_tbl, (SELECT MAX(selected_year) AS next_year FROM cte)
WHERE partitioned_tbl.year = (SELECT MAX(selected_year) + 1 FROM cte)
HAVING COUNT(*)>0
)
SELECT SUM(count), MIN(selected_year), MAX(selected_year)
FROM cte
WHERE count>0
----
20 2000 2009
# retry with union by name
statement ok
CREATE OR REPLACE VIEW partitioned_tbl AS FROM read_parquet('__TEST_DIR__/partition_rec_cte/**/*.parquet', union_by_name=True);
endloop

@@ -0,0 +1,417 @@
# name: test/sql/copy/partitioned/hive_partitioned_auto_detect.test
# description: basic tests for the hive partition auto detection
# group: [partitioned]
statement ok
PRAGMA enable_verification
# create a table
statement ok
CREATE TABLE t AS SELECT i%2 AS year, i%3 AS month, i%4 AS c, i%5 AS d FROM RANGE(0,20) tbl(i);
# without partition columns written
# test a csv partition by year
statement ok
COPY t TO '__TEST_DIR__/csv_partition_1' (partition_by(year));
query I
select count(*) from glob('__TEST_DIR__/csv_partition_1/**');
----
2
# with HIVE_PARTITIONING=0, directory names won't be read as columns unless they are written in the data
query III
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_1/**', names=['a','b','c','d'], HIVE_PARTITIONING=0) LIMIT 1;
----
a b c
# with HIVE_PARTITIONING, the column name from the directory name supersedes the "names" parameter
query IIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_1/**', names=['a','b','c','d'], HIVE_PARTITIONING=1) LIMIT 1;
----
a b c year
query IIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_1/**', names=['a','b','c','d']) LIMIT 1;
----
a b c year
# test a csv partition by year,month
statement ok
COPY t TO '__TEST_DIR__/csv_partition_2' (partition_by(year,month));
query I
select count(*) from glob('__TEST_DIR__/csv_partition_2/**');
----
6
query II
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_2/**', names=['a','b','c','d'], HIVE_PARTITIONING=0) LIMIT 1;
----
a b
query IIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_2/**', names=['a','b','c','d'], HIVE_PARTITIONING=1) LIMIT 1;
----
a b month year
query IIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_2/**', names=['a','b','c','d']) LIMIT 1;
----
a b month year
# test a single file
query I
select count(*) from glob('__TEST_DIR__/t.csv');
----
0
statement ok
COPY t TO '__TEST_DIR__/bad_file.csv';
query I
select count(*) from glob('__TEST_DIR__/bad_file.csv');
----
1
query IIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/bad_file.csv', names=['a','b','c','d'], HIVE_PARTITIONING=0) LIMIT 1;
----
a b c d
query IIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/bad_file.csv', names=['a','b','c','d'], HIVE_PARTITIONING=1) LIMIT 1;
----
a b c d
query IIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/bad_file.csv', names=['a','b','c','d']) LIMIT 1;
----
a b c d
# add bad file to list: hive partitioning will be false, because the partitioning scheme doesn't match
query II
select alias(columns(*)) from read_csv_auto(['__TEST_DIR__/csv_partition_2/**', '__TEST_DIR__/bad_file.csv'], HIVE_PARTITIONING=0, names=['a','b','c','d']) LIMIT 1;
----
a b
statement error
select alias(columns(*)) from read_csv_auto(['__TEST_DIR__/csv_partition_2/**', '__TEST_DIR__/bad_file.csv'], HIVE_PARTITIONING=1, names=['a','b','c','d']) LIMIT 1;
----
Binder Error: Hive partition mismatch
query II
select alias(columns(*)) from read_csv_auto(['__TEST_DIR__/csv_partition_2/**', '__TEST_DIR__/bad_file.csv'], names=['a','b','c','d']) LIMIT 1;
----
a b
# same tests with parquet
require parquet
# test a parquet partition by year
statement ok
COPY t TO '__TEST_DIR__/parquet_partition_1' (format parquet, partition_by(year));
query I
select count(*) from glob('__TEST_DIR__/parquet_partition_1/**');
----
2
query III
select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_1/**', HIVE_PARTITIONING=0) LIMIT 1;
----
month c d
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_1/**', HIVE_PARTITIONING=1) LIMIT 1;
----
month c d year
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_1/**') LIMIT 1;
----
month c d year
# test a parquet partition by year,month
statement ok
COPY t TO '__TEST_DIR__/parquet_partition_2' (format parquet, partition_by(year,month));
query I
select count(*) from glob('__TEST_DIR__/parquet_partition_2/**');
----
6
query II
select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_2/**', HIVE_PARTITIONING=0) LIMIT 1;
----
c d
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_2/**', HIVE_PARTITIONING=1) LIMIT 1;
----
c d month year
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_2/**') LIMIT 1;
----
c d month year
# test a single file
query I
select count(*) from glob('__TEST_DIR__/t.parquet');
----
0
statement ok
COPY t TO '__TEST_DIR__/t.parquet' (format parquet);
query I
select count(*) from glob('__TEST_DIR__/t.parquet');
----
1
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/t.parquet', HIVE_PARTITIONING=0) LIMIT 1;
----
year month c d
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/t.parquet', HIVE_PARTITIONING=1) LIMIT 1;
----
year month c d
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/t.parquet') LIMIT 1;
----
year month c d
# add bad file to list: hive partitioning will be false, because the partitioning scheme doesn't match
query II
select alias(columns(*)) from read_parquet(['__TEST_DIR__/parquet_partition_2/**', '__TEST_DIR__/t.parquet'], HIVE_PARTITIONING=0) LIMIT 1;
----
c d
statement error
select alias(columns(*)) from read_parquet(['__TEST_DIR__/parquet_partition_2/**', '__TEST_DIR__/t.parquet'], HIVE_PARTITIONING=1) LIMIT 1;
----
Binder Error: Hive partition mismatch
query II
select alias(columns(*)) from read_parquet(['__TEST_DIR__/parquet_partition_2/**', '__TEST_DIR__/t.parquet']) LIMIT 1;
----
c d
# with partition columns written
# test a csv partition by year
statement ok
COPY t TO '__TEST_DIR__/csv_partition_1' (partition_by(year), overwrite_or_ignore, write_partition_columns);
query I
select count(*) from glob('__TEST_DIR__/csv_partition_1/**');
----
2
query IIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_1/**', names=['a','b','c','d'], HIVE_PARTITIONING=0) LIMIT 1;
----
a b c d
query IIIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_1/**', names=['a','b','c','d'], HIVE_PARTITIONING=1) LIMIT 1;
----
a b c d year
query IIIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_1/**', names=['a','b','c','d']) LIMIT 1;
----
a b c d year
# test a csv partition by year,month
statement ok
COPY t TO '__TEST_DIR__/csv_partition_2' (partition_by(year,month), overwrite_or_ignore, write_partition_columns);
query I
select count(*) from glob('__TEST_DIR__/csv_partition_2/**');
----
6
query IIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_2/**', names=['a','b','c','d'], HIVE_PARTITIONING=0) LIMIT 1;
----
a b c d
query IIIIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_2/**', names=['a','b','c','d'], HIVE_PARTITIONING=1) LIMIT 1;
----
a b c d month year
query IIIIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/csv_partition_2/**', names=['a','b','c','d']) LIMIT 1;
----
a b c d month year
# test a single file
query I
select count(*) from glob('__TEST_DIR__/t.csv');
----
0
statement ok
COPY t TO '__TEST_DIR__/bad_file.csv';
query I
select count(*) from glob('__TEST_DIR__/bad_file.csv');
----
1
query IIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/bad_file.csv', names=['a','b','c','d'], HIVE_PARTITIONING=0) LIMIT 1;
----
a b c d
query IIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/bad_file.csv', names=['a','b','c','d'], HIVE_PARTITIONING=1) LIMIT 1;
----
a b c d
query IIII
select alias(columns(*)) from read_csv_auto('__TEST_DIR__/bad_file.csv', names=['a','b','c','d']) LIMIT 1;
----
a b c d
# add bad file to list: hive partitioning will be false, because the partitioning scheme doesn't match
query IIII
select alias(columns(*)) from read_csv_auto(['__TEST_DIR__/csv_partition_2/**', '__TEST_DIR__/bad_file.csv'], HIVE_PARTITIONING=0, names=['a','b','c','d']) LIMIT 1;
----
a b c d
statement error
select alias(columns(*)) from read_csv_auto(['__TEST_DIR__/csv_partition_2/**', '__TEST_DIR__/bad_file.csv'], HIVE_PARTITIONING=1, names=['a','b','c','d']) LIMIT 1;
----
Binder Error: Hive partition mismatch
query IIII
select alias(columns(*)) from read_csv_auto(['__TEST_DIR__/csv_partition_2/**', '__TEST_DIR__/bad_file.csv'], names=['a','b','c','d']) LIMIT 1;
----
a b c d
# same tests with parquet
require parquet
# test a parquet partition by year
statement ok
COPY t TO '__TEST_DIR__/parquet_partition_1' (format parquet, partition_by(year), overwrite_or_ignore, write_partition_columns);
query I
select count(*) from glob('__TEST_DIR__/parquet_partition_1/**');
----
2
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_1/**', HIVE_PARTITIONING=0) LIMIT 1;
----
year month c d
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_1/**', HIVE_PARTITIONING=1) LIMIT 1;
----
year month c d
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_1/**') LIMIT 1;
----
year month c d
# test a parquet partition by year,month
statement ok
COPY t TO '__TEST_DIR__/parquet_partition_2' (format parquet, partition_by(year,month), overwrite_or_ignore, write_partition_columns);
query I
select count(*) from glob('__TEST_DIR__/parquet_partition_2/**');
----
6
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_2/**', HIVE_PARTITIONING=0) LIMIT 1;
----
year month c d
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_2/**', HIVE_PARTITIONING=1) LIMIT 1;
----
year month c d
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/parquet_partition_2/**') LIMIT 1;
----
year month c d
# test a single file
statement ok
COPY t TO '__TEST_DIR__/t.parquet' (format parquet);
query I
select count(*) from glob('__TEST_DIR__/t.parquet');
----
1
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/t.parquet', HIVE_PARTITIONING=0) LIMIT 1;
----
year month c d
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/t.parquet', HIVE_PARTITIONING=1) LIMIT 1;
----
year month c d
query IIII
select alias(columns(*)) from read_parquet('__TEST_DIR__/t.parquet') LIMIT 1;
----
year month c d
# add bad file to list: hive partitioning will be false, because the partitioning scheme doesn't match
query IIII
select alias(columns(*)) from read_parquet(['__TEST_DIR__/parquet_partition_2/**', '__TEST_DIR__/t.parquet'], HIVE_PARTITIONING=0) LIMIT 1;
----
year month c d
statement error
select alias(columns(*)) from read_parquet(['__TEST_DIR__/parquet_partition_2/**', '__TEST_DIR__/t.parquet'], HIVE_PARTITIONING=1) LIMIT 1;
----
Binder Error: Hive partition mismatch
query IIII
select alias(columns(*)) from read_parquet(['__TEST_DIR__/parquet_partition_2/**', '__TEST_DIR__/t.parquet']) LIMIT 1;
----
year month c d
query IIII
select i,j,k,x
from read_parquet('data/parquet-testing/hive-partitioning/union_by_name/*/*.parquet', hive_partitioning=0, union_by_name=1)
order by j,x nulls last;
----
42 84 NULL 1
42 84 NULL NULL
NULL 128 33 NULL
query IIII
select i,j,k,x
from read_parquet('data/parquet-testing/hive-partitioning/union_by_name/*/*.parquet', hive_partitioning=1, union_by_name=1)
order by j,x nulls last;
----
42 84 NULL 1
42 84 NULL 1
NULL 128 33 2
query IIII
select i,j,k,x
from read_parquet('data/parquet-testing/hive-partitioning/union_by_name/*/*.parquet', union_by_name=1)
order by j,x nulls last;
----
42 84 NULL 1
42 84 NULL 1
NULL 128 33 2
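
As a usage note, the detection rule these tests pin down is: when every file in the scan shares a key=value directory layout, hive partitioning is enabled by default; a flat file or a mixed list disables it, and forcing HIVE_PARTITIONING=1 over a mixed list raises the mismatch error. A minimal sketch with an illustrative path:

COPY (SELECT 1 AS a, 2000 AS year)
  TO '/tmp/autodetect_demo' (FORMAT PARQUET, PARTITION_BY year);
-- no flag given: the year column is recovered from the directory name
SELECT * FROM read_parquet('/tmp/autodetect_demo/**/*.parquet');
-- explicitly off: only the columns stored in the files come back
SELECT * FROM read_parquet('/tmp/autodetect_demo/**/*.parquet', hive_partitioning=0);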

@@ -0,0 +1,195 @@
# name: test/sql/copy/partitioned/hive_partitioned_write.test
# description: basic tests for the hive partitioned write
# group: [partitioned]
require parquet
# Simple table that is easy to partition
statement ok
CREATE TABLE test as SELECT i%2 as part_col, (i+1)%5 as value_col, i as value2_col from range(0,10) tbl(i);
statement ok
COPY test TO '__TEST_DIR__/partitioned1' (FORMAT PARQUET, PARTITION_BY (part_col));
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/partitioned1/part_col=0/*.parquet' ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/partitioned1/part_col=1/*.parquet' ORDER BY value2_col;
----
1 2 1
1 4 3
1 1 5
1 3 7
1 0 9
# Want a modified version of the partition_col? (for example to do custom string conversion?) No problem:
statement ok
COPY (SELECT * EXCLUDE (part_col), 'prefix-'::VARCHAR || part_col::VARCHAR as part_col FROM test) TO '__TEST_DIR__/partitioned2' (FORMAT PARQUET, PARTITION_BY (part_col));
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/partitioned2/part_col=prefix-0/*.parquet' ORDER BY value2_col;
----
prefix-0 1 0
prefix-0 3 2
prefix-0 0 4
prefix-0 2 6
prefix-0 4 8
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/partitioned2/part_col=prefix-1/*.parquet' ORDER BY value2_col;
----
prefix-1 2 1
prefix-1 4 3
prefix-1 1 5
prefix-1 3 7
prefix-1 0 9
# Test partitioning by all columns
statement ok
COPY test TO '__TEST_DIR__/partitioned3' (FORMAT PARQUET, PARTITION_BY '*', WRITE_PARTITION_COLUMNS);
query I
SELECT min(value2_col) as min_val
FROM parquet_scan('__TEST_DIR__/partitioned3/part_col=*/value_col=*/value2_col=*/*.parquet', FILENAME=1)
GROUP BY filename
ORDER BY min_val
----
0
1
2
3
4
5
6
7
8
9
# single col as param is also fine
statement ok
COPY test TO '__TEST_DIR__/partitioned4' (FORMAT PARQUET, PARTITION_BY part_col);
query III
SELECT part_col, value_col, value2_col FROM parquet_scan('__TEST_DIR__/partitioned4/part_col=*/*.parquet', HIVE_PARTITIONING=1) WHERE part_col=0 ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
# the target directory already exists, throw an error
statement error
COPY test TO '__TEST_DIR__/partitioned4' (FORMAT PARQUET, PARTITION_BY part_col);
----
Directory
# A trailing slash is also fine!
statement ok
COPY test TO '__TEST_DIR__/partitioned5/' (FORMAT PARQUET, PARTITION_BY part_col);
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/partitioned5/part_col=0/*.parquet' ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
# Cannot use the USE_TMP_FILE option simultaneously with partitioning
statement error
COPY test TO '__TEST_DIR__/partitioned6' (FORMAT PARQUET, PARTITION_BY part_col, USE_TMP_FILE TRUE);
----
Not implemented Error: Can't combine USE_TMP_FILE and PARTITION_BY for COPY
# Technically it doesn't really matter, as partition_by currently behaves similarly, but for clarity the user should
# use EITHER partition_by OR per_thread_output.
statement error
COPY test TO '__TEST_DIR__/partitioned6' (FORMAT PARQUET, PARTITION_BY part_col, PER_THREAD_OUTPUT TRUE);
----
Not implemented Error: Can't combine PER_THREAD_OUTPUT and PARTITION_BY for COPY
# partitioning csv files is also a thing
statement ok
COPY test TO '__TEST_DIR__/partitioned7' (FORMAT CSV, PARTITION_BY part_col);
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/partitioned7/part_col=0/*.csv' ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/partitioned7/part_col=1/*.csv' ORDER BY value2_col;
----
1 2 1
1 4 3
1 1 5
1 3 7
1 0 9
# Capitalization of the partition column doesn't matter
statement ok
COPY test TO '__TEST_DIR__/partitioned8' (FORMAT PARQUET, PARTITION_BY pArt_cOl);
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/partitioned8/part_col=0/*.parquet' ORDER BY value2_cOl;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
# Order matters!
statement ok
COPY test TO '__TEST_DIR__/partitioned9' (FORMAT PARQUET, PARTITION_BY (part_col, value_col));
query I
SELECT min(value2_col) as min_val
FROM parquet_scan('__TEST_DIR__/partitioned9/part_col=*/value_col=*/*.parquet', FILENAME=1)
GROUP BY filename
ORDER BY min_val
----
0
1
2
3
4
5
6
7
8
9
statement ok
COPY test TO '__TEST_DIR__/partitioned10' (FORMAT PARQUET, PARTITION_BY (value_col, part_col));
query I
SELECT min(value2_col) as min_val
FROM parquet_scan('__TEST_DIR__/partitioned10/value_col=*/part_col=*/*.parquet', FILENAME=1)
GROUP BY filename
ORDER BY min_val
----
0
1
2
3
4
5
6
7
8
9
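
For reference, a minimal write/read round trip in the same pattern (illustrative path); the FILENAME=1 trick is how the tests above check which physical file each row came from.

COPY (SELECT i%2 AS part_col, i AS v FROM range(10) t(i))
  TO '/tmp/write_demo' (FORMAT PARQUET, PARTITION_BY part_col);
-- count the rows per partition and show the file that holds them
SELECT part_col, COUNT(*), ANY_VALUE(filename)
FROM read_parquet('/tmp/write_demo/*/*.parquet', FILENAME=1, HIVE_PARTITIONING=1)
GROUP BY part_col ORDER BY part_col;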

@@ -0,0 +1,51 @@
# name: test/sql/copy/partitioned/hive_partitioned_write.test_slow
# description: slow test for the hive partitioned write
# group: [partitioned]
require parquet
require tpch
statement ok
pragma memory_limit='100mb'
# around 200MB worth of data, which will require the PartitionedColumnData to spill to disk
statement ok
COPY (SELECT i%2::INT32 as part_col, i::INT32 FROM range(0,25000000) tbl(i)) TO '__TEST_DIR__/partitioned_memory_spill' (FORMAT parquet, PARTITION_BY part_col);
statement ok
pragma memory_limit='-1'
statement ok
call dbgen(sf=1);
# Partition by 2 columns
statement ok
COPY lineitem TO '__TEST_DIR__/lineitem_sf1_partitioned' (FORMAT PARQUET, PARTITION_BY (l_linestatus, l_returnflag));
statement ok
DROP TABLE lineitem;
statement ok
CREATE VIEW lineitem as SELECT * FROM parquet_scan('__TEST_DIR__/lineitem_sf1_partitioned/l_linestatus=*/l_returnflag=*/*.parquet', HIVE_PARTITIONING=1);
loop i 1 9
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q0${i}.csv
endloop
loop i 10 23
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q${i}.csv
endloop
statement ok
pragma threads=1;

@@ -0,0 +1,47 @@
# name: test/sql/copy/partitioned/hive_partitioning_overwrite.test
# description: Test OVERWRITE option
# group: [partitioned]
require parquet
# write a partition with value 42
statement ok
COPY (SELECT 42 AS part_col, 43 AS value_col) TO '__TEST_DIR__/overwrite_test' (FORMAT PARQUET, PARTITION_BY (part_col));
# writing to the same directory fails now
statement error
COPY (SELECT 84 AS part_col, 85 AS value_col) TO '__TEST_DIR__/overwrite_test' (FORMAT PARQUET, PARTITION_BY (part_col));
----
Enable OVERWRITE option to overwrite files
# test the overwrite setting
statement ok
COPY (SELECT 84 AS part_col, 85 AS value_col) TO '__TEST_DIR__/overwrite_test' (FORMAT PARQUET, PARTITION_BY (part_col), OVERWRITE 1);
# the old file (with part_col=42) should now be removed
query II
SELECT * FROM '__TEST_DIR__/overwrite_test/**/*.parquet'
----
85 84
# what if the target is a file?
statement ok
COPY (SELECT 42 AS part_col) TO '__TEST_DIR__/overwrite_test2' (FORMAT PARQUET);
statement error
COPY (SELECT 84 AS part_col, 85 AS value_col) TO '__TEST_DIR__/overwrite_test2' (FORMAT PARQUET, PARTITION_BY (part_col));
----
it exists and is a file
statement ok
COPY (SELECT 84 AS part_col, 85 AS value_col) TO '__TEST_DIR__/overwrite_test2' (FORMAT PARQUET, PARTITION_BY (part_col), OVERWRITE 1);
query II
SELECT * FROM '__TEST_DIR__/overwrite_test2/**/*.parquet'
----
85 84
statement error
COPY (SELECT 84 AS part_col) TO '__TEST_DIR__/overwrite_test' (FORMAT PARQUET, PARTITION_BY (part_col), OVERWRITE 1, OVERWRITE_OR_IGNORE 1);
----
OVERWRITE
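
As a usage note, a minimal sketch of the OVERWRITE semantics (illustrative path): without the flag, a second write to an existing target errors; with it, the previous contents are removed first.

COPY (SELECT 42 AS part_col, 43 AS v)
  TO '/tmp/overwrite_demo' (FORMAT PARQUET, PARTITION_BY part_col);
-- replaces the whole directory, so the part_col=42 data is gone afterwards
COPY (SELECT 84 AS part_col, 85 AS v)
  TO '/tmp/overwrite_demo' (FORMAT PARQUET, PARTITION_BY part_col, OVERWRITE 1);
SELECT * FROM '/tmp/overwrite_demo/**/*.parquet';  -- 85 84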

@@ -0,0 +1,17 @@
# name: test/sql/copy/partitioned/partition_issue_6304.test
# description: Issue #6304: INTERNAL Error: Comparison on NULL values
# group: [partitioned]
require parquet
statement ok
copy (select NULL as i, NULL as j from range(100000)) to '__TEST_DIR__/issue6304_null' (format parquet, partition_by(i), overwrite_or_ignore);
statement ok
copy (select 1 as i, 2 as j from range(100000)) to '__TEST_DIR__/issue6304_constant' (format parquet, partition_by(i), overwrite_or_ignore);
statement ok
copy (select NULL as i from range(100000)) to '__TEST_DIR__/issue6304_null' (format parquet, partition_by(i), overwrite_or_ignore, write_partition_columns);
statement ok
copy (select 1 as i from range(100000)) to '__TEST_DIR__/issue6304_constant' (format parquet, partition_by(i), overwrite_or_ignore, write_partition_columns);

@@ -0,0 +1,159 @@
# name: test/sql/copy/partitioned/partitioned_group_by.test
# description: Test partitioned aggregates
# group: [partitioned]
require no_vector_verification # query plans are not robust against VERIFY_VECTOR operator
require parquet
statement ok
CREATE TABLE partitioned_tbl AS SELECT i%2 AS partition, i col1, i // 7 col2, (i%3)::VARCHAR col3 FROM range(10000) t(i)
statement ok
COPY partitioned_tbl TO '__TEST_DIR__/partition_group_by' (FORMAT parquet, PARTITION_BY (partition))
statement ok
DROP TABLE partitioned_tbl
statement ok
CREATE VIEW partitioned_tbl AS FROM '__TEST_DIR__/partition_group_by/**/*.parquet'
query II
SELECT partition, SUM(col1)
FROM partitioned_tbl
GROUP BY partition
ORDER BY ALL
----
0 24995000
1 25000000
# make sure the partitioned aggregate is used
# all of these are identical, i.e. this gets folded into a single count aggregate
query II
EXPLAIN SELECT partition, SUM(col1)
FROM partitioned_tbl
GROUP BY partition
ORDER BY ALL
----
physical_plan <REGEX>:.*PARTITIONED_AGGREGATE.*
# distinct aggregate
query II
SELECT partition, COUNT(DISTINCT col2)
FROM partitioned_tbl
GROUP BY partition
ORDER BY ALL
----
0 1429
1 1429
# grouping sets
query II
SELECT partition, SUM(col1)
FROM partitioned_tbl
GROUP BY GROUPING SETS ((), (partition))
ORDER BY ALL
----
0 24995000
1 25000000
NULL 49995000
# filtered aggregate
query II
SELECT partition, SUM(col1) FILTER (col2%7>2)
FROM partitioned_tbl
GROUP BY partition
ORDER BY ALL
----
0 14302848
1 14302848
query II
SELECT SUM(col1), partition
FROM partitioned_tbl
GROUP BY partition
ORDER BY ALL
----
24995000 0
25000000 1
# filter
query II
SELECT partition, SUM(col1)
FROM partitioned_tbl
WHERE col2 > 100
GROUP BY partition
ORDER BY ALL
----
0 24870038
1 24875391
# partition on multiple columns
statement ok
CREATE TABLE partitioned_tbl2 AS SELECT i%2 AS partition1, i%3 AS partition2, i col1, i + 1 col2 FROM range(10000) t(i)
statement ok
COPY partitioned_tbl2 TO '__TEST_DIR__/partition_group_by_multiple' (FORMAT parquet, PARTITION_BY (partition1, partition2))
statement ok
DROP TABLE partitioned_tbl2
statement ok
CREATE VIEW partitioned_tbl2 AS FROM '__TEST_DIR__/partition_group_by_multiple/**/*.parquet'
query III
SELECT partition1, partition2, SUM(col1)
FROM partitioned_tbl2
GROUP BY partition1, partition2
ORDER BY ALL
----
0 0 8331666
0 1 8328334
0 2 8335000
1 0 8336667
1 1 8333333
1 2 8330000
# partition on a subset of the columns
query II
SELECT partition1, SUM(col1)
FROM partitioned_tbl2
GROUP BY partition1
ORDER BY ALL
----
0 24995000
1 25000000
query II
SELECT partition2, SUM(col1)
FROM partitioned_tbl2
GROUP BY partition2
ORDER BY ALL
----
0 16668333
1 16661667
2 16665000
# with a filter
query II
SELECT partition1, SUM(col1)
FROM partitioned_tbl2
WHERE partition2=0
GROUP BY partition1
ORDER BY ALL
----
0 8331666
1 8336667
# grouping sets
query III
SELECT partition1, partition2, SUM(col1)
FROM partitioned_tbl2
GROUP BY GROUPING SETS ((partition1), (partition2))
ORDER BY ALL
----
0 NULL 24995000
1 NULL 25000000
NULL 0 16668333
NULL 1 16661667
NULL 2 16665000
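
For context, a minimal sketch of how to check the optimization on any hive-partitioned dataset (illustrative path): grouping by the partition column should show a PARTITIONED_AGGREGATE operator in the plan rather than a full hash aggregate.

COPY (SELECT i%2 AS partition, i AS col1 FROM range(1000) t(i))
  TO '/tmp/group_by_demo' (FORMAT PARQUET, PARTITION_BY partition);
EXPLAIN SELECT partition, SUM(col1)
FROM '/tmp/group_by_demo/**/*.parquet'
GROUP BY partition;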

@@ -0,0 +1,73 @@
# name: test/sql/copy/partitioned/partitioned_write_tpch.test_slow
# description: TPC-H test for hive partitioned write
# group: [partitioned]
require parquet
require tpch
statement ok
CALL dbgen(sf=1);
# test writing with a very low partition threshold
statement ok
SET partitioned_write_flush_threshold=10000;
# write lineitem partitioned by l_returnflag and l_linestatus
statement ok
COPY lineitem TO '__TEST_DIR__/lineitem_partitioned_parquet' (FORMAT PARQUET, PARTITION_BY (l_returnflag, l_linestatus));
# write to CSV as well
statement ok
COPY lineitem TO '__TEST_DIR__/lineitem_partitioned_csv' (FORMAT CSV, PARTITION_BY (l_returnflag, l_linestatus));
statement ok
DROP TABLE lineitem
statement ok
CREATE VIEW lineitem AS FROM '__TEST_DIR__/lineitem_partitioned_parquet/**/*.parquet'
# now run tpc-h - results should be the same
loop i 1 9
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q0${i}.csv
endloop
loop i 10 23
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q${i}.csv
endloop
statement ok
DROP VIEW lineitem
# try the CSV next - but copy it into a regular table
statement ok
CREATE TABLE lineitem AS FROM read_csv('__TEST_DIR__/lineitem_partitioned_csv/**/*.csv')
# now run tpc-h - results should be the same
loop i 1 9
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q0${i}.csv
endloop
loop i 10 23
query I
PRAGMA tpch(${i})
----
<FILE>:extension/tpch/dbgen/answers/sf1/q${i}.csv
endloop

@@ -0,0 +1,280 @@
# name: test/sql/copy/partitioned/skip_partition_column_writes.test
# description: Skip partition column writes (issue 11931 & 12147)
# group: [partitioned]
require parquet
statement ok
CREATE TABLE test as SELECT i%2 as part_col, (i+1)%5 as value_col, i as value2_col from range(0,10) tbl(i);
# Parquet
# Skip write of the first partition column
statement ok
COPY test TO '__TEST_DIR__/no-part-cols' (FORMAT PARQUET, PARTITION_BY (part_col));
# SELECT query returns all columns, but written files do not have partition columns
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/no-part-cols/part_col=0/*.parquet' ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
# Skip write of the first partition column with explicit option
statement ok
COPY test TO '__TEST_DIR__/no-part-cols-explicit' (FORMAT PARQUET, PARTITION_BY (part_col), WRITE_PARTITION_COLUMNS false);
# SELECT query returns all columns, but written files do not have partition columns
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/no-part-cols-explicit/part_col=0/*.parquet' ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
# Skip writes of 2 partition columns
statement ok
COPY test TO '__TEST_DIR__/no-part-cols2' (FORMAT PARQUET, PARTITION_BY (part_col, value_col));
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/no-part-cols2/part_col=0/value_col=*/*.parquet' ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
# Modified version of the partition_col
statement ok
COPY (SELECT * EXCLUDE (part_col), 'prefix-'::VARCHAR || part_col::VARCHAR as part_col FROM test) TO '__TEST_DIR__/no-part-cols3' (FORMAT PARQUET, PARTITION_BY (part_col));
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/no-part-cols3/part_col=prefix-0/*.parquet' ORDER BY value2_col;
----
prefix-0 1 0
prefix-0 3 2
prefix-0 0 4
prefix-0 2 6
prefix-0 4 8
# Partitioned write for a table with more than 8 columns
statement ok
COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/no-part-cols4' (FORMAT PARQUET, PARTITION_BY (part_col));
query IIIIIIIIII
SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/no-part-cols4/part_col=1/*.parquet' ORDER BY 1;
----
1 2 3 4 5 6 7 8 9 10
# Partition by last column out of 10 columns
statement ok
COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/no-part-cols5' (FORMAT PARQUET, PARTITION_BY (value9_col));
query IIIIIIIIII
SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/no-part-cols5/value9_col=*/*.parquet' ORDER BY 1;
----
1 2 3 4 5 6 7 8 9 10
# Partition by last 2 columns out of 10 columns
statement ok
COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/no-part-cols6' (FORMAT PARQUET, PARTITION_BY (value8_col, value9_col));
query IIIIIIIIII
SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/no-part-cols6/value8_col=*/value9_col=*/*.parquet' ORDER BY 1;
----
1 2 3 4 5 6 7 8 9 10
# Partition by last 3 columns out of 10 columns in reverse order
statement ok
COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/no-part-cols7' (FORMAT PARQUET, PARTITION_BY (value9_col, value8_col, value7_col));
query IIIIIIIIII
SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/no-part-cols7/value9_col=*/value8_col=*/value7_col=*/*.parquet' ORDER BY 1;
----
1 2 3 4 5 6 7 8 9 10
# Throw an error when all columns are specified as partitions
statement error
COPY test TO '__TEST_DIR__/no-part-cols8' (FORMAT PARQUET, PARTITION_BY (part_col, value_col, value2_col));
----
Not implemented Error: No column to write as all columns are specified as partition columns. WRITE_PARTITION_COLUMNS option can be used to write partition columns.
# With the explicit WRITE_PARTITION_COLUMNS option, all columns are still written and remain readable.
statement ok
COPY test TO '__TEST_DIR__/no-part-cols8' (FORMAT PARQUET, OVERWRITE, PARTITION_BY (part_col, value_col, value2_col), WRITE_PARTITION_COLUMNS);
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/no-part-cols8/part_col=0/value_col=*/value2_col=*/*.parquet' ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
# '*' also results in an error
statement error
COPY test TO '__TEST_DIR__/no-part-cols9' (FORMAT PARQUET, PARTITION_BY '*');
----
Not implemented Error: No column to write as all columns are specified as partition columns. WRITE_PARTITION_COLUMNS option can be used to write partition columns.
# With the explicit WRITE_PARTITION_COLUMNS option, all columns are still written and remain readable.
statement ok
COPY test TO '__TEST_DIR__/no-part-cols9' (FORMAT PARQUET, PARTITION_BY '*', OVERWRITE, WRITE_PARTITION_COLUMNS);
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/no-part-cols9/part_col=0/value_col=*/value2_col=*/*.parquet' ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
# WRITE_PARTITION_COLUMNS: false behaves the same as default, so partition by all should result in an error.
statement error
COPY test TO '__TEST_DIR__/no-part-cols9' (FORMAT PARQUET, PARTITION_BY '*', WRITE_PARTITION_COLUMNS false);
----
Not implemented Error: No column to write as all columns are specified as partition columns. WRITE_PARTITION_COLUMNS option can be used to write partition columns.
# CSV
# Skip write of the first partition column
statement ok
COPY test TO '__TEST_DIR__/csv-no-part-cols' (FORMAT CSV, PARTITION_BY (part_col));
# SELECT query returns all columns, but written files do not have partition columns
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/csv-no-part-cols/part_col=0/*.csv' ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
# Skip write of the first partition column with explicit option
statement ok
COPY test TO '__TEST_DIR__/csv-no-part-cols-explicit' (FORMAT CSV, PARTITION_BY (part_col), WRITE_PARTITION_COLUMNS false);
# SELECT query returns all columns, but written files do not have partition columns
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/csv-no-part-cols-explicit/part_col=0/*.csv' ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
# Skip writes of 2 partition columns
statement ok
COPY test TO '__TEST_DIR__/csv-no-part-cols2' (FORMAT CSV, PARTITION_BY (part_col, value_col));
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/csv-no-part-cols2/part_col=0/value_col=*/*.csv' ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
# Modified version of the partition_col
statement ok
COPY (SELECT * EXCLUDE (part_col), 'prefix-'::VARCHAR || part_col::VARCHAR as part_col FROM test) TO '__TEST_DIR__/csv-no-part-cols3' (FORMAT CSV, PARTITION_BY (part_col));
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/csv-no-part-cols3/part_col=prefix-0/*.csv' ORDER BY value2_col;
----
prefix-0 1 0
prefix-0 3 2
prefix-0 0 4
prefix-0 2 6
prefix-0 4 8
# Partitioned write for a table with more than 8 columns
statement ok
COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/csv-no-part-cols4' (FORMAT CSV, PARTITION_BY (part_col));
query IIIIIIIIII
SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/csv-no-part-cols4/part_col=1/*.csv' ORDER BY 1;
----
1 2 3 4 5 6 7 8 9 10
# Partition by last column out of 10 columns
statement ok
COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/csv-no-part-cols5' (FORMAT CSV, PARTITION_BY (value9_col));
query IIIIIIIIII
SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/csv-no-part-cols5/value9_col=*/*.csv' ORDER BY 1;
----
1 2 3 4 5 6 7 8 9 10
# Partition by last 2 columns out of 10 columns
statement ok
COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/csv-no-part-cols6' (FORMAT CSV, PARTITION_BY (value8_col, value9_col));
query IIIIIIIIII
SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/csv-no-part-cols6/value8_col=*/value9_col=*/*.csv' ORDER BY 1;
----
1 2 3 4 5 6 7 8 9 10
# Partition by last 3 columns out of 10 columns in reverse order
statement ok
COPY (SELECT 1 AS part_col, 2 AS value_col, 3 AS value2_col, 4 AS value3_col, 5 AS value4_col, 6 AS value5_col, 7 AS value6_col, 8 AS value7_col, 9 AS value8_col, 10 AS value9_col) TO '__TEST_DIR__/csv-no-part-cols7' (FORMAT CSV, PARTITION_BY (value9_col, value8_col, value7_col));
query IIIIIIIIII
SELECT part_col, value_col, value2_col, value3_col, value4_col, value5_col, value6_col, value7_col, value8_col, value9_col FROM '__TEST_DIR__/csv-no-part-cols7/value9_col=*/value8_col=*/value7_col=*/*.csv' ORDER BY 1;
----
1 2 3 4 5 6 7 8 9 10
# Throw an error when all columns are specified as partitions
statement error
COPY test TO '__TEST_DIR__/csv-no-part-cols8' (FORMAT CSV, PARTITION_BY (part_col, value_col, value2_col));
----
Not implemented Error: No column to write as all columns are specified as partition columns. WRITE_PARTITION_COLUMNS option can be used to write partition columns.
# With the explicit WRITE_PARTITION_COLUMNS option, all columns are still written and remain readable.
statement ok
COPY test TO '__TEST_DIR__/csv-no-part-cols8' (FORMAT CSV, PARTITION_BY (part_col, value_col, value2_col), OVERWRITE, WRITE_PARTITION_COLUMNS);
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/csv-no-part-cols8/part_col=0/value_col=*/value2_col=*/*.csv' ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
# '*' also results in an error
statement error
COPY test TO '__TEST_DIR__/csv-no-part-cols9' (FORMAT CSV, PARTITION_BY '*');
----
Not implemented Error: No column to write as all columns are specified as partition columns. WRITE_PARTITION_COLUMNS option can be used to write partition columns.
# With the explicit WRITE_PARTITION_COLUMNS option, all columns are still written and remain readable.
statement ok
COPY test TO '__TEST_DIR__/csv-no-part-cols9' (FORMAT CSV, PARTITION_BY '*', OVERWRITE, WRITE_PARTITION_COLUMNS);
query III
SELECT part_col, value_col, value2_col FROM '__TEST_DIR__/csv-no-part-cols9/part_col=0/value_col=*/value2_col=*/*.csv' ORDER BY value2_col;
----
0 1 0
0 3 2
0 0 4
0 2 6
0 4 8
# WRITE_PARTITION_COLUMNS: false behaves the same as default, so partition by all should result in an error.
statement error
COPY test TO '__TEST_DIR__/csv-no-part-cols9' (FORMAT CSV, PARTITION_BY '*', WRITE_PARTITION_COLUMNS false);
----
Not implemented Error: No column to write as all columns are specified as partition columns. WRITE_PARTITION_COLUMNS option can be used to write partition columns.
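
As a closing usage note, the default behavior pinned down above is easy to see in a minimal sketch (illustrative path): the partition column is dropped from the file contents and survives only in the directory name.

COPY (SELECT 1 AS p, 2 AS v)
  TO '/tmp/skip_cols_demo' (FORMAT PARQUET, PARTITION_BY p);
-- reading without hive partitioning shows only v; p exists only as 'p=1' in the path
SELECT * FROM read_parquet('/tmp/skip_cols_demo/**/*.parquet', hive_partitioning=0);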