Files
email-tracker/external/duckdb/test/sql/copy/format_uuid.test
2025-10-24 19:21:19 -05:00

236 lines
6.5 KiB
SQL

# name: test/sql/copy/format_uuid.test
# description: basic tests for the hive partitioned write
# group: [copy]
require parquet
require vector_size 512
statement ok
PRAGMA verify_parallelism;
statement ok
PRAGMA threads=4;
statement ok
CREATE TABLE test2 as SELECT i as a, (i*2) as b, power(i,2) as c from range(0,10) tbl(i);
statement ok
CREATE TABLE test3 as SELECT i as a, (i*3) as b, power(i,3) as c from range(0,10) tbl(i);
statement ok
CREATE TABLE test4 as SELECT i as a, (i*4) as b, power(i,4) as c from range(0,10) tbl(i);
statement ok
CREATE TABLE test5 as SELECT i as a, (i*5) as b, power(i,5) as c from range(0,10) tbl(i);
statement ok
CREATE TABLE testpto as SELECT i as a, (i*10) as b, (i*100) as c from range(0,10000) tbl(i);
query III
SELECT * FROM test2;
----
0 0 0
1 2 1
2 4 4
3 6 9
4 8 16
5 10 25
6 12 36
7 14 49
8 16 64
9 18 81
# Test with the FILENAME_PATTERN option. "{uuid}" will be replaced with a unique string.
statement ok
COPY test2 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a), FILENAME_PATTERN "leading_{uuid}");
# the filename contains a uuid
query III
SELECT * FROM '__TEST_DIR__/part/a=9/leading_????????-????-4???-????-????????????.parquet';
----
18 81 9
statement ok
COPY test2 TO '__TEST_DIR__/trailing' (FORMAT PARQUET, PARTITION_BY (a), FILENAME_PATTERN "{uuid}_trailing");
# the filename contains a uuid
query III
SELECT * FROM '__TEST_DIR__/trailing/a=9/????????-????-4???-????-????????????_trailing.parquet';
----
18 81 9
statement ok
COPY test2 TO '__TEST_DIR__/only_offset' (FORMAT PARQUET, PARTITION_BY (a), FILENAME_PATTERN "{i}");
query III
SELECT * FROM '__TEST_DIR__/only_offset/a=9/?.parquet';
----
18 81 9
# create a second file in the same partition dirs. This is the append workflow
statement ok
COPY test3 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a), overwrite_or_ignore TRUE, FILENAME_PATTERN "leading_{uuid}_trailing");
# the filename contains a uuid
query III sort
SELECT * FROM '__TEST_DIR__/part/a=9/leading_????????-????-4???-????-????????????_trailing.parquet';
----
27 729 9
# the partition dir contains the files from previous 2 queries
query III sort
SELECT * FROM '__TEST_DIR__/part/a=9/leading_????????-????-4???-????-????????????*.parquet';
----
18 81 9
27 729 9
# Test without a specified format name for the outputfile.
statement ok
COPY test4 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a), overwrite_or_ignore TRUE);
# the filename is the default: "data_{i}"
query III sort
SELECT * FROM '__TEST_DIR__/part/a=9/data_[0-9]*.parquet';
----
36 6561 9
# the partition dir contains the files from test[2-4]
query III sort
SELECT * FROM '__TEST_DIR__/part/a=9/*.parquet';
----
18 81 9
27 729 9
36 6561 9
# Test where the FILENAME_PATTERN does not contain "{i}" or "{uuid}".
statement ok
COPY test5 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a), overwrite_or_ignore TRUE, FILENAME_PATTERN "basename");
# the file has the name of the given input
query III sort
SELECT * FROM '__TEST_DIR__/part/a=9/basename[0-9]*.parquet';
----
45 59049 9
# the partition dir contains the files from test[2-5]
query III sort
SELECT * FROM '__TEST_DIR__/part/a=9/*.parquet';
----
18 81 9
27 729 9
36 6561 9
45 59049 9
# Test without the overwrite_or_ignore param, that tries to add a file to an existing directory
statement error
COPY test5 TO '__TEST_DIR__/part' (FORMAT PARQUET, PARTITION_BY (a));
----
Directory
# Test where the FILENAME_PATTERN param has no value
statement error
COPY test4 TO '__TEST_DIR__/to_be_overwritten' (FORMAT PARQUET, PARTITION_BY (a), FILENAME_PATTERN);
----
Copy option "FILENAME_PATTERN" requires an argument of type VARCHAR
statement ok
COPY test4 TO '__TEST_DIR__/to_be_overwritten' (FORMAT PARQUET, PARTITION_BY (a), FILENAME_PATTERN a_file_name);
query I
SELECT COUNT(*) FROM GLOB('__TEST_DIR__/to_be_overwritten/a=9/a_file_name*');
----
1
query III sort
SELECT * FROM '__TEST_DIR__/to_be_overwritten/a=9/a_file_name*.parquet';
----
36 6561 9
# Test with a combination of {i} and {uuid}
statement ok
COPY test5 TO '__TEST_DIR__/incorrect_pos' (FORMAT PARQUET, PARTITION_BY (a), filename_pattern "a_name{i}_with_{uuid}_numbers");
query III sort
SELECT * FROM '__TEST_DIR__/incorrect_pos/a=5/a_name?_with_????????-????-4???-????-????????????_numbers.parquet';
----
25 3125 5
# Test UUID v7
statement ok
COPY test5 TO '__TEST_DIR__/uuid_v7' (FORMAT PARQUET, PARTITION_BY (a), filename_pattern "a_name_with_{uuidv7}");
query III sort
SELECT * FROM '__TEST_DIR__/uuid_v7/a=5/a_name_with_????????-????-7???-????-????????????.parquet';
----
25 3125 5
# Test UUID v7 mixed with UUID v4
statement ok
COPY test5 TO '__TEST_DIR__/mixed_uuid_types' (FORMAT PARQUET, PARTITION_BY (a), filename_pattern "a_name{uuid}_with_{uuidv7}_numbers");
query III sort
SELECT * FROM '__TEST_DIR__/mixed_uuid_types/a=5/a_name????????-????-4???-????-????????????_with_????????-????-7???-????-????????????_numbers.parquet';
----
25 3125 5
# Test "{uuid}" with per_thread_output TRUE
statement ok
COPY testpto TO '__TEST_DIR__/pto' (FORMAT PARQUET, PER_THREAD_OUTPUT TRUE, OVERWRITE_OR_IGNORE TRUE, FILENAME_PATTERN "basename_{uuid}");
query I sort
SELECT COUNT(*) FROM '__TEST_DIR__/pto/basename_????????-????-4???-????-????????????.parquet';
----
10000
query I
SELECT COUNT(*) >= 1 FROM GLOB('__TEST_DIR__/pto/basename_????????-????-4???-????-????????????.parquet');
----
true
statement ok
PRAGMA threads=1;
# Test where overwrite_or_ignore param has no value -> will be true -> overwrite previous test
statement ok
COPY test5 TO '__TEST_DIR__/to_be_overwritten2' (FORMAT PARQUET, PARTITION_BY (a), OVERWRITE_OR_IGNORE);
query I
SELECT COUNT(*) FROM GLOB('__TEST_DIR__/to_be_overwritten2/a=9/*');
----
1
query III sort
SELECT * FROM '__TEST_DIR__/to_be_overwritten2/a=9/data_0*.parquet';
----
45 59049 9
# Test where overwrite_or_ignore param == true, and filename_pattern is set to an existing file
statement ok
COPY test5 TO '__TEST_DIR__/directory0' (FORMAT PARQUET, PARTITION_BY (a), overwrite_or_ignore TRUE, FILENAME_PATTERN "my_filename");
query I
SELECT COUNT(*) FROM GLOB('__TEST_DIR__/directory0/a=7/*');
----
1
query III sort
SELECT * FROM '__TEST_DIR__/directory0/a=7/my_filename0.parquet';
----
35 16807 7
# When partition columns are written to files, `select *` returns in order of columns
statement ok
COPY test5 TO '__TEST_DIR__/to_be_overwritten2' (FORMAT PARQUET, PARTITION_BY (a), OVERWRITE_OR_IGNORE, WRITE_PARTITION_COLUMNS);
query I
SELECT COUNT(*) FROM GLOB('__TEST_DIR__/to_be_overwritten2/a=9/*');
----
1
query III sort
SELECT * FROM '__TEST_DIR__/to_be_overwritten2/a=9/data_0*.parquet';
----
9 45 59049