Files
email-tracker/external/duckdb/test/sql/copy/parquet/parquet_filename.test
2025-10-24 19:21:19 -05:00

108 lines
3.3 KiB
SQL

# name: test/sql/copy/parquet/parquet_filename.test
# description: Test the filename option of the parquet reader
# group: [parquet]
require parquet
# Glob across the fixture directories with the filename column switched on.
# Backslashes are rewritten to forward slashes so the expected paths also match on Windows.
query III
SELECT
    i,
    j,
    replace(filename, '\', '/')
FROM parquet_scan('data/parquet-testing/glob*/t?.parquet', FILENAME=1)
ORDER BY i;
----
1 a data/parquet-testing/glob/t1.parquet
2 b data/parquet-testing/glob/t2.parquet
3 c data/parquet-testing/glob2/t1.parquet

# Filter through the aliased, slash-normalised filename column:
# only the row read from glob2/t1.parquet should be returned.
query III
SELECT
    i,
    j,
    replace(filename, '\', '/') AS file
FROM parquet_scan('data/parquet-testing/glob*/t?.parquet', FILENAME=1)
WHERE file = 'data/parquet-testing/glob2/t1.parquet';
----
3 c data/parquet-testing/glob2/t1.parquet

# Count the filename values of the rows that survive a filter on a regular column.
# Intended (per the original note) to cover a row count spanning multiple vector sizes.
query I
SELECT count(filename)
FROM parquet_scan('data/parquet-testing/p2.parquet', FILENAME=1)
WHERE id < 1000;
----
479

# Filter pushdown case: the predicate sits directly on the generated filename column.
# The scanned path starts with 'data', so the comparison is expected to hold for every row.
query I
SELECT count(id)
FROM parquet_scan('data/parquet-testing/p2.parquet', FILENAME=1)
WHERE filename >= 'data';
----
4979

# Filter on a data column while projecting only the (slash-normalised) filename:
# the row with i = 2 comes from glob/t2.parquet.
query I
SELECT replace(filename, '\', '/')
FROM parquet_scan('data/parquet-testing/glob*/t?.parquet', FILENAME=1)
WHERE i = 2;
----
data/parquet-testing/glob/t2.parquet

# CSV filename name conflict: write a CSV whose data already has a column called "filename"
statement ok
CREATE TABLE test_csv AS
SELECT
    1 AS id,
    'test_csv_content' AS filename;

statement ok
COPY test_csv
TO '__TEST_DIR__/filename_as_column.csv'
WITH HEADER;

# Reading that file back with the filename option currently fails with a binder error.
# NOTE(review): no expected message follows the separator, so any error satisfies this check - consider pinning the message.
statement error
SELECT
    id,
    filename
FROM read_csv_auto('__TEST_DIR__/filename_as_column.csv', FILENAME=1);
----

# Parquet filename name conflict: write a Parquet file whose data already has a column called "filename"
statement ok
CREATE TABLE test AS
SELECT
    1 AS id,
    'test' AS filename;

statement ok
COPY test
TO '__TEST_DIR__/filename_as_column.parquet';

# A column named filename is currently not supported together with the filename option, so this scan must fail.
# NOTE(review): no expected message follows the separator, so any error satisfies this check - consider pinning the message.
statement error
SELECT *
FROM parquet_scan('__TEST_DIR__/filename_as_column.parquet', FILENAME=1);
----

# Loading into a table: the target has a third column that receives the generated filename
statement ok
CREATE TABLE test_copy (
    i INT,
    j VARCHAR,
    filename VARCHAR
);

# First load: filename option combined with binary_as_string
statement ok
INSERT INTO test_copy
FROM read_parquet('data/parquet-testing/glob/t1.parquet', filename=1, binary_as_string=1);

query III
SELECT
    i,
    j,
    replace(filename, '\', '/')
FROM test_copy;
----
1 a data/parquet-testing/glob/t1.parquet

# Second load of the same file with the filename option only; the table now holds the row twice
statement ok
INSERT INTO test_copy
FROM read_parquet('data/parquet-testing/glob/t1.parquet', filename=1);

query III
SELECT
    i,
    j,
    replace(filename, '\', '/')
FROM test_copy;
----
1 a data/parquet-testing/glob/t1.parquet
1 a data/parquet-testing/glob/t1.parquet

# A plain COPY ... FROM passes no filename option, so it is expected to fail against the three-column table
statement error
COPY test_copy
FROM 'data/parquet-testing/glob/t1.parquet';
----
column count mismatch

# Multiple row groups in one file: 10000 rows (0..9999) are written with ROW_GROUP_SIZE 1000
statement ok
CREATE TABLE test_table_large AS
SELECT *
FROM range(0,10000) tbl(i);

statement ok
COPY test_table_large
TO '__TEST_DIR__/test_table_large.parquet' (ROW_GROUP_SIZE 1000);

# The regexp removes everything through the last '/', leaving only the base file name,
# so the expected value does not depend on where __TEST_DIR__ points.
query II
SELECT
    sum(i),
    max(regexp_replace(filename, '^.*/', ''))
FROM parquet_scan('__TEST_DIR__/test_table_large.parquet', FILENAME=1)
WHERE i > 5000;
----
37492500 test_table_large.parquet

# The file list names t1.parquet twice and t2.parquet once;
# filtering on the filename keeps the t1 row once per listing.
query III
SELECT
    i,
    j,
    replace(filename, '\', '/') AS file
FROM parquet_scan(
    [
        'data/parquet-testing/glob/t1.parquet',
        'data/parquet-testing/glob/t1.parquet',
        'data/parquet-testing/glob/t2.parquet'
    ],
    FILENAME=1
)
WHERE file LIKE '%t1%';
----
1 a data/parquet-testing/glob/t1.parquet
1 a data/parquet-testing/glob/t1.parquet
