# name: test/parquet/test_parquet_schema.test
# description: Parquet reader schema parameter tests
# group: [parquet]

require parquet
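
# write a file with a single INTEGER column and an explicit field id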
statement ok
COPY (SELECT 42::INTEGER i) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {i: 0})
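
# the 'schema' parameter expects a MAP with STRUCT values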
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map{})
----
Invalid Input Error: 'schema' expects a STRUCT as the value type of the map

# can't combine with union_by_name
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, union_by_name=true)
----
Binder Error: Parquet schema cannot be combined with union_by_name=true or hive_partitioning=true

# can't combine with hive_partitioning
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/*.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, hive_partitioning=true)
----
Binder Error: Parquet schema cannot be combined with union_by_name=true or hive_partitioning=true

statement ok
COPY (
SELECT 1 i1, 3 i3, 4 i4, 5 i5 UNION ALL
SELECT 2 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/partitioned' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1}, PARTITION_BY i1, FORMAT parquet, WRITE_PARTITION_COLUMNS)

# auto-detection of hive partitioning is enabled by default,
# but automatically disabled when a schema is supplied, so this should succeed
query IIII
SELECT *
FROM read_parquet('__TEST_DIR__/partitioned/*/*.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5 3 2 1
5 3 2 2

# when partition columns are specified in FIELD_IDS, the error message should suggest the WRITE_PARTITION_COLUMNS option
statement error
COPY (
SELECT 1 i1, 3 i3, 4 i4, 5 i5 UNION ALL
SELECT 2 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/partitioned2' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1}, PARTITION_BY i1, FORMAT parquet)
----
Binder Error: Column name "i1" specified in FIELD_IDS not found. Consider using WRITE_PARTITION_COLUMNS if this column is a partition column. Available column names:

# cannot duplicate field_ids
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
0: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
Map keys must be unique

# cannot duplicate column names
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'cool_column', type: 'BIGINT', default_value: NULL},
1: {name: 'cool_column', type: 'UTINYINT', default_value: 43}
}) pq
----
Binder Error: table "pq" has duplicate column name "cool_column"

# the supplied default value must be castable to the given type for that column
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'cool_column', type: 'BIGINT', default_value: NULL},
1: {name: 'cool_column', type: 'UTINYINT', default_value: 'bla'}
}) pq
----
Binder Error: Unable to cast Parquet schema default_value "bla" to UTINYINT
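
# DESCRIBE shows the renamed and added columns with the types supplied in the schema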
query IIIIII
DESCRIBE SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
renamed_i BIGINT YES NULL NULL NULL
new_column UTINYINT YES NULL NULL NULL
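
# filename=true appends a filename column after the schema columns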
query IIIIII
DESCRIBE SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, filename=true)
----
renamed_i BIGINT YES NULL NULL NULL
new_column UTINYINT YES NULL NULL NULL
filename VARCHAR YES NULL NULL NULL

# test the filename column on a persistent file, since __TEST_DIR__ is different on every run
query II
SELECT *
FROM read_parquet('data/parquet-testing/enum.parquet', schema=map {
1: {name: 'cool_column', type: 'VARCHAR', default_value: NULL}
}, filename=true)
LIMIT 1
----
1 data/parquet-testing/enum.parquet
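
# the renamed column reads the stored value, the added column falls back to its default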
query II
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
42 43

# we just get a cast error when the stored value cannot be cast to the supplied type
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'DATE', default_value: NULL}
})
----
Conversion Error

# if the file was written without field ids, the schema parameter cannot match its columns
statement ok
COPY (SELECT 42::INTEGER i) TO '__TEST_DIR__/integers.parquet'

query II
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
NULL 43

# let's spice it up with more columns
statement ok
COPY (
SELECT 1 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1})

# this is purposely a bit confusing, but we're:
# 1. deleting field id 2
# 2. creating field id 4
# 3. reversing the order of the columns
# 4. renaming them (except i3)
# 5. upcasting them
query IIII
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5 3 2 1

# projection still ok
query I
SELECT i1
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5

# we can still select virtual columns as well
query III
SELECT file_row_number, filename[-16:], i4
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
0 integers.parquet 2

# projection still ok, even with the generated columns enabled explicitly
query III
SELECT file_row_number, filename[-16:], i4
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, file_row_number=1, filename=1)
----
0 integers.parquet 2

# count(*) still ok
query I
SELECT count(*)
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
1

# combine with a constant column
query II
SELECT i1, filename[-16:]
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, filename=true)
----
5 integers.parquet
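
# write a larger file to exercise filters and aggregation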
statement ok
COPY (
SELECT range % 4 g, range i FROM range(1000)
) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {g: 33, i: 42})

# let's also do a query with a filter and a downcast
query II
SELECT my_cool_group, sum(my_cool_value)
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
33: {name: 'my_cool_group', type: 'UINTEGER', default_value: NULL},
42: {name: 'my_cool_value', type: 'UINTEGER', default_value: NULL}
})
WHERE my_cool_group = 2
GROUP BY my_cool_group
----
2 125000

# also test multi-file reading with different field ids
# field id -> value:
# 1 -> 5
# 2 -> 4 (unused)
# 3 -> 3
# 4 -> - (missing)
# 5 -> 1
statement ok
COPY (
SELECT
1 i1,
3 i3,
4 i4,
5 i5
) TO '__TEST_DIR__/multifile1.parquet' (FIELD_IDS {
i1: 5,
i3: 3,
i4: 2,
i5: 1
})

# field id -> value:
# 1 -> 1
# 2 -> 3 (unused)
# 3 -> 4
# 4 -> 5
# 5 -> - (missing)
statement ok
COPY (
SELECT
1 j1,
3 j3,
4 j4,
5 j5
) TO '__TEST_DIR__/multifile2.parquet' (FIELD_IDS {
j1: 1,
j3: 2,
j4: 3,
j5: 4
})
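
# field id 4 is missing from the first file (its default of 2 applies), field id 5 from the second (default NULL)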
query IIIII
SELECT i1, i3, i4, i5, filename[-18:]
FROM read_parquet('__TEST_DIR__/multifile*.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, filename=true)
ORDER BY filename
----
5 3 2 1 multifile1.parquet
1 4 5 NULL multifile2.parquet
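
# map keys must be either INTEGER (field id) or VARCHAR (column name)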
statement error
select * FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
True: {name: 'my_cool_group', type: 'UINTEGER', default_value: NULL},
False: {name: 'my_cool_value', type: 'UINTEGER', default_value: NULL}
});
----
Invalid Input Error: 'schema' expects the value type of the map to be either INTEGER or VARCHAR, not BOOLEAN
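
# VARCHAR keys match columns by name instead of by field id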
query II
SELECT alias(COLUMNS(*)) FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
'i': {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
'j': {name: 'new_column', type: 'UTINYINT', default_value: 43}
}) limit 1;
----
renamed_i new_column

# issue 15504
statement ok
COPY (select 1 as id, list_value('a', 'b', 'c') as arr, { key: 1, v1: 'a', v2: 'b' } as s) TO '__TEST_DIR__/15504.parquet' (field_ids { 'id': 0, 'arr': 1, 's': 2 });
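
# nested types (lists and structs) also work with the schema parameter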
query III
SELECT * FROM read_parquet('__TEST_DIR__/15504.parquet', schema=map { 0: { name: 'id', type: 'int32', default_value: NULL }, 1: { name: 'arr', type: 'varchar[]', default_value: NULL }, 2: { name: 's', type: 'STRUCT(key INT, v1 TEXT, v2 TEXT)', default_value: NULL } });
----
1 [a, b, c] {'key': 1, 'v1': a, 'v2': b}

# issue 16094
statement ok
copy (
select
x
from generate_series(1,100) as g(x)
) to '__TEST_DIR__/16094.parquet'
with (
field_ids {x: 1}
);
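
# filtering on a schema-mapped column while also selecting filename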
statement ok
select
x,
filename
from read_parquet(
'__TEST_DIR__/16094.parquet',
schema=map {
1: {name: 'x', type: 'int', default_value: NULL}
},
filename=True
) where x = 1;