# name: test/parquet/test_parquet_schema.test
# description: Parquet reader schema parameter tests
# group: [parquet]

require parquet
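
# write a file with a single INTEGER column and an explicit field id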
statement ok
COPY (SELECT 42::INTEGER i) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {i: 0})
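
# the 'schema' parameter expects a MAP with STRUCT values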
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map{})
----
Invalid Input Error: 'schema' expects a STRUCT as the value type of the map

# can't combine with union_by_name
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, union_by_name=true)
----
Binder Error: Parquet schema cannot be combined with union_by_name=true or hive_partitioning=true

# can't combine with hive_partitioning
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/*.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, hive_partitioning=true)
----
Binder Error: Parquet schema cannot be combined with union_by_name=true or hive_partitioning=true

statement ok
COPY (
SELECT 1 i1, 3 i3, 4 i4, 5 i5 UNION ALL
SELECT 2 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/partitioned' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1}, PARTITION_BY i1, FORMAT parquet, WRITE_PARTITION_COLUMNS)

# auto-detection of hive partitioning is enabled by default,
# but automatically disabled when a schema is supplied, so this should succeed
query IIII
SELECT *
FROM read_parquet('__TEST_DIR__/partitioned/*/*.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5 3 2 1
5 3 2 2

# when partition columns are specified in FIELD_IDS, the error message should suggest the WRITE_PARTITION_COLUMNS option
statement error
COPY (
SELECT 1 i1, 3 i3, 4 i4, 5 i5 UNION ALL
SELECT 2 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/partitioned2' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1}, PARTITION_BY i1, FORMAT parquet)
----
Binder Error: Column name "i1" specified in FIELD_IDS not found. Consider using WRITE_PARTITION_COLUMNS if this column is a partition column. Available column names:

# cannot duplicate field_ids
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
0: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
Map keys must be unique

# cannot duplicate column names
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'cool_column', type: 'BIGINT', default_value: NULL},
1: {name: 'cool_column', type: 'UTINYINT', default_value: 43}
}) pq
----
Binder Error: table "pq" has duplicate column name "cool_column"

# the supplied default value must be castable to the given type for that column
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'cool_column', type: 'BIGINT', default_value: NULL},
1: {name: 'cool_column', type: 'UTINYINT', default_value: 'bla'}
}) pq
----
Binder Error: Unable to cast Parquet schema default_value "bla" to UTINYINT
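
# DESCRIBE shows the renamed and added columns with the types supplied in the schema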
query IIIIII
DESCRIBE SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
renamed_i BIGINT YES NULL NULL NULL
new_column UTINYINT YES NULL NULL NULL
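
# filename=true appends a filename column after the schema columns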
query IIIIII
DESCRIBE SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, filename=true)
----
renamed_i BIGINT YES NULL NULL NULL
new_column UTINYINT YES NULL NULL NULL
filename VARCHAR YES NULL NULL NULL

# test the filename column on a persistent file, since __TEST_DIR__ is different on every run
query II
SELECT *
FROM read_parquet('data/parquet-testing/enum.parquet', schema=map {
1: {name: 'cool_column', type: 'VARCHAR', default_value: NULL}
}, filename=true)
LIMIT 1
----
1 data/parquet-testing/enum.parquet
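
# the renamed column reads the stored value, the added column falls back to its default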
query II
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
42 43

# we just get a cast error when the stored value cannot be cast to the supplied type
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'DATE', default_value: NULL}
})
----
Conversion Error

# if the file was written without field ids, the schema parameter cannot match its columns
statement ok
COPY (SELECT 42::INTEGER i) TO '__TEST_DIR__/integers.parquet'

query II
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
NULL 43

# let's spice it up with more columns
statement ok
COPY (
SELECT 1 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1})

# this is purposely a bit confusing, but we're:
# 1. deleting field id 2
# 2. creating field id 4
# 3. reversing the order of the columns
# 4. renaming them (except i3)
# 5. upcasting them
query IIII
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5 3 2 1

# projection still ok
query I
SELECT i1
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5

# we can still select virtual columns as well
query III
SELECT file_row_number, filename[-16:], i4
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
0 integers.parquet 2

# projection still ok, even with the generated columns enabled explicitly
query III
SELECT file_row_number, filename[-16:], i4
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, file_row_number=1, filename=1)
----
0 integers.parquet 2

# count(*) still ok
query I
SELECT count(*)
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
1

# combine with a constant column
query II
SELECT i1, filename[-16:]
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, filename=true)
----
5 integers.parquet
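
# write a larger file to exercise filters and aggregation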
statement ok
COPY (
SELECT range % 4 g, range i FROM range(1000)
) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {g: 33, i: 42})

# let's also do a query with a filter and a downcast
query II
SELECT my_cool_group, sum(my_cool_value)
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
33: {name: 'my_cool_group', type: 'UINTEGER', default_value: NULL},
42: {name: 'my_cool_value', type: 'UINTEGER', default_value: NULL}
})
WHERE my_cool_group = 2
GROUP BY my_cool_group
----
2 125000

# also test multi-file reading with different field ids
# field id -> value:
# 1 -> 5
# 2 -> 4 (unused)
# 3 -> 3
# 4 -> - (missing)
# 5 -> 1
statement ok
COPY (
SELECT
1 i1,
3 i3,
4 i4,
5 i5
) TO '__TEST_DIR__/multifile1.parquet' (FIELD_IDS {
i1: 5,
i3: 3,
i4: 2,
i5: 1
})

# field id -> value:
# 1 -> 1
# 2 -> 3 (unused)
# 3 -> 4
# 4 -> 5
# 5 -> - (missing)
statement ok
COPY (
SELECT
1 j1,
3 j3,
4 j4,
5 j5
) TO '__TEST_DIR__/multifile2.parquet' (FIELD_IDS {
j1: 1,
j3: 2,
j4: 3,
j5: 4
})
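
# field id 4 is missing from the first file (its default of 2 applies), field id 5 from the second (default NULL)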
query IIIII
SELECT i1, i3, i4, i5, filename[-18:]
FROM read_parquet('__TEST_DIR__/multifile*.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, filename=true)
ORDER BY filename
----
5 3 2 1 multifile1.parquet
1 4 5 NULL multifile2.parquet
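
# map keys must be either INTEGER (field id) or VARCHAR (column name)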
statement error
select * FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
True: {name: 'my_cool_group', type: 'UINTEGER', default_value: NULL},
False: {name: 'my_cool_value', type: 'UINTEGER', default_value: NULL}
});
----
Invalid Input Error: 'schema' expects the value type of the map to be either INTEGER or VARCHAR, not BOOLEAN
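
# VARCHAR keys match columns by name instead of by field id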
query II
SELECT alias(COLUMNS(*)) FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
'i': {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
'j': {name: 'new_column', type: 'UTINYINT', default_value: 43}
}) limit 1;
----
renamed_i new_column

# issue 15504
statement ok
COPY (select 1 as id, list_value('a', 'b', 'c') as arr, { key: 1, v1: 'a', v2: 'b' } as s) TO '__TEST_DIR__/15504.parquet' (field_ids { 'id': 0, 'arr': 1, 's': 2 });
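
# nested types (lists and structs) also work with the schema parameter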
query III
SELECT * FROM read_parquet('__TEST_DIR__/15504.parquet', schema=map { 0: { name: 'id', type: 'int32', default_value: NULL }, 1: { name: 'arr', type: 'varchar[]', default_value: NULL }, 2: { name: 's', type: 'STRUCT(key INT, v1 TEXT, v2 TEXT)', default_value: NULL } });
----
1 [a, b, c] {'key': 1, 'v1': a, 'v2': b}

# issue 16094
statement ok
copy (
select
x
from generate_series(1,100) as g(x)
) to '__TEST_DIR__/16094.parquet'
with (
field_ids {x: 1}
);
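
# filtering on a schema-mapped column while also selecting filename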
statement ok
select
x,
filename
from read_parquet(
'__TEST_DIR__/16094.parquet',
schema=map {
1: {name: 'x', type: 'int', default_value: NULL}
},
filename=True
) where x = 1;