# name: test/parquet/test_parquet_schema.test
# description: Parquet reader schema parameter tests
# group: [parquet]
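# the 'schema' parameter is a MAP from field id (INTEGER) or column name (VARCHAR) to a
# STRUCT {name, type, default_value}; fields missing from a file are filled with default_value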
require parquet

# write a file with an explicit field id so the schema parameter can refer to it
statement ok
COPY (SELECT 42::INTEGER i) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {i: 0})

# the schema map values must be STRUCTs
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map{})
----
Invalid Input Error: 'schema' expects a STRUCT as the value type of the map

# can't combine with union_by_name
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, union_by_name=true)
----
Binder Error: Parquet schema cannot be combined with union_by_name=true or hive_partitioning=true

# can't combine with hive_partitioning
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/*.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, hive_partitioning=true)
----
Binder Error: Parquet schema cannot be combined with union_by_name=true or hive_partitioning=true

# write a hive-partitioned dataset, keeping the partition column in the files
statement ok
COPY (
    SELECT 1 i1, 3 i3, 4 i4, 5 i5 UNION ALL
    SELECT 2 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/partitioned' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1}, PARTITION_BY i1, FORMAT parquet, WRITE_PARTITION_COLUMNS)

# auto-detection of hive partitioning is enabled by default,
# but automatically disabled when a schema is supplied, so this should succeed
query IIII
SELECT *
FROM read_parquet('__TEST_DIR__/partitioned/*/*.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5	3	2	1
5	3	2	2

# when partition columns are specified in FIELD_IDS, error message should suggest WRITE_PARTITION_COLUMNS option
statement error
COPY (
    SELECT 1 i1, 3 i3, 4 i4, 5 i5 UNION ALL
    SELECT 2 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/partitioned2' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1}, PARTITION_BY i1, FORMAT parquet)
----
Binder Error: Column name "i1" specified in FIELD_IDS not found. Consider using WRITE_PARTITION_COLUMNS if this column is a partition column. Available column names:

# cannot duplicate field_ids
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    0: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
Map keys must be unique

# cannot duplicate column names
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'cool_column', type: 'BIGINT', default_value: NULL},
    1: {name: 'cool_column', type: 'UTINYINT', default_value: 43}
}) pq
----
Binder Error: table "pq" has duplicate column name "cool_column"

# the supplied default value must be castable to the given type for that column
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'cool_column', type: 'BIGINT', default_value: NULL},
    1: {name: 'cool_column', type: 'UTINYINT', default_value: 'bla'}
}) pq
----
Binder Error: Unable to cast Parquet schema default_value "bla" to UTINYINT

# the schema parameter determines the result column names and types
query IIIIII
DESCRIBE SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
renamed_i	BIGINT	YES	NULL	NULL	NULL
new_column	UTINYINT	YES	NULL	NULL	NULL

# filename=true adds a filename column after the schema columns
query IIIIII
DESCRIBE SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, filename=true)
----
renamed_i	BIGINT	YES	NULL	NULL	NULL
new_column	UTINYINT	YES	NULL	NULL	NULL
filename	VARCHAR	YES	NULL	NULL	NULL

# we'll test if filename works on a persistent file, otherwise __TEST_DIR__ will be different every time
query II
SELECT *
FROM read_parquet('data/parquet-testing/enum.parquet', schema=map {
    1: {name: 'cool_column', type: 'VARCHAR', default_value: NULL}
}, filename=true)
LIMIT 1
----
1	data/parquet-testing/enum.parquet

# field id 0 is renamed and cast, field id 1 is not in the file so its default_value is returned
query II
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
42	43

# we just get a cast error when we can't cast to the supplied type
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'DATE', default_value: NULL}
})
----
Conversion Error

# if we don't supply a field id, we can't refer to it using the schema parameter
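# the file below is written without FIELD_IDS, so both schema entries fall back to their default_value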
statement ok
COPY (SELECT 42::INTEGER i) TO '__TEST_DIR__/integers.parquet'

query II
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
NULL	43

# let's spice it up with more columns
statement ok
COPY (
    SELECT 1 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1})

# this is purposely a bit confusing but we're:
# 1. deleting field id 2
# 2. creating field id 4
# 3. reversing the order of the columns
# 4. renaming them (except i3)
# 5. upcasting them
query IIII
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5	3	2	1

# projection still ok
query I
SELECT i1
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5

# we can still select virtual columns as well
query III
SELECT file_row_number, filename[-16:], i4
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
0	integers.parquet	2

# projection still ok, even with different generated columns
query III
SELECT file_row_number, filename[-16:], i4
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, file_row_number=1, filename=1)
----
0	integers.parquet	2

# count(*) still ok
query I
SELECT count(*)
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
1

# combine with constant column
query II
SELECT i1, filename[-16:]
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, filename=true)
----
5	integers.parquet

# write a bigger file with field ids 33 and 42
statement ok
COPY (
    SELECT range % 4 g, range i FROM range(1000)
) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {g: 33, i: 42})

# let's also do a query with a filter and a downcast
query II
SELECT my_cool_group, sum(my_cool_value)
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    33: {name: 'my_cool_group', type: 'UINTEGER', default_value: NULL},
    42: {name: 'my_cool_value', type: 'UINTEGER', default_value: NULL}
})
WHERE my_cool_group = 2
GROUP BY my_cool_group
----
2	125000

# also test multi-file reading with different field ids
# field id -> value:
# 1 -> 5
# 2 -> 4 (unused)
# 3 -> 3
# 4 -> - (missing)
# 5 -> 1
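# field id 4 is missing from this file, so the query below falls back to its default_value (2)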
statement ok
COPY (
    SELECT
        1 i1,
        3 i3,
        4 i4,
        5 i5
) TO '__TEST_DIR__/multifile1.parquet' (FIELD_IDS {
    i1: 5,
    i3: 3,
    i4: 2,
    i5: 1
})

# field_id -> value:
# 1 -> 1
# 2 -> 3 (unused)
# 3 -> 4
# 4 -> 5
# 5 -> - (missing)
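# field id 5 is missing from this file, so the query below reads NULL (its default_value) for i5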
statement ok
COPY (
    SELECT
        1 j1,
        3 j3,
        4 j4,
        5 j5
) TO '__TEST_DIR__/multifile2.parquet' (FIELD_IDS {
    j1: 1,
    j3: 2,
    j4: 3,
    j5: 4
})

query IIIII
SELECT i1, i3, i4, i5, filename[-18:]
FROM read_parquet('__TEST_DIR__/multifile*.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, filename=true)
ORDER BY filename
----
5	3	2	1	multifile1.parquet
1	4	5	NULL	multifile2.parquet

# map keys must be INTEGER field ids or VARCHAR column names
statement error
select * FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    True: {name: 'my_cool_group', type: 'UINTEGER', default_value: NULL},
    False: {name: 'my_cool_value', type: 'UINTEGER', default_value: NULL}
});
----
Invalid Input Error: 'schema' expects the value type of the map to be either INTEGER or VARCHAR, not BOOLEAN

# the schema map can also be keyed by column name (VARCHAR) instead of field id
query II
SELECT alias(COLUMNS(*)) FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    'i': {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    'j': {name: 'new_column', type: 'UTINYINT', default_value: 43}
}) limit 1;
----
renamed_i	new_column

# issue 15504
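# nested LIST and STRUCT types in the schema parameter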
statement ok
COPY (select 1 as id, list_value('a', 'b', 'c') as arr, { key: 1, v1: 'a', v2: 'b' } as s) TO '__TEST_DIR__/15504.parquet' (field_ids { 'id': 0, 'arr': 1, 's': 2 });

query III
SELECT * FROM read_parquet('__TEST_DIR__/15504.parquet', schema=map { 0: { name: 'id', type: 'int32', default_value: NULL }, 1: { name: 'arr', type: 'varchar[]', default_value: NULL }, 2: { name: 's', type: 'STRUCT(key INT, v1 TEXT, v2 TEXT)', default_value: NULL } });
----
1	[a, b, c]	{'key': 1, 'v1': a, 'v2': b}

# issue 16094
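# filter on a schema column combined with filename=true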
statement ok
copy (
    select
        x
    from generate_series(1,100) as g(x)
) to '__TEST_DIR__/16094.parquet'
with (
    field_ids {x: 1}
);

statement ok
select
    x,
    filename
from read_parquet(
    '__TEST_DIR__/16094.parquet',
    schema=map {
        1: {name: 'x', type: 'int', default_value: NULL}
    },
    filename=True
) where x = 1;