should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion

View File

@@ -0,0 +1,52 @@
# name: test/parquet/bss_roundtrip.test_slow
# description: Test BYTE_STREAM_SPLIT roundtrip
# group: [parquet]
require parquet
require tpch
statement ok
call dbgen(sf=0.01);
statement ok
create view doubles as
select l_quantity::double l_quantity,
l_extendedprice::double l_extendedprice,
l_discount::double l_discount,
l_tax::double l_tax,
from lineitem
query IIII nosort q0
from doubles
----
statement ok
copy doubles to '__TEST_DIR__/bss.parquet' (PARQUET_VERSION V2);
query IIII nosort q0
from '__TEST_DIR__/bss.parquet';
----
statement ok
create view floats as
select l_quantity::float l_quantity,
l_extendedprice::float l_extendedprice,
l_discount::float l_discount,
l_tax::float l_tax,
from lineitem
query IIII nosort q1
from floats
----
statement ok
copy floats to '__TEST_DIR__/bss.parquet' (PARQUET_VERSION V2);
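# illustrative note (not part of the original test): the encoding that was written can be inspected with
#   select path_in_schema, encodings from parquet_metadata('__TEST_DIR__/bss.parquet');
# with PARQUET_VERSION V2, the FLOAT/DOUBLE columns are expected to show BYTE_STREAM_SPLIT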
query IIII nosort q1
from '__TEST_DIR__/bss.parquet';
----

View File

@@ -0,0 +1,10 @@
# name: test/parquet/concatenated_gzip_members.test
# description: Test reading Parquet file with concatenated GZIP members
# group: [parquet]
require parquet
query I
from 'data/parquet-testing/concatenated_gzip_members.parquet' offset 512;
----
513

View File

@@ -0,0 +1,26 @@
# name: test/parquet/constant_dictionary_vector_parquet.test
# description: Test that we retain constant/dictionary compression for strings when writing to Parquet (small data)
# group: [parquet]
require vector_size 2048
require parquet
# low memory limit to test that we don't blow up intermediates
statement ok
set memory_limit='10mb'
# we should be able to do this without spilling
statement ok
set temp_directory=null
# 1k strings of ~50kb = ~50 MB
# the ColumnDataCollection should keep the constant string compressed
# and the Parquet writer will use dictionary compression, not blowing them up there either
statement ok
copy (select repeat('a', 50_000) s from range(1000)) to '__TEST_DIR__/cdc_constant.parquet'
# the written file has dictionary compression
# when we copy it over to another file we should still be able to avoid blowing it up
statement ok
copy (from '__TEST_DIR__/cdc_constant.parquet') to '__TEST_DIR__/cdc_dictionary.parquet'
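# illustrative note (not part of the original test): the column encodings of the copied file can be checked with
#   select path_in_schema, encodings from parquet_metadata('__TEST_DIR__/cdc_dictionary.parquet');
# column s is expected to report a dictionary encoding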

View File

@@ -0,0 +1,26 @@
# name: test/parquet/constant_dictionary_vector_parquet.test_slow
# description: Test that we retain constant/dictionary compression for strings when writing to Parquet (big data)
# group: [parquet]
require vector_size 2048
require parquet
# low memory limit to test that we don't blow up intermediates
statement ok
set memory_limit='100mb'
# we should be able to do this without spilling
statement ok
set temp_directory=null
# 100k strings of ~50kb = ~5 GB
# the ColumnDataCollection should keep the constant string compressed
# and the Parquet writer will use dictionary compression, not blowing them up there either
statement ok
copy (select repeat('a', 50_000) s from range(100_000)) to '__TEST_DIR__/cdc_constant.parquet'
# the written file has dictionary compression
# when we copy it over to another file we should still be able to avoid blowing it up
statement ok
copy (from '__TEST_DIR__/cdc_constant.parquet') to '__TEST_DIR__/cdc_dictionary.parquet'

View File

@@ -0,0 +1,11 @@
# name: test/parquet/dbp_small_decimal.test
# description: Test parquet file with a small decimal column (1,0) in dbp encoding
# group: [parquet]
require parquet
query III
select * from 'data/parquet-testing/dbp_small_decimal.parquet' ;
----
1 10.0 diez
2 20.0 vente

View File

@@ -0,0 +1,19 @@
# name: test/parquet/encrypted_parquet.test
# description: Test Parquet reader on data/parquet-testing/encryption
# group: [parquet]
# TODO: re-enable these tests once we support the full Parquet Encryption spec
# for now, parquet crypto tests are in test/sql/copy/parquet/parquet_encryption.test_slow
mode skip
require parquet
statement error
SELECT * FROM parquet_scan('data/parquet-testing/encryption/encrypted_footer.parquet') limit 50;
----
Invalid Input Error: Encrypted Parquet files are not supported for file 'data/parquet-testing/encryption/encrypted_footer.parquet'
statement error
SELECT * FROM parquet_scan('data/parquet-testing/encryption/encrypted_column.parquet') limit 50;
----
Invalid Error: Failed to read Parquet file "data/parquet-testing/encryption/encrypted_column.parquet": Encrypted Parquet files are not supported

View File

@@ -0,0 +1,134 @@
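# Generates test/parquet/test_parquet_reader.test: for every .parquet file under data/parquet-testing,
# the DuckDB parquet_scan output is compared against PyArrow's reader; files PyArrow cannot read are
# skipped, and cases where the results differ are emitted wrapped in 'mode skip'/'mode unskip'.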
import duckdb
import os
import sys
try:
import pyarrow
import pyarrow.parquet
can_run = True
except ImportError:
can_run = False
def generate_header(f):
f.write(
'''# name: test/parquet/test_parquet_reader.test
# description: Test Parquet Reader with files on data/parquet-testing
# group: [parquet]
require parquet
statement ok
PRAGMA enable_verification
'''
)
def get_files():
files_path = []
path = os.path.dirname(os.path.realpath(__file__))
path = os.path.join(path, '..', '..')
os.chdir(path)
path = os.path.join('data', 'parquet-testing')
for root, dirs, files in os.walk(path):
for file in files:
if file.endswith(".parquet"):
files_path.append(os.path.join(root, file))
return files_path
def get_duckdb_answer(file_path):
answer = []
try:
answer = duckdb.query("SELECT * FROM parquet_scan('" + file_path + "') limit 50").fetchall()
except Exception as e:
print(e)
answer = 'fail'
return answer
def get_arrow_answer(file_path):
answer = []
try:
arrow = pyarrow.parquet.read_table(file_path)
duck_rel = duckdb.from_arrow(arrow).limit(50)
answer = duck_rel.fetchall()
return answer
except Exception:
return 'fail'
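# Classify the comparison: 'skip' when PyArrow cannot read the file, 'fail' when DuckDB errors or the results differ, 'pass' otherwise.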
def check_result(duckdb_result, arrow_result):
if arrow_result == 'fail':
return 'skip'
if duckdb_result == 'fail':
return 'fail'
if duckdb_result != arrow_result:
return 'fail'
return 'pass'
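# The helpers below render expected results in sqllogictest form: one tab-separated line per row, None printed as NULL, byte/quote markers stripped.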
def sanitize_string(s):
return str(s).replace('None', 'NULL').replace("b'", "").replace("'", "")
def result_to_string(arrow_result):
result = ''
for row_idx in range(len(arrow_result)):
for col_idx in range(len(arrow_result[0])):
value = arrow_result[row_idx][col_idx]
if isinstance(value, dict):
items = [f"'{k}': {sanitize_string(v)}" for k, v in value.items()] # no quotes
value = "{" + ", ".join(items) + "}"
print(type(value), value)
else:
value = sanitize_string(value)
result += value + "\t"
result += "\n"
result += "\n"
return result
def generate_parquet_test_body(result, arrow_result, file_path):
columns = 'I' * len(arrow_result[0])
test_body = "query " + columns + "\n"
test_body += "SELECT * FROM parquet_scan('" + file_path + "') limit 50 \n"
test_body += "----\n"
test_body += result_to_string(arrow_result)
return test_body
def generate_test(file_path):
duckdb_result = get_duckdb_answer(file_path)
arrow_result = get_arrow_answer(file_path)
result = check_result(duckdb_result, arrow_result)
test_body = ""
if result == 'skip':
return
if result == 'fail':
test_body += "mode skip \n\n"
test_body += generate_parquet_test_body(result, arrow_result, file_path)
test_body += "mode unskip \n\n"
else:
test_body += generate_parquet_test_body(result, duckdb_result, file_path)
return test_body
def generate_body(f):
files_path = get_files()
for file in files_path:
print(file)
test_body = generate_test(file)
if test_body is not None:
f.write(test_body)
f = open("test_parquet_reader.test", "w")
generate_header(f)
generate_body(f)
f.close()

View File

@@ -0,0 +1,18 @@
# name: test/parquet/invalid_parquet.test
# description: Test Parquet Reader on data/parquet-testing/invalid.parquet
# group: [parquet]
require parquet
statement error
SELECT * FROM parquet_scan('data/parquet-testing/invalid.parquet') limit 50;
----
Invalid Input Error: Invalid string encoding found in Parquet file: value "TREL\xC3" is not valid UTF8!
statement ok
pragma disable_optimizer
statement error
SELECT * FROM parquet_scan('data/parquet-testing/invalid.parquet') limit 50;
----
Invalid Input Error: Invalid string encoding found in Parquet file: value "TREL\xC3" is not valid UTF8!

View File

@@ -0,0 +1,33 @@
# name: test/parquet/parquet_combine.test
# description: Test Parquet Reader row group combining
# group: [parquet]
require parquet
require vector_size 2048
statement ok
set threads=2;
# before we combined data from multiple threads into the same row group,
# this would create 4 row groups; now it should create 3
statement ok
copy (with cte as (from range(2049) union all from range(2049)) from cte) to '__TEST_DIR__/parquet_combine.parquet' (row_group_size 2048);
query I
select count(*) from parquet_metadata('__TEST_DIR__/parquet_combine.parquet')
----
3
# works not just with row_group_size, but also with row_group_size_bytes
statement ok
set preserve_insertion_order=false;
# used to create 4, now it should create 3
statement ok
copy (with cte as (from range(100_000) union all from range(100_000)) from cte) to '__TEST_DIR__/parquet_combine.parquet' (row_group_size_bytes 750_000);
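# illustrative note (not part of the original test): the resulting row groups can be inspected with
#   select row_group_id, row_group_num_rows from parquet_metadata('__TEST_DIR__/parquet_combine.parquet');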
query I
select count(*) from parquet_metadata('__TEST_DIR__/parquet_combine.parquet')
----
3

View File

@@ -0,0 +1,17 @@
# name: test/parquet/parquet_fuzzer_issues.test
# description: Test Parquet fuzzer issues
# group: [parquet]
require parquet
# internal issue 6129
statement error
from 'data/parquet-testing/broken/internal_6129.parquet'
----
invalid number of miniblocks per block
# internal issue 6165
statement error
from 'data/parquet-testing/broken/internal_6165.parquet';
----
row group does not have enough columns

View File

@@ -0,0 +1,26 @@
# name: test/parquet/parquet_long_string_stats.test
# description: Test internal issue #2289 - Performance of Parquet reader
# group: [parquet]
require httpfs
require parquet
# need to disable this otherwise we just cache everything
statement ok
set enable_external_file_cache=false;
statement ok
set parquet_metadata_cache=true;
# the constant comparison that is pushed down is longer than the 8-byte prefix that DuckDB keeps in StringStatistics
# its prefix is equal to the max up to the last byte
# previously, we would read 5.4MB to figure out that we can prune the entire file
# now, we can prune it based on the metadata
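# illustrative note (not part of the original test): the row-group string statistics that the pruning relies on
# could be inspected with something like
#   select path_in_schema, stats_min_value, stats_max_value
#   from parquet_metadata('https://github.com/duckdb/duckdb-data/releases/download/v1.0/event_baserunning_advance_attempt.parquet');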
query II
explain analyze
select count(*)
FROM 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/event_baserunning_advance_attempt.parquet'
where game_id > 'WS2197109301';
----
analyzed_plan <REGEX>:.*GET: 1.*

View File

@@ -0,0 +1,441 @@
# name: test/parquet/parquet_null_compressed_materialization.test_slow
# description: Test if we can do compressed materialization for all-NULL Parquet columns in a join
# group: [parquet]
require parquet
statement ok
SET preserve_insertion_order=false;
# create a huge Parquet file with mostly NULL in it
statement ok
COPY (
SELECT
range pk,
NULL::VARCHAR c0,
NULL::VARCHAR c1,
NULL::VARCHAR c2,
NULL::VARCHAR c3,
NULL::VARCHAR c4,
NULL::VARCHAR c5,
NULL::VARCHAR c6,
NULL::VARCHAR c7,
NULL::VARCHAR c8,
NULL::VARCHAR c9,
NULL::VARCHAR c10,
NULL::VARCHAR c11,
NULL::VARCHAR c12,
NULL::VARCHAR c13,
NULL::VARCHAR c14,
NULL::VARCHAR c15,
NULL::VARCHAR c16,
NULL::VARCHAR c17,
NULL::VARCHAR c18,
NULL::VARCHAR c19,
NULL::VARCHAR c20,
NULL::VARCHAR c21,
NULL::VARCHAR c22,
NULL::VARCHAR c23,
NULL::VARCHAR c24,
NULL::VARCHAR c25,
NULL::VARCHAR c26,
NULL::VARCHAR c27,
NULL::VARCHAR c28,
NULL::VARCHAR c29,
NULL::VARCHAR c30,
NULL::VARCHAR c31,
NULL::VARCHAR c32,
NULL::VARCHAR c33,
NULL::VARCHAR c34,
NULL::VARCHAR c35,
NULL::VARCHAR c36,
NULL::VARCHAR c37,
NULL::VARCHAR c38,
NULL::VARCHAR c39,
NULL::VARCHAR c40,
NULL::VARCHAR c41,
NULL::VARCHAR c42,
NULL::VARCHAR c43,
NULL::VARCHAR c44,
NULL::VARCHAR c45,
NULL::VARCHAR c46,
NULL::VARCHAR c47,
NULL::VARCHAR c48,
NULL::VARCHAR c49,
NULL::VARCHAR c50,
NULL::VARCHAR c51,
NULL::VARCHAR c52,
NULL::VARCHAR c53,
NULL::VARCHAR c54,
NULL::VARCHAR c55,
NULL::VARCHAR c56,
NULL::VARCHAR c57,
NULL::VARCHAR c58,
NULL::VARCHAR c59,
NULL::VARCHAR c60,
NULL::VARCHAR c61,
NULL::VARCHAR c62,
NULL::VARCHAR c63,
NULL::VARCHAR c64,
NULL::VARCHAR c65,
NULL::VARCHAR c66,
NULL::VARCHAR c67,
NULL::VARCHAR c68,
NULL::VARCHAR c69,
NULL::VARCHAR c70,
NULL::VARCHAR c71,
NULL::VARCHAR c72,
NULL::VARCHAR c73,
NULL::VARCHAR c74,
NULL::VARCHAR c75,
NULL::VARCHAR c76,
NULL::VARCHAR c77,
NULL::VARCHAR c78,
NULL::VARCHAR c79,
NULL::VARCHAR c80,
NULL::VARCHAR c81,
NULL::VARCHAR c82,
NULL::VARCHAR c83,
NULL::VARCHAR c84,
NULL::VARCHAR c85,
NULL::VARCHAR c86,
NULL::VARCHAR c87,
NULL::VARCHAR c88,
NULL::VARCHAR c89,
NULL::VARCHAR c90,
NULL::VARCHAR c91,
NULL::VARCHAR c92,
NULL::VARCHAR c93,
NULL::VARCHAR c94,
NULL::VARCHAR c95,
NULL::VARCHAR c96,
NULL::VARCHAR c97,
NULL::VARCHAR c98,
NULL::VARCHAR c99,
NULL::VARCHAR c100,
NULL::VARCHAR c101,
NULL::VARCHAR c102,
NULL::VARCHAR c103,
NULL::VARCHAR c104,
NULL::VARCHAR c105,
NULL::VARCHAR c106,
NULL::VARCHAR c107,
NULL::VARCHAR c108,
NULL::VARCHAR c109,
NULL::VARCHAR c110,
NULL::VARCHAR c111,
NULL::VARCHAR c112,
NULL::VARCHAR c113,
NULL::VARCHAR c114,
NULL::VARCHAR c115,
NULL::VARCHAR c116,
NULL::VARCHAR c117,
NULL::VARCHAR c118,
NULL::VARCHAR c119,
NULL::VARCHAR c120,
NULL::VARCHAR c121,
NULL::VARCHAR c122,
NULL::VARCHAR c123,
NULL::VARCHAR c124,
NULL::VARCHAR c125,
NULL::VARCHAR c126,
NULL::VARCHAR c127,
NULL::VARCHAR c128,
NULL::VARCHAR c129,
NULL::VARCHAR c130,
NULL::VARCHAR c131,
NULL::VARCHAR c132,
NULL::VARCHAR c133,
NULL::VARCHAR c134,
NULL::VARCHAR c135,
NULL::VARCHAR c136,
NULL::VARCHAR c137,
NULL::VARCHAR c138,
NULL::VARCHAR c139,
NULL::VARCHAR c140,
NULL::VARCHAR c141,
NULL::VARCHAR c142,
NULL::VARCHAR c143,
NULL::VARCHAR c144,
NULL::VARCHAR c145,
NULL::VARCHAR c146,
NULL::VARCHAR c147,
NULL::VARCHAR c148,
NULL::VARCHAR c149,
NULL::VARCHAR c150,
NULL::VARCHAR c151,
NULL::VARCHAR c152,
NULL::VARCHAR c153,
NULL::VARCHAR c154,
NULL::VARCHAR c155,
NULL::VARCHAR c156,
NULL::VARCHAR c157,
NULL::VARCHAR c158,
NULL::VARCHAR c159,
NULL::VARCHAR c160,
NULL::VARCHAR c161,
NULL::VARCHAR c162,
NULL::VARCHAR c163,
NULL::VARCHAR c164,
NULL::VARCHAR c165,
NULL::VARCHAR c166,
NULL::VARCHAR c167,
NULL::VARCHAR c168,
NULL::VARCHAR c169,
NULL::VARCHAR c170,
NULL::VARCHAR c171,
NULL::VARCHAR c172,
NULL::VARCHAR c173,
NULL::VARCHAR c174,
NULL::VARCHAR c175,
NULL::VARCHAR c176,
NULL::VARCHAR c177,
NULL::VARCHAR c178,
NULL::VARCHAR c179,
NULL::VARCHAR c180,
NULL::VARCHAR c181,
NULL::VARCHAR c182,
NULL::VARCHAR c183,
NULL::VARCHAR c184,
NULL::VARCHAR c185,
NULL::VARCHAR c186,
NULL::VARCHAR c187,
NULL::VARCHAR c188,
NULL::VARCHAR c189,
NULL::VARCHAR c190,
NULL::VARCHAR c191,
NULL::VARCHAR c192,
NULL::VARCHAR c193,
NULL::VARCHAR c194,
NULL::VARCHAR c195,
NULL::VARCHAR c196,
NULL::VARCHAR c197,
NULL::VARCHAR c198,
NULL::VARCHAR c199,
NULL::VARCHAR c200,
NULL::VARCHAR c201,
NULL::VARCHAR c202,
NULL::VARCHAR c203,
NULL::VARCHAR c204,
NULL::VARCHAR c205,
NULL::VARCHAR c206,
NULL::VARCHAR c207,
NULL::VARCHAR c208,
NULL::VARCHAR c209,
NULL::VARCHAR c210,
NULL::VARCHAR c211,
NULL::VARCHAR c212,
NULL::VARCHAR c213,
NULL::VARCHAR c214,
NULL::VARCHAR c215,
NULL::VARCHAR c216,
NULL::VARCHAR c217,
NULL::VARCHAR c218,
NULL::VARCHAR c219,
NULL::VARCHAR c220,
NULL::VARCHAR c221,
NULL::VARCHAR c222,
NULL::VARCHAR c223,
NULL::VARCHAR c224,
NULL::VARCHAR c225,
NULL::VARCHAR c226,
NULL::VARCHAR c227,
NULL::VARCHAR c228,
NULL::VARCHAR c229,
NULL::VARCHAR c230,
NULL::VARCHAR c231,
NULL::VARCHAR c232,
NULL::VARCHAR c233,
NULL::VARCHAR c234,
NULL::VARCHAR c235,
NULL::VARCHAR c236,
NULL::VARCHAR c237,
NULL::VARCHAR c238,
NULL::VARCHAR c239,
NULL::VARCHAR c240,
NULL::VARCHAR c241,
NULL::VARCHAR c242,
NULL::VARCHAR c243,
NULL::VARCHAR c244,
NULL::VARCHAR c245,
NULL::VARCHAR c246,
NULL::VARCHAR c247,
NULL::VARCHAR c248,
NULL::VARCHAR c249,
NULL::VARCHAR c250,
NULL::VARCHAR c251,
NULL::VARCHAR c252,
NULL::VARCHAR c253,
NULL::VARCHAR c254,
NULL::VARCHAR c255,
NULL::VARCHAR c256,
NULL::VARCHAR c257,
NULL::VARCHAR c258,
NULL::VARCHAR c259,
NULL::VARCHAR c260,
NULL::VARCHAR c261,
NULL::VARCHAR c262,
NULL::VARCHAR c263,
NULL::VARCHAR c264,
NULL::VARCHAR c265,
NULL::VARCHAR c266,
NULL::VARCHAR c267,
NULL::VARCHAR c268,
NULL::VARCHAR c269,
NULL::VARCHAR c270,
NULL::VARCHAR c271,
NULL::VARCHAR c272,
NULL::VARCHAR c273,
NULL::VARCHAR c274,
NULL::VARCHAR c275,
NULL::VARCHAR c276,
NULL::VARCHAR c277,
NULL::VARCHAR c278,
NULL::VARCHAR c279,
NULL::VARCHAR c280,
NULL::VARCHAR c281,
NULL::VARCHAR c282,
NULL::VARCHAR c283,
NULL::VARCHAR c284,
NULL::VARCHAR c285,
NULL::VARCHAR c286,
NULL::VARCHAR c287,
NULL::VARCHAR c288,
NULL::VARCHAR c289,
NULL::VARCHAR c290,
NULL::VARCHAR c291,
NULL::VARCHAR c292,
NULL::VARCHAR c293,
NULL::VARCHAR c294,
NULL::VARCHAR c295,
NULL::VARCHAR c296,
NULL::VARCHAR c297,
NULL::VARCHAR c298,
NULL::VARCHAR c299,
NULL::VARCHAR c300,
NULL::VARCHAR c301,
NULL::VARCHAR c302,
NULL::VARCHAR c303,
NULL::VARCHAR c304,
NULL::VARCHAR c305,
NULL::VARCHAR c306,
NULL::VARCHAR c307,
NULL::VARCHAR c308,
NULL::VARCHAR c309,
NULL::VARCHAR c310,
NULL::VARCHAR c311,
NULL::VARCHAR c312,
NULL::VARCHAR c313,
NULL::VARCHAR c314,
NULL::VARCHAR c315,
NULL::VARCHAR c316,
NULL::VARCHAR c317,
NULL::VARCHAR c318,
NULL::VARCHAR c319,
NULL::VARCHAR c320,
NULL::VARCHAR c321,
NULL::VARCHAR c322,
NULL::VARCHAR c323,
NULL::VARCHAR c324,
NULL::VARCHAR c325,
NULL::VARCHAR c326,
NULL::VARCHAR c327,
NULL::VARCHAR c328,
NULL::VARCHAR c329,
NULL::VARCHAR c330,
NULL::VARCHAR c331,
NULL::VARCHAR c332,
NULL::VARCHAR c333,
NULL::VARCHAR c334,
NULL::VARCHAR c335,
NULL::VARCHAR c336,
NULL::VARCHAR c337,
NULL::VARCHAR c338,
NULL::VARCHAR c339,
NULL::VARCHAR c340,
NULL::VARCHAR c341,
NULL::VARCHAR c342,
NULL::VARCHAR c343,
NULL::VARCHAR c344,
NULL::VARCHAR c345,
NULL::VARCHAR c346,
NULL::VARCHAR c347,
NULL::VARCHAR c348,
NULL::VARCHAR c349,
NULL::VARCHAR c350,
NULL::VARCHAR c351,
NULL::VARCHAR c352,
NULL::VARCHAR c353,
NULL::VARCHAR c354,
NULL::VARCHAR c355,
NULL::VARCHAR c356,
NULL::VARCHAR c357,
NULL::VARCHAR c358,
NULL::VARCHAR c359,
NULL::VARCHAR c360,
NULL::VARCHAR c361,
NULL::VARCHAR c362,
NULL::VARCHAR c363,
NULL::VARCHAR c364,
NULL::VARCHAR c365,
NULL::VARCHAR c366,
NULL::VARCHAR c367,
NULL::VARCHAR c368,
NULL::VARCHAR c369,
NULL::VARCHAR c370,
NULL::VARCHAR c371,
NULL::VARCHAR c372,
NULL::VARCHAR c373,
NULL::VARCHAR c374,
NULL::VARCHAR c375,
NULL::VARCHAR c376,
NULL::VARCHAR c377,
NULL::VARCHAR c378,
NULL::VARCHAR c379,
NULL::VARCHAR c380,
NULL::VARCHAR c381,
NULL::VARCHAR c382,
NULL::VARCHAR c383,
NULL::VARCHAR c384,
NULL::VARCHAR c385,
NULL::VARCHAR c386,
NULL::VARCHAR c387,
NULL::VARCHAR c388,
NULL::VARCHAR c389,
NULL::VARCHAR c390,
NULL::VARCHAR c391,
NULL::VARCHAR c392,
NULL::VARCHAR c393,
NULL::VARCHAR c394,
NULL::VARCHAR c395,
NULL::VARCHAR c396,
NULL::VARCHAR c397,
NULL::VARCHAR c398,
NULL::VARCHAR c399,
FROM
range(600_000)
) TO '__TEST_DIR__/many_nulls.parquet' (ROW_GROUP_SIZE_BYTES '64mb');
# set a low memory env
statement ok
SET threads=4;
# we are creating a build side of approximately:
# uncompressed: 1_200_000 * 400 * 16 = ~8 GB
# compressed:   1_200_000 * 400 * 1 = ~0.5 GB
# this memory limit tests if the build side compresses well
statement ok
SET memory_limit='1.5GB';
statement ok
SET temp_directory=NULL
# join, this should take many GBs of memory, but all the NULLs get compressed to UTINYINT so it should fit
# we should see at least 10 compresses in the output (should be 400 but gets truncated)
query II
EXPLAIN ANALYZE SELECT ANY_VALUE(COLUMNS(*))
FROM read_parquet(['__TEST_DIR__/many_nulls.parquet' for _ in range(2)], union_by_name=true) build
JOIN read_parquet(['__TEST_DIR__/many_nulls.parquet' for _ in range(3)]) probe
USING (pk)
----
analyzed_plan <REGEX>:(.*internal_compress_string.*){10,}

View File

@@ -0,0 +1,45 @@
# name: test/parquet/parquet_stats_function.test
# description: Test stats(col) function on Parquet files
# group: [parquet]
require parquet
# we can derive whether all values in a parquet column are NULL
statement ok
copy (select null i) to '__TEST_DIR__/all_null.parquet'
# "Has No Null" is "false", meaning there are no non-NULL values
query I
select stats(i) from '__TEST_DIR__/all_null.parquet'
----
[Min: NULL, Max: NULL][Has Null: true, Has No Null: false]
# create 0-9 with no NULL
statement ok
copy (select range i from range(10)) to '__TEST_DIR__/parquet_stats_function1.parquet'
query I
select stats(i) from read_parquet('__TEST_DIR__/parquet_stats_function1.parquet', union_by_name=true) limit 1
----
[Min: 0, Max: 9][Has Null: false, Has No Null: true]
# create 100-109 with NULL
statement ok
copy (select range i from range(100, 110) union all select null i) to '__TEST_DIR__/parquet_stats_function2.parquet'
query I
select stats(i) from read_parquet('__TEST_DIR__/parquet_stats_function2.parquet', union_by_name=true) limit 1
----
[Min: 100, Max: 109][Has Null: true, Has No Null: true]
# query combined WITHOUT union_by_name (should give back no stats)
query I
select stats(i) from read_parquet('__TEST_DIR__/parquet_stats_function*.parquet', union_by_name=false) limit 1
----
[Min: NULL, Max: NULL][Has Null: true, Has No Null: true]
# now query combined WITH union_by_name (should give back stats)
query I
select stats(i) from read_parquet('__TEST_DIR__/parquet_stats_function*.parquet', union_by_name=true) limit 1
----
[Min: 0, Max: 109][Has Null: true, Has No Null: true]

View File

@@ -0,0 +1,42 @@
# name: test/parquet/parquet_version.test
# description: Test Parquet writer parquet_version parameter
# group: [parquet]
require parquet
statement error
copy (select range i from range(20000)) to '__TEST_DIR__/parquet_version.parquet' (parquet_version)
----
Invalid Input Error
statement error
copy (select range i from range(20000)) to '__TEST_DIR__/parquet_version.parquet' (parquet_version v3)
----
Binder Error
# defaults to V1
statement ok
copy (select range i from range(20000)) to '__TEST_DIR__/parquet_version.parquet'
query I
select encodings from parquet_metadata('__TEST_DIR__/parquet_version.parquet')
----
PLAIN
# we do PLAIN if we can't do dictionary for V1
statement ok
copy (select range i from range(20000)) to '__TEST_DIR__/parquet_version.parquet' (parquet_version v1)
query I
select encodings from parquet_metadata('__TEST_DIR__/parquet_version.parquet')
----
PLAIN
# we do DELTA_BINARY_PACKED for V2
statement ok
copy (select range i from range(20000)) to '__TEST_DIR__/parquet_version.parquet' (parquet_version v2)
query I
select encodings from parquet_metadata('__TEST_DIR__/parquet_version.parquet')
----
DELTA_BINARY_PACKED

View File

@@ -0,0 +1,33 @@
# name: test/parquet/prefetching.test
# description: Test parquet files using the prefetching mechanism
# group: [parquet]
require parquet
# Normally, local files do not use the prefetching mechanism; this debugging option forces it to be used
statement ok
set prefetch_all_parquet_files=true;
# With default settings, this query will fail: the incorrectly set index page offsets mess with DuckDB's prefetching mechanism
statement error
FROM 'data/parquet-testing/incorrect_index_page_offsets.parquet'
----
IO Error: The parquet file 'data/parquet-testing/incorrect_index_page_offsets.parquet' seems to have incorrectly set page offsets. This interferes with DuckDB's prefetching optimization. DuckDB may still be able to scan this file by manually disabling the prefetching mechanism using: 'SET disable_parquet_prefetching=true'.
# Now we disable prefetching
statement ok
set disable_parquet_prefetching=true;
query IIIIIIIIIII
FROM 'data/parquet-testing/incorrect_index_page_offsets.parquet'
----
0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 0
0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 1
0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 2
0.29 Premium I VS2 62.4 58.0 334 4.2 4.23 2.63 3
0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 4
0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48 5
0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47 6
0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53 7
0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49 8
0.23 Very Good H VS1 59.4 61.0 338 4.0 4.05 2.39 9

View File

@@ -0,0 +1,34 @@
# name: test/parquet/test_18470.test
# description: Test issue #18470 - Internal Error when querying multiple Parquet files that have mixed data type for same column
# group: [parquet]
require parquet
require notwindows
set seed 0.42
statement ok
COPY (
SELECT
(random() * 65535)::UINT16 as column1,
'text_value_' || row_number() OVER () as column2
FROM generate_series(1, 100)
) TO '__TEST_DIR__/20250101.parquet' (FORMAT PARQUET);
statement ok
COPY (
SELECT
(random() * 65535)::UINT64 as column1,
'text_value_' || row_number() OVER () as column2
FROM generate_series(1, 100)
) TO '__TEST_DIR__/20250102.parquet' (FORMAT PARQUET);
query III
SELECT filename = '__TEST_DIR__/20250101.parquet', *
FROM read_parquet('__TEST_DIR__/2025010*.parquet')
WHERE filename >= '__TEST_DIR__/20250101.parquet'
AND column1 = 72
LIMIT 10;
----
true 72 text_value_66

View File

@@ -0,0 +1,62 @@
# name: test/parquet/test_filename_column.test
# description: Test MultiFileReader filename column rename
# group: [parquet]
require parquet
# anything non-VARCHAR will be cast to boolean, and interpreted as such
query I
SELECT pq.filename FROM read_parquet('data/parquet-testing/enum.parquet', filename=true) pq LIMIT 1
----
data/parquet-testing/enum.parquet
query I
SELECT pq.filename FROM read_parquet('data/parquet-testing/enum.parquet', filename=1) pq LIMIT 1
----
data/parquet-testing/enum.parquet
# the string TRUE can be a column name
query I
SELECT "TRUE" FROM read_parquet('data/parquet-testing/enum.parquet', filename='TRUE') pq LIMIT 1
----
data/parquet-testing/enum.parquet
# FALSE too
query I
SELECT "FALSE" FROM read_parquet('data/parquet-testing/enum.parquet', filename='FALSE') pq LIMIT 1
----
data/parquet-testing/enum.parquet
# this is the output without an additional filename column
query IIIIIII nosort q0
SELECT * FROM read_parquet('data/parquet-testing/enum.parquet')
----
# this shouldn't somehow add a column with the name false/0/FALSE
query IIIIIII nosort q0
SELECT * FROM read_parquet('data/parquet-testing/enum.parquet', filename=false)
----
query IIIIIII nosort q0
SELECT * FROM read_parquet('data/parquet-testing/enum.parquet', filename=0)
----
# cool names work too
query I
SELECT my_cool_filename FROM read_parquet('data/parquet-testing/enum.parquet', filename='my_cool_filename') LIMIT 1
----
data/parquet-testing/enum.parquet
query I
SELECT my_cool_filename FROM read_parquet('data/parquet-testing/enum.parquet', filename=my_cool_filename) LIMIT 1
----
data/parquet-testing/enum.parquet
query III
select file_name[22:], row_group_id, bloom_filter_excludes from parquet_bloom_probe('data/parquet-testing/multi_bloom_*.parquet', 'a', 1)
----
multi_bloom_a.parquet 0 false
multi_bloom_b.parquet 0 true
multi_bloom_c.parquet 0 true

View File

@@ -0,0 +1,11 @@
# name: test/parquet/test_internal_5021.test
# description: Internal issue 5021: Assertion failure in DbpEncoder when writing Parquet V2
# group: [parquet]
require parquet
statement ok
CREATE TABLE tbl AS SELECT 'hello world' || i str FROM range(11) t(i);
statement ok
COPY tbl TO '__TEST_DIR__/file.parquet' (PARQUET_VERSION 'V2', DICTIONARY_SIZE_LIMIT 1);

View File

@@ -0,0 +1,9 @@
# name: test/parquet/test_legacy_empty_pandas_parquet.test
# group: [parquet]
require parquet
# This file includes the NULL (24) ConvertedType,
# which is not part of the spec, but was written by some ancient versions of Pandas (pre-2020)
statement ok
select * from 'data/parquet-testing/empty.parquet'

File diff suppressed because it is too large

View File

@@ -0,0 +1,82 @@
# name: test/parquet/test_parquet_reader_compression.test
# description: Test Parquet Reader with files on data/parquet-testing/compression
# group: [parquet]
require parquet
foreach codec NONE SNAPPY GZIP ZSTD LZ4 BROTLI
query IIII
SELECT * FROM parquet_scan('data/parquet-testing/compression/generated/data_page=1_${codec}.parquet', hive_partitioning=0) limit 50
----
0 20 {'string': foo, 'int': 22} []
1 6 {'string': baz, 'int': 10} NULL
2 23 {'string': bar, 'int': NULL} NULL
3 9 {'string': baz, 'int': 12} [25, 7, 5, 22, 24, 18, 30, 7, 19, 7, 17, 11, 30, 40, 30]
4 6 {'string': foo, 'int': 41} NULL
5 23 NULL [5, 22, 17, 7, 9, 37, 28, 37, 26, 30, 38, 40, 2]
6 19 {'string': foo, 'int': NULL} [NULL, 25, 21]
7 20 {'string': baz, 'int': 10} [35, 32, 11, 26, 27, 4, 1, 13, 31, 2, 32, 38, 16, 0, 29, 23, 32, 7, 17]
8 29 {'string': baz, 'int': 35} NULL
9 11 NULL [14, 0, NULL, 29, 23, 14, 13, 13, 15, 26, 29, 32, 5, 13, 32, 29, 38]
10 25 {'string': baz, 'int': 23} [5, 20, 9, 18, 32, 6, 21, 18, 1, 32, 34, 17, 3, 26, NULL, 1, 16, 9, 41]
11 9 NULL []
12 17 {'string': bar, 'int': 25} [8, 37, NULL, 34, 1, 5, 9, 40, 1, 28, 27, 14, 28, 0, 14, 33, 1, 26, 18]
13 17 {'string': foo, 'int': 20} [38, 7, 40, 18, 26]
14 6 NULL [16, 31, 9, 30, 36, 24, 29, 20, 20, 20, 17, 37, 4, 41, 25, 12, 21, 24]
15 5 {'string': bar, 'int': NULL} [38, 35, 41, 4, 34, NULL, 37, 12, 21, 31, 16, 13, 20, 36, 22, 19, 35]
16 6 {'string': bar, 'int': 25} [3]
17 20 {'string': bar, 'int': 35} [6, 11, 25, 14, 38, 19, 9, 21, 12, 41, 36, 31]
18 18 {'string': NULL, 'int': 19} [28]
19 28 NULL [0, 41, 26, 27, 23, 40]
20 21 {'string': bar, 'int': 3} [15, 35, 40, 29, 37, 8, 4, 9, 6, 37, 16, 14, 32, 29, NULL, 18, 1]
21 7 {'string': NULL, 'int': 36} [19]
22 27 NULL [3, 0, 15, 35, 6, 13, 24, 14, 7, 3, 32]
23 28 {'string': NULL, 'int': NULL} [26, 17, 33, 17, 21, 34, 20, 25, 33, 21, 4, 1, 23, 9, 32]
24 21 {'string': foo, 'int': 12} [19, 15, 36, 37, 1, 19, 21, 4, 40, NULL, NULL, 19, 4]
25 20 {'string': foo, 'int': NULL} NULL
26 3 {'string': NULL, 'int': 15} [32, 31, 3, 26, 34, 1, 6, 29, 5, 22, 11, 1, 18]
27 2 {'string': foo, 'int': 25} [19]
28 7 {'string': foo, 'int': 34} [20, 1, 18, 20, 1, 3, 25, 2, 31, 22, NULL, 40, 23, 32, 40, 10]
29 13 {'string': bar, 'int': 8} [40, 32, 9, 2, 2, 40, 7, 0, 32, 31, 11, 14, 4, 14, 40, 20, 29, 17, 41]
query IIII
SELECT * FROM parquet_scan('data/parquet-testing/compression/generated/data_page=2_${codec}.parquet', hive_partitioning=0) limit 50
----
0 20 {'string': foo, 'int': 22} []
1 6 {'string': baz, 'int': 10} NULL
2 23 {'string': bar, 'int': NULL} NULL
3 9 {'string': baz, 'int': 12} [25, 7, 5, 22, 24, 18, 30, 7, 19, 7, 17, 11, 30, 40, 30]
4 6 {'string': foo, 'int': 41} NULL
5 23 NULL [5, 22, 17, 7, 9, 37, 28, 37, 26, 30, 38, 40, 2]
6 19 {'string': foo, 'int': NULL} [NULL, 25, 21]
7 20 {'string': baz, 'int': 10} [35, 32, 11, 26, 27, 4, 1, 13, 31, 2, 32, 38, 16, 0, 29, 23, 32, 7, 17]
8 29 {'string': baz, 'int': 35} NULL
9 11 NULL [14, 0, NULL, 29, 23, 14, 13, 13, 15, 26, 29, 32, 5, 13, 32, 29, 38]
10 25 {'string': baz, 'int': 23} [5, 20, 9, 18, 32, 6, 21, 18, 1, 32, 34, 17, 3, 26, NULL, 1, 16, 9, 41]
11 9 NULL []
12 17 {'string': bar, 'int': 25} [8, 37, NULL, 34, 1, 5, 9, 40, 1, 28, 27, 14, 28, 0, 14, 33, 1, 26, 18]
13 17 {'string': foo, 'int': 20} [38, 7, 40, 18, 26]
14 6 NULL [16, 31, 9, 30, 36, 24, 29, 20, 20, 20, 17, 37, 4, 41, 25, 12, 21, 24]
15 5 {'string': bar, 'int': NULL} [38, 35, 41, 4, 34, NULL, 37, 12, 21, 31, 16, 13, 20, 36, 22, 19, 35]
16 6 {'string': bar, 'int': 25} [3]
17 20 {'string': bar, 'int': 35} [6, 11, 25, 14, 38, 19, 9, 21, 12, 41, 36, 31]
18 18 {'string': NULL, 'int': 19} [28]
19 28 NULL [0, 41, 26, 27, 23, 40]
20 21 {'string': bar, 'int': 3} [15, 35, 40, 29, 37, 8, 4, 9, 6, 37, 16, 14, 32, 29, NULL, 18, 1]
21 7 {'string': NULL, 'int': 36} [19]
22 27 NULL [3, 0, 15, 35, 6, 13, 24, 14, 7, 3, 32]
23 28 {'string': NULL, 'int': NULL} [26, 17, 33, 17, 21, 34, 20, 25, 33, 21, 4, 1, 23, 9, 32]
24 21 {'string': foo, 'int': 12} [19, 15, 36, 37, 1, 19, 21, 4, 40, NULL, NULL, 19, 4]
25 20 {'string': foo, 'int': NULL} NULL
26 3 {'string': NULL, 'int': 15} [32, 31, 3, 26, 34, 1, 6, 29, 5, 22, 11, 1, 18]
27 2 {'string': foo, 'int': 25} [19]
28 7 {'string': foo, 'int': 34} [20, 1, 18, 20, 1, 3, 25, 2, 31, 22, NULL, 40, 23, 32, 40, 10]
29 13 {'string': bar, 'int': 8} [40, 32, 9, 2, 2, 40, 7, 0, 32, 31, 11, 14, 4, 14, 40, 20, 29, 17, 41]
query I
SELECT * FROM parquet_scan('data/parquet-testing/compression/empty_datapage_v2.snappy.parquet', hive_partitioning=0) limit 50
----
NULL
endloop

View File

@@ -0,0 +1,357 @@
# name: test/parquet/test_parquet_schema.test
# description: Parquet reader schema parameter tests
# group: [parquet]
require parquet
statement ok
COPY (SELECT 42::INTEGER i) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {i: 0})
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map{})
----
Invalid Input Error: 'schema' expects a STRUCT as the value type of the map
# can't combine with union_by_name
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, union_by_name=true)
----
Binder Error: Parquet schema cannot be combined with union_by_name=true or hive_partitioning=true
# can't combine with hive_partitioning
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/*.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, hive_partitioning=true)
----
Binder Error: Parquet schema cannot be combined with union_by_name=true or hive_partitioning=true
statement ok
COPY (
SELECT 1 i1, 3 i3, 4 i4, 5 i5 UNION ALL
SELECT 2 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/partitioned' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1}, PARTITION_BY i1, FORMAT parquet, WRITE_PARTITION_COLUMNS)
# auto-detection of hive partitioning is enabled by default,
# but automatically disabled when a schema is supplied, so this should succeed
query IIII
SELECT *
FROM read_parquet('__TEST_DIR__/partitioned/*/*.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5 3 2 1
5 3 2 2
# when partition columns are specified in FIELD_IDS, the error message should suggest the WRITE_PARTITION_COLUMNS option
statement error
COPY (
SELECT 1 i1, 3 i3, 4 i4, 5 i5 UNION ALL
SELECT 2 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/partitioned2' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1}, PARTITION_BY i1, FORMAT parquet)
----
Binder Error: Column name "i1" specified in FIELD_IDS not found. Consider using WRITE_PARTITION_COLUMNS if this column is a partition column. Available column names:
# cannot duplicate field_ids
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
0: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
Map keys must be unique
# cannot duplicate column names
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'cool_column', type: 'BIGINT', default_value: NULL},
1: {name: 'cool_column', type: 'UTINYINT', default_value: 43}
}) pq
----
Binder Error: table "pq" has duplicate column name "cool_column"
# the supplied default value must be castable to the given type for that column
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'cool_column', type: 'BIGINT', default_value: NULL},
1: {name: 'cool_column', type: 'UTINYINT', default_value: 'bla'}
}) pq
----
Binder Error: Unable to cast Parquet schema default_value "bla" to UTINYINT
query IIIIII
DESCRIBE SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
renamed_i BIGINT YES NULL NULL NULL
new_column UTINYINT YES NULL NULL NULL
query IIIIII
DESCRIBE SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, filename=true)
----
renamed_i BIGINT YES NULL NULL NULL
new_column UTINYINT YES NULL NULL NULL
filename VARCHAR YES NULL NULL NULL
# we test filename on a persistent file, because __TEST_DIR__ is different on every run
query II
SELECT *
FROM read_parquet('data/parquet-testing/enum.parquet', schema=map {
1: {name: 'cool_column', type: 'VARCHAR', default_value: NULL}
}, filename=true)
LIMIT 1
----
1 data/parquet-testing/enum.parquet
query II
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
42 43
# we just get a cast error when we can't cast to the supplied type
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'DATE', default_value: NULL}
})
----
Conversion Error
# if we don't supply a field id, we can't refer to it using the schema parameter
statement ok
COPY (SELECT 42::INTEGER i) TO '__TEST_DIR__/integers.parquet'
query II
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
NULL 43
# let's spice it up with more columns
statement ok
COPY (
SELECT 1 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1})
# this is purposely a bit confusing but we're:
# 1. deleting field id 2
# 2. creating field id 4
# 3. reversing the order of the columns
# 4. renaming them (except i3)
# 5. upcasting them
query IIII
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5 3 2 1
# projection still ok
query I
SELECT i1
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5
# we can still select virtual columns as well
query III
SELECT file_row_number, filename[-16:], i4
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
0 integers.parquet 2
# projection still ok, even with different generated columns
query III
SELECT file_row_number, filename[-16:], i4
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, file_row_number=1, filename=1)
----
0 integers.parquet 2
# count(*) still ok
query I
SELECT count(*)
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
1
# combine with constant column
query II
SELECT i1, filename[-16:]
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, filename=true)
----
5 integers.parquet
statement ok
COPY (
SELECT range % 4 g, range i FROM range(1000)
) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {g: 33, i: 42})
# let's also do a query with a filter and a downcast
query II
SELECT my_cool_group, sum(my_cool_value)
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
33: {name: 'my_cool_group', type: 'UINTEGER', default_value: NULL},
42: {name: 'my_cool_value', type: 'UINTEGER', default_value: NULL}
})
WHERE my_cool_group = 2
GROUP BY my_cool_group
----
2 125000
# also test multi-file reading with different field ids
# field id -> value:
# 1 -> 5
# 2 -> 4 (unused)
# 3 -> 3
# 4 -> - (missing)
# 5 -> 1
statement ok
COPY (
SELECT
1 i1,
3 i3,
4 i4,
5 i5
) TO '__TEST_DIR__/multifile1.parquet' (FIELD_IDS {
i1: 5,
i3: 3,
i4: 2,
i5: 1
})
# field_id -> value:
# 1 -> 1
# 2 -> 3 (unused)
# 3 -> 4
# 4 -> 5
# 5 -> - (missing)
statement ok
COPY (
SELECT
1 j1,
3 j3,
4 j4,
5 j5
) TO '__TEST_DIR__/multifile2.parquet' (FIELD_IDS {
j1: 1,
j3: 2,
j4: 3,
j5: 4
})
query IIIII
SELECT i1, i3, i4, i5, filename[-18:]
FROM read_parquet('__TEST_DIR__/multifile*.parquet', schema=map {
1: {name: 'i1', type: 'BIGINT', default_value: NULL},
3: {name: 'i3', type: 'BIGINT', default_value: NULL},
4: {name: 'i4', type: 'BIGINT', default_value: 2},
5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, filename=true)
ORDER BY filename
----
5 3 2 1 multifile1.parquet
1 4 5 NULL multifile2.parquet
statement error
select * FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
True: {name: 'my_cool_group', type: 'UINTEGER', default_value: NULL},
False: {name: 'my_cool_value', type: 'UINTEGER', default_value: NULL}
});
----
Invalid Input Error: 'schema' expects the value type of the map to be either INTEGER or VARCHAR, not BOOLEAN
query II
SELECT alias(COLUMNS(*)) FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
'i': {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
'j': {name: 'new_column', type: 'UTINYINT', default_value: 43}
}) limit 1;
----
renamed_i new_column
# issue 15504
statement ok
COPY (select 1 as id, list_value('a', 'b', 'c') as arr, { key: 1, v1: 'a', v2: 'b' } as s) TO '__TEST_DIR__/15504.parquet' (field_ids { 'id': 0, 'arr': 1, 's': 2 });
query III
SELECT * FROM read_parquet('__TEST_DIR__/15504.parquet', schema=map { 0: { name: 'id', type: 'int32', default_value: NULL }, 1: { name: 'arr', type: 'varchar[]', default_value: NULL }, 2: { name: 's', type: 'STRUCT(key INT, v1 TEXT, v2 TEXT)', default_value: NULL } });
----
1 [a, b, c] {'key': 1, 'v1': a, 'v2': b}
# issue 16094
statement ok
copy (
select
x
from generate_series(1,100) as g(x)
) to '__TEST_DIR__/16094.parquet'
with (
field_ids {x: 1}
);
statement ok
select
x,
filename
from read_parquet(
'__TEST_DIR__/16094.parquet',
schema=map {
1: {name: 'x', type: 'int', default_value: NULL}
},
filename=True
) where x = 1;

View File

@@ -0,0 +1,54 @@
# name: test/parquet/timens_parquet.test
# description: Round trip of TIME_NS data
# group: [parquet]
require parquet
# Insertion
statement ok
CREATE TABLE times(tns TIME_NS)
statement ok
INSERT INTO times VALUES
('00:00:00'),
('00:01:20'),
('10:21:00.0'),
('10:21:00.1'),
('10:21:00.9'),
('16:04:22.01'),
('16:04:22.12'),
('16:04:22.97'),
('20:08:10.001'),
('20:08:10.123'),
('20:08:10.998'),
('03:45:47.0001'),
('03:45:47.1234'),
('03:45:47.9999'),
('02:27:19.00001'),
('02:27:19.12345'),
('02:27:19.99899'),
('09:01:54.000001'),
('09:01:54.123456'),
('09:01:54.999978'),
('23:35:57.0000001'),
('23:35:57.1234567'),
('23:35:57.9999999'),
('13:00:00.00000001'),
('13:00:00.12345678'),
('13:00:00.99999989'),
('23:59:59.000000001'),
('23:59:59.123456789'),
('23:59:59.999999999'),
('24:00:00.000000000'),
(NULL)
query I nosort t0
from times
----
statement ok
copy times to '__TEST_DIR__/time_ns.parquet' (PARQUET_VERSION V2);
query I nosort t0
from '__TEST_DIR__/time_ns.parquet';
----

View File

@@ -0,0 +1,47 @@
# name: test/parquet/timetz_parquet.test
# description: Test parquet file with time with time zone data
# group: [parquet]
require parquet
query I
select * from 'data/parquet-testing/timetz.parquet' ;
----
14:30:00+00
11:35:00+00
01:59:00+00
query I
select COL_TIME from 'data/parquet-testing/date-with-timezone-int64.parquet' ;
----
12:00:00+00
query II
select pruefbahn_id, arbeits_beginn
from 'data/parquet-testing/timetz-nanos.parquet'
where pruefbahn_id = '58981';
----
58981 07:20:00+00
58981 07:20:00+00
58981 07:20:00+00
58981 07:20:00+00
58981 07:20:00+00
58981 07:20:00+00
58981 07:20:00+00
58981 07:20:00+00
58981 07:20:00+00
58981 07:20:00+00
58981 07:20:00+00
58981 07:20:00+00
58981 07:20:00+00
58981 07:20:00+00
58981 07:20:00+00
query I
select col33
from 'data/parquet-testing/negative-timetz.parquet';
----
20:08:21+00
09:01:00+00
13:04:04+00

View File

@@ -0,0 +1,57 @@
# name: test/parquet/variant/variant_all_types_shredded.test
# group: [variant]
require parquet
require json
statement ok
create macro data() as table (
select COLUMNS([
x for x in (*) if x NOT IN [
'utinyint',
'usmallint',
'uint',
'ubigint',
'hugeint',
'uhugeint',
'bignum',
'timestamp_s',
'timestamp_ms',
'timestamp_tz',
'time_tz',
'interval',
'bit',
'dec_4_1', -- Parquet VARIANT doesn't have int16_t DECIMAL
-- Conversion isn't 1-to-1
'dec_9_4', -- can't roundtrip with json
'dec_18_6', -- can't roundtrip with json
'dec38_10', -- can't roundtrip with json
'blob' -- data is base64-encoded in parquet read
]
])::VARIANT var from test_all_types()
)
query I nosort expected_res
select IF(VARIANT_TYPEOF(COLUMNS(*)) == 'VARIANT_NULL', NULL, COLUMNS(*)::JSON) from data();
----
foreach type bool tinyint smallint int bigint date time timestamp timestamp_ns timestamp_tz float double dec_9_4 dec_18_6 dec38_10 uuid varchar blob small_enum medium_enum large_enum int_array double_array date_array timestamp_array timestamptz_array varchar_array nested_int_array struct struct_of_arrays array_of_structs
statement ok
SET VARIABLE type_str = (SELECT $$STRUCT("${type}" $$ || typeof("${type}") || ')' from test_all_types() limit 1);
statement ok
COPY (
FROM data()
) TO '__TEST_DIR__/all_types_shredded_${type}.parquet' (
SHREDDING {
'var': getvariable('type_str')
}
)
query I nosort expected_res
select * from '__TEST_DIR__/all_types_shredded_${type}.parquet'
----
endloop

View File

@@ -0,0 +1,227 @@
# name: test/parquet/variant/variant_basic.test
# group: [variant]
require parquet
# Array
query II
from 'data/parquet-testing/variant_array_array_string_and_integer.parquet';
----
1 [["string","iceberg",34],[34,null],[],["string","iceberg"],34]
# String
query II
from 'data/parquet-testing/variant_string.parquet';
----
1 "iceberg"
# BOOL TRUE
query II
from 'data/parquet-testing/variant_bool_true.parquet';
----
1 true
# Decimal4
query II
from 'data/parquet-testing/variant_decimal4_positive.parquet';
----
1 "123456.789"
# UUID
query II
from 'data/parquet-testing/variant_uuid.parquet';
----
1 "f24f9b64-81fa-49d1-b74e-8c09a6e31c56"
# Empty array
query II
from 'data/parquet-testing/variant_array_empty.parquet';
----
1 []
query II
from 'data/parquet-testing/variant_int16.parquet';
----
1 -1234
query II
from 'data/parquet-testing/variant_int32.parquet';
----
1 -12345
# Binary
query II
from 'data/parquet-testing/variant_binary.parquet';
----
1 "CgsMDQ=="
# Decimal16
query II
from 'data/parquet-testing/variant_decimal16.parquet';
----
1 "9876543210.123456789"
query II
from 'data/parquet-testing/variant_int64.parquet';
----
1 -9876543210
# TIMESTAMP_NANOS_NTZ
query II
from 'data/parquet-testing/variant_timestamp_nanos_ntz.parquet';
----
1 "1957-11-07 12:33:54.123456789"
# Array of strings (2-dimensional)
query II
from 'data/parquet-testing/variant_array_array_string.parquet';
----
1 [["string","iceberg"],["apple","banana"]]
# TIMESTAMP_MICROS
query II
from 'data/parquet-testing/variant_timestamp_micros.parquet';
----
1 "1957-11-07 12:33:54.123456+00"
# Object {'a': .., 'c': ...}
query II
from 'data/parquet-testing/variant_object_primitives.parquet';
----
1 {"a":123456789,"c":"string"}
query II
from 'data/parquet-testing/variant_timestamp_micros_positive.parquet';
----
1 "2024-11-07 12:33:54.123456+00"
query II
from 'data/parquet-testing/variant_int16_positive.parquet';
----
1 1234
query II
from 'data/parquet-testing/variant_time_ntz.parquet';
----
1 "12:33:54.123456"
query II
from 'data/parquet-testing/variant_decimal16_negative.parquet';
----
1 "-9876543210.123456789"
query II
from 'data/parquet-testing/variant_timestamp_nanos1.parquet';
----
1 "1957-11-07 12:33:54.123457+00"
query II
from 'data/parquet-testing/variant_decimal8_negative.parquet';
----
1 "-123456789.987654321"
query II
from 'data/parquet-testing/variant_timestamp_micros_negative.parquet';
----
1 "1957-11-07 12:33:54.123456"
query II
from 'data/parquet-testing/variant_int8_positive.parquet';
----
1 34
query II
from 'data/parquet-testing/variant_timestamp_nanos2.parquet';
----
1 "2024-11-07 12:33:54.123456+00"
query II
from 'data/parquet-testing/variant_int8_negative.parquet';
----
1 -34
query II
from 'data/parquet-testing/variant_array_string.parquet';
----
1 ["iceberg","string"]
query II
from 'data/parquet-testing/variant_date_negative.parquet';
----
1 "1957-11-07"
query II
from 'data/parquet-testing/variant_int64_positive.parquet';
----
1 9876543210
query II
from 'data/parquet-testing/variant_array_object_string_and_integer.parquet';
----
1 [{"a":123456789,"c":"string"},{"a":123456789,"c":"string"},"iceberg",34]
query II
from 'data/parquet-testing/variant_int32_positive.parquet';
----
1 12345
query II
from 'data/parquet-testing/variant_double_negative.parquet';
----
1 -14.3
query II
from 'data/parquet-testing/variant_object_empty.parquet';
----
1 {}
query II
from 'data/parquet-testing/variant_null.parquet';
----
1 NULL
# the value was -10.11 in the test this file was generated from (float32 precision)
query II
from 'data/parquet-testing/variant_float_negative.parquet';
----
1 -10.109999656677246
query II
from 'data/parquet-testing/variant_object_string_and_array.parquet';
----
1 {"a":123456789,"c":["string","iceberg"]}
query II
from 'data/parquet-testing/variant_object_null_and_string.parquet';
----
1 {"a":null,"d":"iceberg"}
query II
from 'data/parquet-testing/variant_date_positive.parquet';
----
1 "2024-11-07"
query II
from 'data/parquet-testing/variant_bool_false.parquet';
----
1 false
query II
from 'data/parquet-testing/variant_array_object_string.parquet';
----
1 [{"a":123456789,"c":"string"},{"a":123456789,"c":"string"}]
query II
from 'data/parquet-testing/variant_decimal4_negative.parquet';
----
1 "-123456.789"
query II
from 'data/parquet-testing/variant_double_positive.parquet';
----
1 14.3
query II
from 'data/parquet-testing/variant_timestamp_micros_ntz_positive.parquet';
----
1 "2024-11-07 12:33:54.123456"

View File

@@ -0,0 +1,49 @@
# name: test/parquet/variant/variant_basic_shredded_writing.test
# group: [variant]
require parquet
require json
statement ok
create macro data() AS TABLE (
FROM (VALUES
({'a': 21::INTEGER, 'b': NULL}::VARIANT),
({'a': 42::INTEGER, 'd': 'test'}::VARIANT),
([]::VARIANT),
(NULL::VARIANT),
([{'b': True, 'c': 'test'}::VARIANT, 'test', 21, {'a': True}, [1::VARIANT, 2, True, 'false']]::VARIANT),
('this is a long string'::VARIANT),
('this is big enough to not be classified as a "short string" by parquet VARIANT'::VARIANT)
) t(a)
)
query I nosort expected_res
select IF(VARIANT_TYPEOF(COLUMNS(*)) == 'VARIANT_NULL', NULL, COLUMNS(*)::JSON) from data();
----
statement ok
COPY (
from data() t(a)
) TO '__TEST_DIR__/shredded_struct.parquet' (
shredding {
a: 'STRUCT(a INTEGER, b VARIANT, c BOOLEAN)'
}
)
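# illustrative note (not part of the original test): per the Parquet variant shredding spec, the SHREDDING
# option stores a typed 'typed_value' sub-column matching the given schema next to the binary variant;
# values that do not fit the shredding schema remain in the untyped 'value' field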
query I nosort expected_res
select * from '__TEST_DIR__/shredded_struct.parquet';
----
statement ok
COPY (
select a from data()
) TO '__TEST_DIR__/shredded_list.parquet' (
shredding {
a: 'VARCHAR[]'
}
)
query I nosort expected_res
select * from '__TEST_DIR__/shredded_list.parquet';
----

View File

@@ -0,0 +1,116 @@
# name: test/parquet/variant/variant_basic_writing.test
# group: [variant]
require parquet
require json
# STRUCT(a INTEGER, b INTEGER[])
statement ok
COPY (select
{
'a': 42,
'b': [null, 1, 2]
}::VARIANT
from range(10)
) TO '__TEST_DIR__/integer_variant.parquet';
query I
select * from '__TEST_DIR__/integer_variant.parquet';
----
{"a":42,"b":[null,1,2]}
{"a":42,"b":[null,1,2]}
{"a":42,"b":[null,1,2]}
{"a":42,"b":[null,1,2]}
{"a":42,"b":[null,1,2]}
{"a":42,"b":[null,1,2]}
{"a":42,"b":[null,1,2]}
{"a":42,"b":[null,1,2]}
{"a":42,"b":[null,1,2]}
{"a":42,"b":[null,1,2]}
statement ok
COPY (select
'[["string","iceberg",-34],[-34,null],[],["string","iceberg"],-34]'::JSON::VARIANT
from range(5)
) TO '__TEST_DIR__/list_of_list_variant.parquet'
query I
select * from '__TEST_DIR__/list_of_list_variant.parquet';
----
[["string","iceberg",-34],[-34,null],[],["string","iceberg"],-34]
[["string","iceberg",-34],[-34,null],[],["string","iceberg"],-34]
[["string","iceberg",-34],[-34,null],[],["string","iceberg"],-34]
[["string","iceberg",-34],[-34,null],[],["string","iceberg"],-34]
[["string","iceberg",-34],[-34,null],[],["string","iceberg"],-34]
statement ok
COPY (
with cte as (
FROM (VALUES
({'a': 21, 'b': NULL}::VARIANT),
([]::VARIANT),
(NULL::VARIANT),
([{'b': True, 'c': 'test'}]::VARIANT),
('this is a long string'::VARIANT),
('this is big enough to not be classified as a "short string" by parquet VARIANT'::VARIANT)
) t(a)
)
select a from cte
) TO '__TEST_DIR__/varied_variant.parquet'
query I
select * from '__TEST_DIR__/varied_variant.parquet';
----
{"a":21,"b":null}
[]
NULL
[{"b":true,"c":"test"}]
"this is a long string"
"this is big enough to not be classified as a \"short string\" by parquet VARIANT"
# VARIANT is only supported at the root for now
statement error
COPY (select [123::VARIANT]) TO '__TEST_DIR__/list_of_variant.parquet'
----
Not implemented Error: Unimplemented type for Parquet "VARIANT"
statement ok
create macro data() as table (
select COLUMNS([
x for x in (*) if x NOT IN [
'utinyint',
'usmallint',
'uint',
'ubigint',
'hugeint',
'uhugeint',
'bignum',
'timestamp_s',
'timestamp_ms',
'timestamp_tz',
'time_tz',
'interval',
'bit',
'dec_4_1', -- Parquet VARIANT doesn't have int16_t DECIMAL
-- Conversion isn't 1-to-1
'dec_9_4', -- can't roundtrip with json
'dec_18_6', -- can't roundtrip with json
'dec38_10', -- can't roundtrip with json
'blob' -- data is base64-encoded in parquet read
]
])::VARIANT as "\0" from test_all_types()
)
statement ok
COPY (
from data()
) TO '__TEST_DIR__/variant_test_all_types.parquet';
query I nosort expected_res
select IF(VARIANT_TYPEOF(COLUMNS(*)) == 'VARIANT_NULL', NULL, COLUMNS(*)::JSON) from data();
----
query I nosort expected_res
select * from '__TEST_DIR__/variant_test_all_types.parquet';
----

View File

@@ -0,0 +1,64 @@
# name: test/parquet/variant/variant_list_of_struct_partial_shredding.test
# group: [variant]
require parquet
require json
statement ok
create macro data() AS TABLE (
FROM (VALUES
(
[
{a:['foo'::VARIANT,42], b:true, c:{a:'nested1'}}::VARIANT, -- element of list in field 'a' is a different type
{a: 42, b: true, c:{a:'nested2'}}, -- field 'a' is a different type
{b: true, c:{a:'nested3'}}, -- field 'a' is missing
{a:[], b:false, c:{a:NULL}},
{a: [], c:{a:'nested4'}} -- field 'b' is missing
]::VARIANT
),
(
[]
),
(
[
{a:NULL, b:NULL, c:{a:'inner'}},
{a:['baz'], b:false, c:{a:NULL}}
]
),
(
NULL
),
(
[
{a:['alpha'], b:true, c:{a:'deep'}}::VARIANT,
{a: [[1,2]::VARIANT, 'hello', {a: 42}]}, -- fields 'b' and 'c' are missing, 'a' element is of a wrong type
{b: false}, -- fields 'a' and 'c' are missing
{a:[], b:NULL, c:{a:'leaf'}}
]
),
(
[
{a:NULL, b:false, c:{a:NULL}},
{a:['x',NULL,'z'], b:true, c:{a:'final'}}
]
)
) t(a)
);
query I nosort expected_res
select IF(VARIANT_TYPEOF(COLUMNS(*)) == 'VARIANT_NULL', NULL, COLUMNS(*)::JSON) from data();
----
statement ok
COPY (
select a from data()
) TO '__TEST_DIR__/shredded_list_of_structs.parquet' (
shredding {
a: 'STRUCT(a VARCHAR[], b BOOLEAN, c STRUCT(a VARCHAR))[]'
}
)
query I nosort expected_res
select * from '__TEST_DIR__/shredded_list_of_structs.parquet';
----

View File

@@ -0,0 +1,59 @@
# name: test/parquet/variant/variant_list_of_struct_shredding.test
# group: [variant]
require parquet
require json
statement ok
create macro data() AS TABLE (
FROM (VALUES
(
[
{a:['foo','bar'], b:true, c:{a:'nested1'}},
{a:[], b:false, c:{a:NULL}}
]::VARIANT
),
(
[]
),
(
[
{a:NULL, b:NULL, c:{a:'inner'}},
{a:['baz'], b:false, c:{a:NULL}}
]
),
(
NULL
),
(
[
{a:['alpha'], b:true, c:{a:'deep'}},
{a:[], b:NULL, c:{a:'leaf'}}
]
),
(
[
{a:NULL, b:false, c:{a:NULL}},
{a:['x',NULL,'z'], b:true, c:{a:'final'}}
]
)
) t(a)
);
query I nosort expected_res
select IF(VARIANT_TYPEOF(COLUMNS(*)) == 'VARIANT_NULL', NULL, COLUMNS(*)::JSON) from data();
----
statement ok
COPY (
select a from data()
) TO '__TEST_DIR__/shredded_list_of_structs.parquet' (
shredding {
a: 'STRUCT(a VARCHAR[], b BOOLEAN, c STRUCT(a VARCHAR))[]'
}
)
query I nosort expected_res
select * from '__TEST_DIR__/shredded_list_of_structs.parquet';
----

View File

@@ -0,0 +1,34 @@
# name: test/parquet/variant/variant_list_shredding.test
# group: [variant]
require parquet
require json
statement ok
create macro data() AS TABLE (
FROM (VALUES
([['test', NULL, 'this is a long string'],[],['hello'],NULL,[],[1, 2, 3]::VARIANT]::VARIANT),
(NULL::VARIANT),
([]::VARIANT),
([[{'a': 'test'}::VARIANT, [1, 2, 3]]::VARIANT, {'a': 21}, {'b': 42}, [['hello']]]),
([[], NULL, [1::VARIANT, 2, 'test'],['hello', 'world']]::VARIANT)
) t(a)
)
query I nosort expected_res
select IF(VARIANT_TYPEOF(COLUMNS(*)) == 'VARIANT_NULL', NULL, COLUMNS(*)::JSON) from data();
----
statement ok
COPY (
select a from data()
) TO '__TEST_DIR__/shredded_list_of_list_of_string.parquet' (
shredding {
a: 'VARCHAR[][]'
}
)
query I nosort expected_res
select * from '__TEST_DIR__/shredded_list_of_list_of_string.parquet';
----

View File

@@ -0,0 +1,31 @@
# name: test/parquet/variant/variant_nanos_tz.test
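# description: Test reading VARIANT nanosecond timestamps with timezone from Parquet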
# group: [variant]
require parquet
statement ok
set variant_legacy_encoding=true;
# Timestamp NS - negative (with timezone) (shredded)
query II
from 'data/parquet-testing/variant_shredded_timestamp_nanos_tz_negative_no_logical_type.parquet';
----
1 "1957-11-07 12:33:54.123457+00"
# Timestamp NS - positive (with timezone) (shredded)
query II
from 'data/parquet-testing/variant_shredded_timestamp_nanos_tz_positive_no_logical_type.parquet';
----
1 "2024-11-07 12:33:54.123456+00"
# Timestamp NS - positive (with timezone) (unshredded)
query II
from 'data/parquet-testing/variant_timestamp_nanos_tz_positive_no_logical_type.parquet';
----
1 "2024-11-07 12:33:54.123456+00"
# Timestamp NS - negative (with timezone) (unshredded)
query II
from 'data/parquet-testing/variant_timestamp_nanos_tz_negative_no_logical_type.parquet';
----
1 "1957-11-07 12:33:54.123457+00"

View File

@@ -0,0 +1,44 @@
# name: test/parquet/variant/variant_nested_with_nulls.test
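# description: Test reading unshredded VARIANT columns nested inside lists, structs and maps, including NULLs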
# group: [variant]
require parquet
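# By default, Parquet VARIANT columns are read as STRUCT("value" BLOB, metadata BLOB)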
query IIIIII
describe from parquet_scan('data/parquet-testing/variant_unshredded_nested_nulls.parquet')
----
id BIGINT YES NULL NULL NULL
v STRUCT("value" BLOB, metadata BLOB) YES NULL NULL NULL
array_of_variants STRUCT("value" BLOB, metadata BLOB)[] YES NULL NULL NULL
struct_of_variants STRUCT(v STRUCT("value" BLOB, metadata BLOB)) YES NULL NULL NULL
map_of_variants MAP(VARCHAR, STRUCT("value" BLOB, metadata BLOB)) YES NULL NULL NULL
array_of_struct_of_variants STRUCT(v STRUCT("value" BLOB, metadata BLOB))[] YES NULL NULL NULL
struct_of_array_of_variants STRUCT(v STRUCT("value" BLOB, metadata BLOB)[]) YES NULL NULL NULL
statement ok
set variant_legacy_encoding=true;
# Now the variant columns are emitted as JSON
query IIIIII
describe from parquet_scan('data/parquet-testing/variant_unshredded_nested_nulls.parquet')
----
id BIGINT YES NULL NULL NULL
v JSON YES NULL NULL NULL
array_of_variants JSON[] YES NULL NULL NULL
struct_of_variants STRUCT(v JSON) YES NULL NULL NULL
map_of_variants MAP(VARCHAR, JSON) YES NULL NULL NULL
array_of_struct_of_variants STRUCT(v JSON)[] YES NULL NULL NULL
struct_of_array_of_variants STRUCT(v JSON[]) YES NULL NULL NULL
query IIIIIII
select * from parquet_scan('data/parquet-testing/variant_unshredded_nested_nulls.parquet') order by id limit 10;
----
0 {"key":0} ['{"key":0}', NULL, '{"key":0}', NULL, '{"key":0}'] {'v': '{"key":0}'} {0='{"key":0}', nullKey=NULL} [{'v': '{"key":0}'}, {'v': NULL}, NULL, {'v': '{"key":0}'}, NULL, {'v': '{"key":0}'}] {'v': [NULL, '{"key":0}']}
0 {"key":0} ['{"key":0}', NULL, '{"key":0}', NULL, '{"key":0}'] {'v': '{"key":0}'} {0='{"key":0}', nullKey=NULL} [{'v': '{"key":0}'}, {'v': NULL}, NULL, {'v': '{"key":0}'}, NULL, {'v': '{"key":0}'}] {'v': [NULL, '{"key":0}']}
1 {"key":1} ['{"key":1}', NULL, '{"key":1}', NULL, '{"key":1}'] {'v': '{"key":1}'} {1='{"key":1}', nullKey=NULL} [{'v': '{"key":1}'}, {'v': NULL}, NULL, {'v': '{"key":1}'}, NULL, {'v': '{"key":1}'}] {'v': [NULL, '{"key":1}']}
1 {"key":1} ['{"key":1}', NULL, '{"key":1}', NULL, '{"key":1}'] {'v': '{"key":1}'} {1='{"key":1}', nullKey=NULL} [{'v': '{"key":1}'}, {'v': NULL}, NULL, {'v': '{"key":1}'}, NULL, {'v': '{"key":1}'}] {'v': [NULL, '{"key":1}']}
2 {"key":2} ['{"key":2}', NULL, '{"key":2}', NULL, '{"key":2}'] {'v': '{"key":2}'} {2='{"key":2}', nullKey=NULL} [{'v': '{"key":2}'}, {'v': NULL}, NULL, {'v': '{"key":2}'}, NULL, {'v': '{"key":2}'}] {'v': [NULL, '{"key":2}']}
3 {"key":3} ['{"key":3}', NULL, '{"key":3}', NULL, '{"key":3}'] {'v': '{"key":3}'} {3='{"key":3}', nullKey=NULL} [{'v': '{"key":3}'}, {'v': NULL}, NULL, {'v': '{"key":3}'}, NULL, {'v': '{"key":3}'}] {'v': [NULL, '{"key":3}']}
4 {"key":4} ['{"key":4}', NULL, '{"key":4}', NULL, '{"key":4}'] {'v': '{"key":4}'} {4='{"key":4}', nullKey=NULL} [{'v': '{"key":4}'}, {'v': NULL}, NULL, {'v': '{"key":4}'}, NULL, {'v': '{"key":4}'}] {'v': [NULL, '{"key":4}']}
5 {"key":5} ['{"key":5}', NULL, '{"key":5}', NULL, '{"key":5}'] {'v': '{"key":5}'} {5='{"key":5}', nullKey=NULL} [{'v': '{"key":5}'}, {'v': NULL}, NULL, {'v': '{"key":5}'}, NULL, {'v': '{"key":5}'}] {'v': [NULL, '{"key":5}']}
6 {"key":6} ['{"key":6}', NULL, '{"key":6}', NULL, '{"key":6}'] {'v': '{"key":6}'} {6='{"key":6}', nullKey=NULL} [{'v': '{"key":6}'}, {'v': NULL}, NULL, {'v': '{"key":6}'}, NULL, {'v': '{"key":6}'}] {'v': [NULL, '{"key":6}']}
7 {"key":7} ['{"key":7}', NULL, '{"key":7}', NULL, '{"key":7}'] {'v': '{"key":7}'} {7='{"key":7}', nullKey=NULL} [{'v': '{"key":7}'}, {'v': NULL}, NULL, {'v': '{"key":7}'}, NULL, {'v': '{"key":7}'}] {'v': [NULL, '{"key":7}']}

View File

@@ -0,0 +1,189 @@
# name: test/parquet/variant/variant_partially_shredded.test
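# description: Test that all partially shredded variants of the same data read back to the same result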
# group: [variant]
require parquet
query II nosort result
from 'data/parquet-testing/variant_partial_shredded0.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded1.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded2.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded3.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded4.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded5.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded6.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded7.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded8.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded9.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded10.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded11.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded12.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded13.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded14.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded15.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded16.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded17.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded18.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded19.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded20.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded21.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded22.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded23.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded24.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded25.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded26.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded27.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded28.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded29.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded30.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded31.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded32.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded33.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded34.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded35.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded36.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded37.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded38.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded39.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded40.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded41.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded42.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded43.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded44.parquet';
----
query II nosort result
from 'data/parquet-testing/variant_partial_shredded45.parquet';
----

View File

@@ -0,0 +1,39 @@
# name: test/parquet/variant/variant_roundtrip.test_slow
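# description: Test roundtripping existing Parquet test files through a VARIANT cast and a Parquet write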
# group: [variant]
require parquet
require json
foreach parquet_file p2strings.parquet p2.parquet pandas-date.parquet parquet_with_json.parquet spark-store.parquet struct_skip_test.parquet timestamp.parquet candidate.parquet
statement ok
COPY (
SELECT
COLUMNS(*)::VARIANT
FROM read_parquet('data/parquet-testing/${parquet_file}')
) TO '__TEST_DIR__/variant_${parquet_file}' (FORMAT PARQUET);
query I nosort expected_res
SELECT COLUMNS(*)::VARIANT FROM read_parquet('data/parquet-testing/${parquet_file}')
query I nosort expected_res
SELECT COLUMNS(*)::VARIANT FROM read_parquet('__TEST_DIR__/variant_${parquet_file}')
reset label expected_res
endloop
foreach parquet_file 7-set.snappy.arrow2.parquet adam_genotypes.parquet apkwan.parquet arrow_nan.parquet aws_kinesis.parquet aws1.snappy.parquet aws2.parquet bigdecimal.parquet binary_string.parquet blob.parquet boolean_stats.parquet bug13053-2.parquet bug13053.parquet bug14120-dict-nulls-only.parquet bug1554.parquet bug1588.parquet bug1589.parquet bug1618_struct_strings.parquet bug2267.parquet bug2557.parquet bug3734.parquet bug4442.parquet bug4859.parquet bug4903.parquet bug687_nulls.parquet byte_stream_split.parquet CASE_INSENSITIVE.PARQUET complex.parquet corrupt_stats.parquet data-types.parquet date.parquet delta_byte_array.parquet delta_length_byte_array.parquet empty.parquet enum.parquet file_row_number.parquet filter_bug1391.parquet fixed.parquet float16.parquet incorrect_index_page_offsets.parquet issue_6013.parquet issue10279_delta_encoding.parquet issue12621.parquet issue6630_1.parquet issue6630_2.parquet issue6990.parquet issue9417.parquet leftdate3_192_loop_1.parquet lineitem-top10000.gzip.parquet list_sort_segfault.parquet manyrowgroups.parquet manyrowgroups2.parquet map.parquet multi_bloom_a.parquet multi_bloom_b.parquet multi_bloom_c.parquet nan-float.parquet nullbyte_multiple.parquet nullbyte.parquet parquet_go.parquet rle_boolean_encoding.parquet seqs_table.parquet signed_stats.parquet silly-names.parquet simple.parquet sorted.zstd_18_131072_small.parquet spark-ontime.parquet struct.parquet test_unnest_rewriter.parquet timestamp-ms.parquet tz.parquet upsert_bug.parquet userdata1.parquet varchar_stats.parquet zstd.parquet
statement ok
COPY (
SELECT
COLUMNS(*)::VARIANT
FROM read_parquet('data/parquet-testing/${parquet_file}')
) TO '__TEST_DIR__/variant_${parquet_file}' (FORMAT PARQUET);
statement ok
SELECT COLUMNS(*)::VARIANT FROM read_parquet('__TEST_DIR__/variant_${parquet_file}')
endloop

View File

@@ -0,0 +1,210 @@
# name: test/parquet/variant/variant_shredded.test
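# description: Test reading shredded VARIANT primitive values from Parquet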
# group: [variant]
require parquet
# Timestamp NS - positive (no timezone)
query II
from 'data/parquet-testing/variant_shredded_timestamp_nanos_ntz_positive.parquet';
----
1 "2024-11-07 12:33:54.123456789"
# Float - negative
query II
from 'data/parquet-testing/variant_shredded_float_negative.parquet';
----
1 -10.109999656677246
# Int64 - negative
query II
from 'data/parquet-testing/variant_shredded_int64_negative.parquet';
----
1 -9876543210
# Decimal16 - negative
query II
from 'data/parquet-testing/variant_shredded_decimal16_negative.parquet';
----
1 "-9876543210.123456789"
# UUID
query II
from 'data/parquet-testing/variant_shredded_uuid.parquet';
----
1 "f24f9b64-81fa-49d1-b74e-8c09a6e31c56"
# Decimal4 - negative
query II
from 'data/parquet-testing/variant_shredded_decimal4_negative.parquet';
----
1 "-123456.789"
# Decimal4 - positive
query II
from 'data/parquet-testing/variant_shredded_decimal4_positive.parquet';
----
1 "123456.789"
# Timestamp Micros - negative (no timezone)
query II
from 'data/parquet-testing/variant_shredded_timestamp_micros_ntz_negative.parquet';
----
1 "1957-11-07 12:33:54.123456"
# Date - negative
query II
from 'data/parquet-testing/variant_shredded_date_negative.parquet';
----
1 "1957-11-07"
# int8 - positive
query II
from 'data/parquet-testing/variant_shredded_int8_positive.parquet';
----
1 34
# int16 - positive
query II
from 'data/parquet-testing/variant_shredded_int16_positive.parquet';
----
1 1234
# decimal8 - negative
query II
from 'data/parquet-testing/variant_shredded_decimal8_negative.parquet';
----
1 "-123456789.987654321"
# string
query II
from 'data/parquet-testing/variant_shredded_string.parquet';
----
1 "iceberg"
# FIXME: this is actually a Timestamp Nanos - positive (with timezone)
# Timestamp Micros - positive (with timezone)
query II
from 'data/parquet-testing/variant_shredded_timestamp_micros_tz_positive.parquet';
----
1 "2024-11-07 12:33:54.123456+00"
# binary
query II
from 'data/parquet-testing/variant_shredded_binary.parquet';
----
1 "CgsMDQ=="
# float - positive
query II
from 'data/parquet-testing/variant_shredded_float_positive.parquet';
----
1 10.109999656677246
# double - positive
query II
from 'data/parquet-testing/variant_shredded_double_positive.parquet';
----
1 14.3
# decimal16 - positive
query II
from 'data/parquet-testing/variant_shredded_decimal16_positive.parquet';
----
1 "9876543210.123456789"
# Timestamp Micros - positive (no timezone)
query II
from 'data/parquet-testing/variant_shredded_timestamp_micros_ntz_positive.parquet';
----
1 "2024-11-07 12:33:54.123456"
# int16 - negative
query II
from 'data/parquet-testing/variant_shredded_int16_negative.parquet';
----
1 -1234
# Timestamp Micros - positive (with timezone)
query II
from 'data/parquet-testing/variant_shredded_timestamp_micros_tz_positive2.parquet';
----
1 "2024-11-07 12:33:54.123456+00"
# Timestamp Micros - negative (with timezone)
query II
from 'data/parquet-testing/variant_shredded_timestamp_micros_tz_negative.parquet';
----
1 "1957-11-07 12:33:54.123456+00"
# decimal8 - positive
query II
from 'data/parquet-testing/variant_shredded_decimal8_positive.parquet';
----
1 "123456789.987654321"
# Timestamp Nanos - negative (no timezone)
query II
from 'data/parquet-testing/variant_shredded_timestamp_nanos_ntz_negative.parquet';
----
1 "1957-11-07 12:33:54.123456789"
# int32 - positive
query II
from 'data/parquet-testing/variant_shredded_int32_positive.parquet';
----
1 12345
# int32 - negative
query II
from 'data/parquet-testing/variant_shredded_int32_negative.parquet';
----
1 -12345
# FIXME: this is actually a Timestamp Nanos - negative (with timezone)
# Timestamp Micros - negative (with timezone)
query II
from 'data/parquet-testing/variant_shredded_timestamp_micros_tz_negative2.parquet';
----
1 "1957-11-07 12:33:54.123457+00"
# int8 - negative
query II
from 'data/parquet-testing/variant_shredded_int8_negative.parquet';
----
1 -34
# Time Micros (no timezone)
query II
from 'data/parquet-testing/variant_shredded_time_micros_ntz.parquet';
----
1 "12:33:54.123456"
# Date - positive
query II
from 'data/parquet-testing/variant_shredded_date_positive.parquet';
----
1 "2024-11-07"
# bool - true
query II
from 'data/parquet-testing/variant_shredded_bool_true.parquet';
----
1 true
# int64 - positive
query II
from 'data/parquet-testing/variant_shredded_int64_positive.parquet';
----
1 9876543210
# double - negative
query II
from 'data/parquet-testing/variant_shredded_double_negative.parquet';
----
1 -14.3
# bool - false
query II
from 'data/parquet-testing/variant_shredded_bool_false.parquet';
----
1 false

View File

@@ -0,0 +1,40 @@
# name: test/parquet/variant/variant_shredded_nested.test
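# description: Test reading shredded VARIANT arrays and objects from Parquet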
# group: [variant]
require parquet
# Array
query II
from 'data/parquet-testing/variant_shredded_array1.parquet';
----
1 [["string","iceberg"],["apple","banana"]]
# Array
query II
from 'data/parquet-testing/variant_shredded_array2.parquet';
----
1 [{"a":123456789,"c":"string"},{"a":123456789,"c":"string"}]
# Array
query II
from 'data/parquet-testing/variant_shredded_array3.parquet';
----
1 ["iceberg","string"]
# Object
query II
from 'data/parquet-testing/variant_shredded_object1.parquet';
----
1 {"a":123456789,"c":"string"}
# Object
query II
from 'data/parquet-testing/variant_shredded_object2.parquet';
----
1 {"a":null,"d":"iceberg"}
# Object
query II
from 'data/parquet-testing/variant_shredded_object3.parquet';
----
1 {"a":123456789,"c":["string","iceberg"]}

View File

@@ -0,0 +1,17 @@
# name: test/parquet/variant/variant_to_parquet_variant.test
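# description: Test the variant_to_parquet_variant conversion function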
# group: [variant]
require parquet
require json
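# A NULL input produces an empty metadata dictionary and a variant null value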
query I
select variant_to_parquet_variant(NULL)
----
{'metadata': \x11\x00\x00, 'value': \x00}
# We don't expose the overload with a shredded type; it is only used internally
statement error
select variant_to_parquet_variant(NULL, 'STRUCT(a VARCHAR)'::VARCHAR)
----
Binder Error