Files
email-tracker/external/duckdb/test/sql/copy/parquet/parquet_stats.test
2025-10-24 19:21:19 -05:00

220 lines
8.3 KiB
SQL

# name: test/sql/copy/parquet/parquet_stats.test
# description: Test stats reading in parquet reader
# group: [parquet]
require parquet

# Layout of this file: most sections first query parquet_metadata() for the four
# statistics columns (stats_min, stats_max, stats_min_value, stats_max_value) of a
# test file and then read the file itself, so the expected statistics can be
# compared against the stored values.
# Records are separated by blank lines and expected columns by tabs, because
# several expected values (timestamps, 'hello world') contain spaces themselves.

# boolean values
query IIII
select stats_min, stats_max, stats_min_value, stats_max_value from parquet_metadata('data/parquet-testing/boolean_stats.parquet');
----
false	true	false	true

# signed numbers
# one metadata row per column; the expected bounds are the extreme values of
# 8-, 16-, 32- and 64-bit signed integers
query IIII
select stats_min, stats_max, stats_min_value, stats_max_value from parquet_metadata('data/parquet-testing/signed_stats.parquet');
----
-128	127	-128	127
-32768	32767	-32768	32767
-2147483648	2147483647	-2147483648	2147483647
-9223372036854775808	9223372036854775807	-9223372036854775808	9223372036854775807

# the file holds exactly the two rows (all minima, all maxima) the stats describe
query IIII
select * from 'data/parquet-testing/signed_stats.parquet';
----
-128	-32768	-2147483648	-9223372036854775808
127	32767	2147483647	9223372036854775807

# unsigned numbers
# stats_min/stats_max are expected to be NULL for every column except the one
# whose maximum is 4294967295; stats_min_value/stats_max_value are always set
query IIII
select stats_min, stats_max, stats_min_value, stats_max_value from parquet_metadata('data/parquet-testing/unsigned_stats.parquet');
----
NULL	NULL	0	255
NULL	NULL	0	65535
0	4294967295	0	4294967295
NULL	NULL	0	18446744073709551615

query IIII
select * from 'data/parquet-testing/unsigned_stats.parquet';
----
0	0	0	0
255	65535	4294967295	18446744073709551615

# dates/times/timestamps
# six columns: a date, a time with offset, and four timestamp columns
# (the first timestamp column starts in 1990, the other three in 1900)
query IIII
select stats_min, stats_max, stats_min_value, stats_max_value from parquet_metadata('data/parquet-testing/date_stats.parquet');
----
1900-01-01	2030-12-31	1900-01-01	2030-12-31
00:00:00+00	23:59:59+00	00:00:00+00	23:59:59+00
1990-01-01 00:00:00	2030-12-31 23:59:59	1990-01-01 00:00:00	2030-12-31 23:59:59
1900-01-01 00:00:00	2030-12-31 23:59:59	1900-01-01 00:00:00	2030-12-31 23:59:59
1900-01-01 00:00:00	2030-12-31 23:59:59	1900-01-01 00:00:00	2030-12-31 23:59:59
1900-01-01 00:00:00	2030-12-31 23:59:59	1900-01-01 00:00:00	2030-12-31 23:59:59

query IIIIII
select * from 'data/parquet-testing/date_stats.parquet';
----
1900-01-01	00:00:00+00	1990-01-01 00:00:00	1900-01-01 00:00:00	1900-01-01 00:00:00	1900-01-01 00:00:00
2030-12-31	23:59:59+00	2030-12-31 23:59:59	2030-12-31 23:59:59	2030-12-31 23:59:59	2030-12-31 23:59:59

# varchar/blob stats
# only stats_min_value/stats_max_value are expected to be populated here;
# the second column's values are rendered with an embedded \x00 escape
query IIII
select stats_min, stats_max, stats_min_value, stats_max_value from parquet_metadata('data/parquet-testing/varchar_stats.parquet');
----
NULL	NULL	hello world	world hello
NULL	NULL	hello\x00world	world\x00hello

# should be the same as computing min/max over these columns
query IIII
select min(str_val), max(str_val), min("hello\x00world"), max("hello\x00world") from 'data/parquet-testing/varchar_stats.parquet';
----
hello world	world hello	hello\x00world	world\x00hello

query II
select * from 'data/parquet-testing/varchar_stats.parquet';
----
hello world	hello\x00world
world hello	world\x00hello

# decimal stats
# four decimal columns of increasing precision, each bounded by +/- its largest value
query IIII
select stats_min, stats_max, stats_min_value, stats_max_value from parquet_metadata('data/parquet-testing/decimal_stats.parquet');
----
-999.9	999.9	-999.9	999.9
-999999.999	999999.999	-999999.999	999999.999
-9999999999999.99999	9999999999999.99999	-9999999999999.99999	9999999999999.99999
-999999999999999999999999999999999.99999	999999999999999999999999999999999.99999	-999999999999999999999999999999999.99999	999999999999999999999999999999999.99999

query IIII
select * from 'data/parquet-testing/decimal_stats.parquet';
----
-999.9	-999999.999	-9999999999999.99999	-999999999999999999999999999999999.99999
999.9	999999.999	9999999999999.99999	999999999999999999999999999999999.99999

# int32 decimal stats
# this file only carries stats_min/stats_max; the *_value columns are expected to be NULL
query IIII
select stats_min, stats_max, stats_min_value, stats_max_value from parquet_metadata('data/parquet-testing/arrow/int32_decimal.parquet');
----
1.00	24.00	NULL	NULL

query I
SELECT * FROM 'data/parquet-testing/arrow/int32_decimal.parquet'
----
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00

# int64 decimal stats
# same expectations as the int32 decimal file above
query IIII
select stats_min, stats_max, stats_min_value, stats_max_value from parquet_metadata('data/parquet-testing/arrow/int64_decimal.parquet');
----
1.00	24.00	NULL	NULL

query I
SELECT * FROM 'data/parquet-testing/arrow/int64_decimal.parquet'
----
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00

# data-types stats
# twelve columns, one metadata row each; the all-NULL rows in the file do not
# show up in the expected min/max values
query IIII
SELECT stats_min, stats_max, stats_min_value, stats_max_value FROM parquet_metadata('data/parquet-testing/data-types.parquet')
----
-127	127	-127	127
-32767	32767	-32767	32767
-2147483647	2147483647	-2147483647	2147483647
-9223372036854775807	9223372036854775807	-9223372036854775807	9223372036854775807
-4.6	4.6	-4.6	4.6
-4.7	4.7	-4.7	4.7
4.80	4.80	4.80	4.80
49	49	49	49
50	50	50	50
false	true	false	true
2019-11-26 20:11:42.501	2019-11-26 20:11:42.501	2019-11-26 20:11:42.501	2019-11-26 20:11:42.501
2020-01-10	2020-01-10	2020-01-10	2020-01-10

query IIIIIIIIIIII
SELECT * FROM 'data/parquet-testing/data-types.parquet'
----
NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL
42	43	44	45	4.600000	4.700000	4.80	49	50	True	2019-11-26 20:11:42.501	2020-01-10
-127	-32767	-2147483647	-9223372036854775807	-4.600000	-4.700000	NULL	NULL	NULL	False	NULL	NULL
127	32767	2147483647	9223372036854775807	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL
NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL

# parquet stats for all parquet files
# smoke test: parquet_metadata() must succeed on every listed file; no output is compared.
# The whole value list has to stay on the single foreach line.
foreach parquet_file data/parquet-testing/manyrowgroups.parquet data/parquet-testing/map.parquet data/parquet-testing/arrow/int32_decimal.parquet data/parquet-testing/arrow/nonnullable.impala.parquet data/parquet-testing/bug687_nulls.parquet data/parquet-testing/bug1554.parquet data/parquet-testing/apkwan.parquet data/parquet-testing/arrow/nested_lists.snappy.parquet data/parquet-testing/arrow/nulls.snappy.parquet data/parquet-testing/nan-float.parquet data/parquet-testing/manyrowgroups2.parquet data/parquet-testing/struct.parquet data/parquet-testing/arrow/list_columns.parquet data/parquet-testing/timestamp-ms.parquet data/parquet-testing/arrow/alltypes_dictionary.parquet data/parquet-testing/arrow/binary.parquet data/parquet-testing/arrow/nation.dict-malformed.parquet data/parquet-testing/lineitem-top10000.gzip.parquet data/parquet-testing/arrow/nested_maps.snappy.parquet data/parquet-testing/arrow/dict-page-offset-zero.parquet data/parquet-testing/silly-names.parquet data/parquet-testing/zstd.parquet data/parquet-testing/bug1618_struct_strings.parquet data/parquet-testing/arrow/single_nan.parquet data/parquet-testing/arrow/int64_decimal.parquet data/parquet-testing/filter_bug1391.parquet data/parquet-testing/arrow/fixed_length_decimal_legacy.parquet data/parquet-testing/timestamp.parquet data/parquet-testing/arrow/fixed_length_decimal.parquet data/parquet-testing/leftdate3_192_loop_1.parquet data/parquet-testing/blob.parquet data/parquet-testing/bug1588.parquet data/parquet-testing/bug1589.parquet data/parquet-testing/arrow/alltypes_plain.parquet data/parquet-testing/arrow/repeated_no_annotation.parquet data/parquet-testing/data-types.parquet data/parquet-testing/unsigned.parquet data/parquet-testing/pandas-date.parquet data/parquet-testing/date.parquet data/parquet-testing/arrow/nullable.impala.parquet data/parquet-testing/fixed.parquet data/parquet-testing/arrow/alltypes_plain.snappy.parquet data/parquet-testing/decimal/int32_decimal.parquet data/parquet-testing/decimal/pandas_decimal.parquet data/parquet-testing/decimal/decimal_dc.parquet data/parquet-testing/decimal/int64_decimal.parquet data/parquet-testing/decimal/fixed_length_decimal_legacy.parquet data/parquet-testing/decimal/fixed_length_decimal.parquet data/parquet-testing/glob2/t1.parquet data/parquet-testing/cache/cache1.parquet data/parquet-testing/cache/cache2.parquet data/parquet-testing/glob/t2.parquet data/parquet-testing/glob/t1.parquet data/parquet-testing/bug2557.parquet

statement ok
select * from parquet_metadata('${parquet_file}');

endloop

# internal issue 2037
# an empty string written to parquet must read back as '' (not NULL), and its
# min/max statistics must be present as well
statement ok
copy (select '' i) to '__TEST_DIR__/test.parquet';

query I
select i is null c0 from '__TEST_DIR__/test.parquet';
----
false

query II
select stats_min_value is null c0, stats_max_value is null c1 from parquet_metadata('__TEST_DIR__/test.parquet');
----
false	false

# row-group byte counts for the file written above
query II
select row_group_bytes, row_group_compressed_bytes from parquet_metadata('__TEST_DIR__/test.parquet');
----
27	29

# row-group byte counts for a checked-in file; one metadata row per column
query II
select row_group_bytes, row_group_compressed_bytes from parquet_metadata('data/parquet-testing/varchar_stats.parquet');
----
200	208
200	208