Files
email-tracker/external/duckdb/test/sql/copy/parquet/parquet_metadata.test
2025-10-24 19:21:19 -05:00

95 lines
2.5 KiB
SQL

# name: test/sql/copy/parquet/parquet_metadata.test
# description: Test parquet metadata function
# group: [parquet]

# Format notes: every record below is terminated by a blank line, and the
# columns of multi-column expected results are separated by a single tab.

require parquet

# both table functions run successfully on a single file
statement ok
SELECT * FROM parquet_metadata('data/parquet-testing/lineitem-top10000.gzip.parquet');

statement ok
SELECT * FROM parquet_schema('data/parquet-testing/lineitem-top10000.gzip.parquet');

# both table functions return at least one row for that file
query I
SELECT COUNT(*) > 0 FROM parquet_metadata('data/parquet-testing/lineitem-top10000.gzip.parquet');
----
true

query I
SELECT COUNT(*) > 0 FROM parquet_schema('data/parquet-testing/lineitem-top10000.gzip.parquet');
----
true

# decimal files
statement ok
SELECT * FROM parquet_schema('data/parquet-testing/decimal/decimal_dc.parquet');

statement ok
SELECT * FROM parquet_schema('data/parquet-testing/decimal/int64_decimal.parquet');

# with globs
statement ok
SELECT * FROM parquet_metadata('data/parquet-testing/glob/*.parquet');

statement ok
SELECT * FROM parquet_schema('data/parquet-testing/glob/*.parquet');

# list parameters
statement ok
SELECT * FROM parquet_schema(['data/parquet-testing/decimal/int64_decimal.parquet', 'data/parquet-testing/decimal/int64_decimal.parquet']);

# parquet type and DuckDB type of every schema element that has a type;
# ordered by column_id so the comparison does not depend on scan order
query III
SELECT name, type, duckdb_type FROM parquet_schema('data/parquet-testing/lineitem-top10000.gzip.parquet') WHERE type IS NOT NULL ORDER BY column_id;
----
l_orderkey	INT64	BIGINT
l_partkey	INT64	BIGINT
l_suppkey	INT64	BIGINT
l_linenumber	INT32	INTEGER
l_quantity	INT32	INTEGER
l_extendedprice	DOUBLE	DOUBLE
l_discount	DOUBLE	DOUBLE
l_tax	DOUBLE	DOUBLE
l_returnflag	BYTE_ARRAY	VARCHAR
l_linestatus	BYTE_ARRAY	VARCHAR
l_shipdate	BYTE_ARRAY	VARCHAR
l_commitdate	BYTE_ARRAY	VARCHAR
l_receiptdate	BYTE_ARRAY	VARCHAR
l_shipinstruct	BYTE_ARRAY	VARCHAR
l_shipmode	BYTE_ARRAY	VARCHAR
l_comment	BYTE_ARRAY	VARCHAR

# column_id
query II
SELECT column_id, name FROM parquet_schema('data/parquet-testing/lineitem-top10000.gzip.parquet') ORDER BY column_id;
----
0	spark_schema
1	l_orderkey
2	l_partkey
3	l_suppkey
4	l_linenumber
5	l_quantity
6	l_extendedprice
7	l_discount
8	l_tax
9	l_returnflag
10	l_linestatus
11	l_shipdate
12	l_commitdate
13	l_receiptdate
14	l_shipinstruct
15	l_shipmode
16	l_comment

# recursive glob: total schema rows over all matched files, the largest
# per-file row count, and the number of distinct column_id values seen
# (the expected output shows column_id values repeating across files)
query III
WITH per_file AS (
    SELECT
        file_name,
        COUNT(*) AS rows_per_file
    FROM parquet_schema('data/parquet-testing/glob3/**/*.parquet')
    GROUP BY file_name
)
SELECT
    SUM(rows_per_file) AS total_rows,
    MAX(rows_per_file) AS max_rows_per_filename,
    (SELECT COUNT(DISTINCT column_id) FROM parquet_schema('data/parquet-testing/glob3/**/*.parquet')) AS distinct_column_ids
FROM per_file;
----
9	3	3