should be it

external/duckdb/test/parquet/test_parquet_schema.test (new vendored file, 357 lines)
@@ -0,0 +1,357 @@
# name: test/parquet/test_parquet_schema.test
# description: Parquet reader schema parameter tests
# group: [parquet]

require parquet

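# write a Parquet file with an explicit field id that the schema parameter can refer to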
statement ok
COPY (SELECT 42::INTEGER i) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {i: 0})

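# the schema argument must be a map with STRUCT values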
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map{})
----
Invalid Input Error: 'schema' expects a STRUCT as the value type of the map

# can't combine with union_by_name
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, union_by_name=true)
----
Binder Error: Parquet schema cannot be combined with union_by_name=true or hive_partitioning=true

# can't combine with hive_partitioning
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/*.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, hive_partitioning=true)
----
Binder Error: Parquet schema cannot be combined with union_by_name=true or hive_partitioning=true

statement ok
COPY (
    SELECT 1 i1, 3 i3, 4 i4, 5 i5 UNION ALL
    SELECT 2 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/partitioned' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1}, PARTITION_BY i1, FORMAT parquet, WRITE_PARTITION_COLUMNS)

# auto-detection of hive partitioning is enabled by default,
# but automatically disabled when a schema is supplied, so this should succeed
query IIII
SELECT *
FROM read_parquet('__TEST_DIR__/partitioned/*/*.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5	3	2	1
5	3	2	2

# when partition columns are specified in FIELD_IDS, error message should suggest WRITE_PARTITION_COLUMNS option
statement error
COPY (
    SELECT 1 i1, 3 i3, 4 i4, 5 i5 UNION ALL
    SELECT 2 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/partitioned2' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1}, PARTITION_BY i1, FORMAT parquet)
----
Binder Error: Column name "i1" specified in FIELD_IDS not found. Consider using WRITE_PARTITION_COLUMNS if this column is a partition column. Available column names:

# cannot duplicate field_ids
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    0: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
Map keys must be unique

# cannot duplicate column names
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'cool_column', type: 'BIGINT', default_value: NULL},
    1: {name: 'cool_column', type: 'UTINYINT', default_value: 43}
}) pq
----
Binder Error: table "pq" has duplicate column name "cool_column"

# the supplied default value must be castable to the given type for that column
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'cool_column', type: 'BIGINT', default_value: NULL},
    1: {name: 'cool_column', type: 'UTINYINT', default_value: 'bla'}
}) pq
----
Binder Error: Unable to cast Parquet schema default_value "bla" to UTINYINT

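# DESCRIBE reflects the renamed/added columns and the types from the schema parameter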
query IIIIII
DESCRIBE SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
renamed_i	BIGINT	YES	NULL	NULL	NULL
new_column	UTINYINT	YES	NULL	NULL	NULL

query IIIIII
DESCRIBE SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
}, filename=true)
----
renamed_i	BIGINT	YES	NULL	NULL	NULL
new_column	UTINYINT	YES	NULL	NULL	NULL
filename	VARCHAR	YES	NULL	NULL	NULL

# we'll test if filename works on a persistent file, otherwise __TEST_DIR__ will be different every time
query II
SELECT *
FROM read_parquet('data/parquet-testing/enum.parquet', schema=map {
    1: {name: 'cool_column', type: 'VARCHAR', default_value: NULL}
}, filename=true)
LIMIT 1
----
1	data/parquet-testing/enum.parquet

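# the schema renames field id 0 and fills in the default value 43 for the missing field id 1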
query II
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
42	43

# we just get a cast error when we can't cast to the supplied type
statement error
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'DATE', default_value: NULL}
})
----
Conversion Error

# if we don't supply a field id, we can't refer to it using the schema parameter
statement ok
COPY (SELECT 42::INTEGER i) TO '__TEST_DIR__/integers.parquet'

query II
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    0: {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    1: {name: 'new_column', type: 'UTINYINT', default_value: 43}
})
----
NULL	43

# let's spice it up with more columns
statement ok
COPY (
    SELECT 1 i1, 3 i3, 4 i4, 5 i5
) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {i1: 5, i3: 3, i4: 2, i5: 1})

# this is purposely a bit confusing but we're:
# 1. deleting field id 2
# 2. creating field id 4
# 3. reversing the order of the columns
# 4. renaming them (except i3)
# 5. upcasting them
query IIII
SELECT *
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5	3	2	1

# projection still ok
query I
SELECT i1
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
5

# we can still select virtual columns as well
query III
SELECT file_row_number, filename[-16:], i4
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
0	integers.parquet	2

# projection still ok, even with different generated columns
query III
SELECT file_row_number, filename[-16:], i4
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, file_row_number=1, filename=1)
----
0	integers.parquet	2

# count(*) still ok
query I
SELECT count(*)
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
})
----
1

# combine with constant column
query II
SELECT i1, filename[-16:]
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, filename=true)
----
5	integers.parquet

statement ok
COPY (
    SELECT range % 4 g, range i FROM range(1000)
) TO '__TEST_DIR__/integers.parquet' (FIELD_IDS {g: 33, i: 42})

# let's also do a query with a filter and a downcast
query II
SELECT my_cool_group, sum(my_cool_value)
FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    33: {name: 'my_cool_group', type: 'UINTEGER', default_value: NULL},
    42: {name: 'my_cool_value', type: 'UINTEGER', default_value: NULL}
})
WHERE my_cool_group = 2
GROUP BY my_cool_group
----
2	125000

# also test multi-file reading with different field ids
# field id -> value:
# 1 -> 5
# 2 -> 4 (unused)
# 3 -> 3
# 4 -> - (missing)
# 5 -> 1
statement ok
COPY (
    SELECT
        1 i1,
        3 i3,
        4 i4,
        5 i5
) TO '__TEST_DIR__/multifile1.parquet' (FIELD_IDS {
    i1: 5,
    i3: 3,
    i4: 2,
    i5: 1
})

# field_id -> value:
# 1 -> 1
# 2 -> 3 (unused)
# 3 -> 4
# 4 -> 5
# 5 -> - (missing)
statement ok
COPY (
    SELECT
        1 j1,
        3 j3,
        4 j4,
        5 j5
) TO '__TEST_DIR__/multifile2.parquet' (FIELD_IDS {
    j1: 1,
    j3: 2,
    j4: 3,
    j5: 4
})

query IIIII
SELECT i1, i3, i4, i5, filename[-18:]
FROM read_parquet('__TEST_DIR__/multifile*.parquet', schema=map {
    1: {name: 'i1', type: 'BIGINT', default_value: NULL},
    3: {name: 'i3', type: 'BIGINT', default_value: NULL},
    4: {name: 'i4', type: 'BIGINT', default_value: 2},
    5: {name: 'i5', type: 'BIGINT', default_value: NULL}
}, filename=true)
ORDER BY filename
----
5	3	2	1	multifile1.parquet
1	4	5	NULL	multifile2.parquet

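# schema map keys must be INTEGER or VARCHAR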
statement error
select * FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    True: {name: 'my_cool_group', type: 'UINTEGER', default_value: NULL},
    False: {name: 'my_cool_value', type: 'UINTEGER', default_value: NULL}
});
----
Invalid Input Error: 'schema' expects the value type of the map to be either INTEGER or VARCHAR, not BOOLEAN

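# with VARCHAR keys, columns are matched by name rather than by field id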
query II
SELECT alias(COLUMNS(*)) FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
    'i': {name: 'renamed_i', type: 'BIGINT', default_value: NULL},
    'j': {name: 'new_column', type: 'UTINYINT', default_value: 43}
}) limit 1;
----
renamed_i	new_column

# issue 15504
statement ok
COPY (select 1 as id, list_value('a', 'b', 'c') as arr, { key: 1, v1: 'a', v2: 'b' } as s) TO '__TEST_DIR__/15504.parquet' (field_ids { 'id': 0, 'arr': 1, 's': 2 });

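# the schema's type strings can also be nested types (lists and structs)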
query III
SELECT * FROM read_parquet('__TEST_DIR__/15504.parquet', schema=map { 0: { name: 'id', type: 'int32', default_value: NULL }, 1: { name: 'arr', type: 'varchar[]', default_value: NULL }, 2: { name: 's', type: 'STRUCT(key INT, v1 TEXT, v2 TEXT)', default_value: NULL } });
----
1	[a, b, c]	{'key': 1, 'v1': a, 'v2': b}

# issue 16094
statement ok
copy (
    select
        x
    from generate_series(1,100) as g(x)
) to '__TEST_DIR__/16094.parquet'
with (
    field_ids {x: 1}
);

statement ok
select
    x,
    filename
from read_parquet(
    '__TEST_DIR__/16094.parquet',
    schema=map {
        1: {name: 'x', type: 'int', default_value: NULL}
    },
    filename=True
) where x = 1;