should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion


@@ -0,0 +1,16 @@
add_library_unity(
duckdb_parquet_writers
OBJECT
array_column_writer.cpp
boolean_column_writer.cpp
decimal_column_writer.cpp
enum_column_writer.cpp
list_column_writer.cpp
primitive_column_writer.cpp
struct_column_writer.cpp)
add_subdirectory(variant)
set(PARQUET_EXTENSION_FILES
${PARQUET_EXTENSION_FILES} $<TARGET_OBJECTS:duckdb_parquet_writers>
PARENT_SCOPE)


@@ -0,0 +1,75 @@
#include "writer/array_column_writer.hpp"
namespace duckdb {
void ArrayColumnWriter::Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto &array_child = ArrayVector::GetEntry(vector);
auto array_size = ArrayType::GetSize(vector.GetType());
GetChildWriter().Analyze(*state.child_state, &state_p, array_child, array_size * count);
}
void ArrayColumnWriter::WriteArrayState(ListColumnWriterState &state, idx_t array_size, uint16_t first_repeat_level,
idx_t define_value, const bool is_empty) {
state.definition_levels.push_back(define_value);
state.repetition_levels.push_back(first_repeat_level);
state.is_empty.push_back(is_empty);
if (is_empty) {
return;
}
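// every element after the first repeats at MaxRepeat() + 1, marking it as part of the same array entry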
for (idx_t k = 1; k < array_size; k++) {
state.repetition_levels.push_back(MaxRepeat() + 1);
state.definition_levels.push_back(define_value);
state.is_empty.push_back(false);
}
}
void ArrayColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto array_size = ArrayType::GetSize(vector.GetType());
auto &validity = FlatVector::Validity(vector);
// write definition levels and repeats
// the main difference between this and ListColumnWriter::Prepare is that we need to make sure to write out
// repetition levels and definitions for the child elements of the array even if the array itself is NULL.
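// when nested inside another list/array, the parent dictates how many entries (including empty/NULL ones) we must emit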
idx_t vcount = parent ? parent->definition_levels.size() - state.parent_index : count;
idx_t vector_index = 0;
for (idx_t i = 0; i < vcount; i++) {
idx_t parent_index = state.parent_index + i;
if (parent && !parent->is_empty.empty() && parent->is_empty[parent_index]) {
WriteArrayState(state, array_size, parent->repetition_levels[parent_index],
parent->definition_levels[parent_index], true);
continue;
}
auto first_repeat_level =
parent && !parent->repetition_levels.empty() ? parent->repetition_levels[parent_index] : MaxRepeat();
if (parent && parent->definition_levels[parent_index] != PARQUET_DEFINE_VALID) {
WriteArrayState(state, array_size, first_repeat_level, parent->definition_levels[parent_index]);
} else if (validity.RowIsValid(vector_index)) {
// push the repetition levels
WriteArrayState(state, array_size, first_repeat_level, PARQUET_DEFINE_VALID);
} else {
//! Produce a null
WriteArrayState(state, array_size, first_repeat_level, MaxDefine() - 1);
}
vector_index++;
}
state.parent_index += vcount;
auto &array_child = ArrayVector::GetEntry(vector);
// The elements of a single array should not span multiple Parquet pages
// So, we force the entire vector to fit on a single page by setting "vector_can_span_multiple_pages=false"
GetChildWriter().Prepare(*state.child_state, &state_p, array_child, count * array_size, false);
}
void ArrayColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto array_size = ArrayType::GetSize(vector.GetType());
auto &array_child = ArrayVector::GetEntry(vector);
GetChildWriter().Write(*state.child_state, array_child, count * array_size);
}
} // namespace duckdb


@@ -0,0 +1,105 @@
#include "writer/boolean_column_writer.hpp"
namespace duckdb {
class BooleanStatisticsState : public ColumnWriterStatistics {
public:
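// min/max start as the impossible combination (min=true, max=false); HasStats() stays false until at least one value flips one of them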
BooleanStatisticsState() : min(true), max(false) {
}
bool min;
bool max;
public:
bool HasStats() override {
return !(min && !max);
}
string GetMin() override {
return GetMinValue();
}
string GetMax() override {
return GetMaxValue();
}
string GetMinValue() override {
return HasStats() ? string(const_char_ptr_cast(&min), sizeof(bool)) : string();
}
string GetMaxValue() override {
return HasStats() ? string(const_char_ptr_cast(&max), sizeof(bool)) : string();
}
};
class BooleanWriterPageState : public ColumnWriterPageState {
public:
uint8_t byte = 0;
uint8_t byte_pos = 0;
};
BooleanColumnWriter::BooleanColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
vector<string> schema_path_p, bool can_have_nulls)
: PrimitiveColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
}
unique_ptr<ColumnWriterStatistics> BooleanColumnWriter::InitializeStatsState() {
return make_uniq<BooleanStatisticsState>();
}
void BooleanColumnWriter::WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p,
ColumnWriterPageState *state_p, Vector &input_column, idx_t chunk_start,
idx_t chunk_end) {
auto &stats = stats_p->Cast<BooleanStatisticsState>();
auto &state = state_p->Cast<BooleanWriterPageState>();
const auto &mask = FlatVector::Validity(input_column);
const auto *const ptr = FlatVector::GetData<bool>(input_column);
if (stats.max && !stats.min && mask.AllValid()) {
// Fast path: stats are already complete (both true and false seen), and there are no NULLs
for (idx_t r = chunk_start; r < chunk_end; r++) {
const auto &val = ptr[r];
state.byte |= val << state.byte_pos;
if (++state.byte_pos == 8) {
temp_writer.Write(state.byte);
state.byte = 0;
state.byte_pos = 0;
}
}
} else {
for (idx_t r = chunk_start; r < chunk_end; r++) {
if (!mask.RowIsValid(r)) {
continue;
}
const auto &val = ptr[r];
stats.max |= val;
stats.min &= val;
state.byte |= val << state.byte_pos;
if (++state.byte_pos == 8) {
temp_writer.Write(state.byte);
state.byte = 0;
state.byte_pos = 0;
}
}
}
}
unique_ptr<ColumnWriterPageState> BooleanColumnWriter::InitializePageState(PrimitiveColumnWriterState &state,
idx_t page_idx) {
return make_uniq<BooleanWriterPageState>();
}
void BooleanColumnWriter::FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) {
auto &state = state_p->Cast<BooleanWriterPageState>();
if (state.byte_pos > 0) {
temp_writer.Write<uint8_t>(state.byte);
state.byte = 0;
state.byte_pos = 0;
}
}
idx_t BooleanColumnWriter::GetRowSize(const Vector &vector, const idx_t index,
const PrimitiveColumnWriterState &state) const {
return sizeof(bool);
}
} // namespace duckdb


@@ -0,0 +1,100 @@
#include "writer/decimal_column_writer.hpp"
namespace duckdb {
static void WriteParquetDecimal(hugeint_t input, data_ptr_t result) {
bool positive = input >= 0;
// numbers are stored as two's complement so some muckery is required
if (!positive) {
input = NumericLimits<hugeint_t>::Maximum() + input + 1;
}
uint64_t high_bytes = uint64_t(input.upper);
uint64_t low_bytes = input.lower;
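// emit the 16 bytes in big-endian order: upper 64 bits first, then the lower 64 bits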
for (idx_t i = 0; i < sizeof(uint64_t); i++) {
auto shift_count = (sizeof(uint64_t) - i - 1) * 8;
result[i] = (high_bytes >> shift_count) & 0xFF;
}
for (idx_t i = 0; i < sizeof(uint64_t); i++) {
auto shift_count = (sizeof(uint64_t) - i - 1) * 8;
result[sizeof(uint64_t) + i] = (low_bytes >> shift_count) & 0xFF;
}
if (!positive) {
result[0] |= 0x80;
}
}
class FixedDecimalStatistics : public ColumnWriterStatistics {
public:
FixedDecimalStatistics() : min(NumericLimits<hugeint_t>::Maximum()), max(NumericLimits<hugeint_t>::Minimum()) {
}
hugeint_t min;
hugeint_t max;
public:
string GetStats(hugeint_t &input) {
data_t buffer[16];
WriteParquetDecimal(input, buffer);
return string(const_char_ptr_cast(buffer), 16);
}
bool HasStats() override {
return min <= max;
}
void Update(hugeint_t &val) {
if (LessThan::Operation(val, min)) {
min = val;
}
if (GreaterThan::Operation(val, max)) {
max = val;
}
}
string GetMin() override {
return GetMinValue();
}
string GetMax() override {
return GetMaxValue();
}
string GetMinValue() override {
return HasStats() ? GetStats(min) : string();
}
string GetMaxValue() override {
return HasStats() ? GetStats(max) : string();
}
};
FixedDecimalColumnWriter::FixedDecimalColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
vector<string> schema_path_p, bool can_have_nulls)
: PrimitiveColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
}
unique_ptr<ColumnWriterStatistics> FixedDecimalColumnWriter::InitializeStatsState() {
return make_uniq<FixedDecimalStatistics>();
}
void FixedDecimalColumnWriter::WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p,
ColumnWriterPageState *page_state, Vector &input_column, idx_t chunk_start,
idx_t chunk_end) {
auto &mask = FlatVector::Validity(input_column);
auto *ptr = FlatVector::GetData<hugeint_t>(input_column);
auto &stats = stats_p->Cast<FixedDecimalStatistics>();
data_t temp_buffer[16];
for (idx_t r = chunk_start; r < chunk_end; r++) {
if (mask.RowIsValid(r)) {
stats.Update(ptr[r]);
WriteParquetDecimal(ptr[r], temp_buffer);
temp_writer.WriteData(temp_buffer, 16);
}
}
}
idx_t FixedDecimalColumnWriter::GetRowSize(const Vector &vector, const idx_t index,
const PrimitiveColumnWriterState &state) const {
return sizeof(hugeint_t);
}
} // namespace duckdb


@@ -0,0 +1,119 @@
#include "writer/enum_column_writer.hpp"
#include "parquet_rle_bp_decoder.hpp"
#include "parquet_rle_bp_encoder.hpp"
#include "parquet_writer.hpp"
#include "duckdb/common/serializer/memory_stream.hpp"
namespace duckdb {
using duckdb_parquet::Encoding;
class EnumWriterPageState : public ColumnWriterPageState {
public:
explicit EnumWriterPageState(uint32_t bit_width) : encoder(bit_width), written_value(false) {
}
RleBpEncoder encoder;
bool written_value;
};
EnumColumnWriter::EnumColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
vector<string> schema_path_p, bool can_have_nulls)
: PrimitiveColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
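// dictionary indices only need enough bits to address every enum value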
bit_width = RleBpDecoder::ComputeBitWidth(EnumType::GetSize(Type()));
}
unique_ptr<ColumnWriterStatistics> EnumColumnWriter::InitializeStatsState() {
return make_uniq<StringStatisticsState>();
}
template <class T>
void EnumColumnWriter::WriteEnumInternal(WriteStream &temp_writer, Vector &input_column, idx_t chunk_start,
idx_t chunk_end, EnumWriterPageState &page_state) {
auto &mask = FlatVector::Validity(input_column);
auto *ptr = FlatVector::GetData<T>(input_column);
for (idx_t r = chunk_start; r < chunk_end; r++) {
if (mask.RowIsValid(r)) {
if (!page_state.written_value) {
// first value: write the bit-width as a one-byte entry and initialize writer
temp_writer.Write<uint8_t>(bit_width);
page_state.encoder.BeginWrite();
page_state.written_value = true;
}
page_state.encoder.WriteValue(temp_writer, ptr[r]);
}
}
}
void EnumColumnWriter::WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p,
ColumnWriterPageState *page_state_p, Vector &input_column, idx_t chunk_start,
idx_t chunk_end) {
auto &page_state = page_state_p->Cast<EnumWriterPageState>();
switch (Type().InternalType()) {
case PhysicalType::UINT8:
WriteEnumInternal<uint8_t>(temp_writer, input_column, chunk_start, chunk_end, page_state);
break;
case PhysicalType::UINT16:
WriteEnumInternal<uint16_t>(temp_writer, input_column, chunk_start, chunk_end, page_state);
break;
case PhysicalType::UINT32:
WriteEnumInternal<uint32_t>(temp_writer, input_column, chunk_start, chunk_end, page_state);
break;
default:
throw InternalException("Unsupported internal enum type");
}
}
unique_ptr<ColumnWriterPageState> EnumColumnWriter::InitializePageState(PrimitiveColumnWriterState &state,
idx_t page_idx) {
return make_uniq<EnumWriterPageState>(bit_width);
}
void EnumColumnWriter::FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) {
auto &page_state = state_p->Cast<EnumWriterPageState>();
if (!page_state.written_value) {
// all values are null
// just write the bit width
temp_writer.Write<uint8_t>(bit_width);
return;
}
page_state.encoder.FinishWrite(temp_writer);
}
duckdb_parquet::Encoding::type EnumColumnWriter::GetEncoding(PrimitiveColumnWriterState &state) {
return Encoding::RLE_DICTIONARY;
}
bool EnumColumnWriter::HasDictionary(PrimitiveColumnWriterState &state) {
return true;
}
idx_t EnumColumnWriter::DictionarySize(PrimitiveColumnWriterState &state_p) {
return EnumType::GetSize(Type());
}
void EnumColumnWriter::FlushDictionary(PrimitiveColumnWriterState &state, ColumnWriterStatistics *stats_p) {
auto &stats = stats_p->Cast<StringStatisticsState>();
// write the enum values to a dictionary page
auto &enum_values = EnumType::GetValuesInsertOrder(Type());
auto enum_count = EnumType::GetSize(Type());
auto string_values = FlatVector::GetData<string_t>(enum_values);
// first write the contents of the dictionary page to a temporary buffer
auto temp_writer = make_uniq<MemoryStream>(BufferAllocator::Get(writer.GetContext()));
for (idx_t r = 0; r < enum_count; r++) {
D_ASSERT(!FlatVector::IsNull(enum_values, r));
// update the statistics
stats.Update(string_values[r]);
// write this string value to the dictionary
temp_writer->Write<uint32_t>(string_values[r].GetSize());
temp_writer->WriteData(const_data_ptr_cast(string_values[r].GetData()), string_values[r].GetSize());
}
// flush the dictionary page and add it to the to-be-written pages
WriteDictionary(state, std::move(temp_writer), enum_count);
}
idx_t EnumColumnWriter::GetRowSize(const Vector &vector, const idx_t index,
const PrimitiveColumnWriterState &state) const {
return (bit_width + 7) / 8;
}
} // namespace duckdb


@@ -0,0 +1,144 @@
#include "writer/list_column_writer.hpp"
namespace duckdb {
unique_ptr<ColumnWriterState> ListColumnWriter::InitializeWriteState(duckdb_parquet::RowGroup &row_group) {
auto result = make_uniq<ListColumnWriterState>(row_group, row_group.columns.size());
result->child_state = GetChildWriter().InitializeWriteState(row_group);
return std::move(result);
}
bool ListColumnWriter::HasAnalyze() {
return GetChildWriter().HasAnalyze();
}
void ListColumnWriter::Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto &list_child = ListVector::GetEntry(vector);
auto list_count = ListVector::GetListSize(vector);
GetChildWriter().Analyze(*state.child_state, &state_p, list_child, list_count);
}
void ListColumnWriter::FinalizeAnalyze(ColumnWriterState &state_p) {
auto &state = state_p.Cast<ListColumnWriterState>();
GetChildWriter().FinalizeAnalyze(*state.child_state);
}
static idx_t GetConsecutiveChildList(Vector &list, Vector &result, idx_t offset, idx_t count) {
// returns a consecutive child list that fully flattens and repeats all required elements
auto &validity = FlatVector::Validity(list);
auto list_entries = FlatVector::GetData<list_entry_t>(list);
bool is_consecutive = true;
idx_t total_length = 0;
for (idx_t c = offset; c < offset + count; c++) {
if (!validity.RowIsValid(c)) {
continue;
}
if (list_entries[c].offset != total_length) {
is_consecutive = false;
}
total_length += list_entries[c].length;
}
if (is_consecutive) {
// already consecutive - leave it as-is
return total_length;
}
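// not consecutive: gather all referenced child entries into one contiguous, in-order selection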
SelectionVector sel(total_length);
idx_t index = 0;
for (idx_t c = offset; c < offset + count; c++) {
if (!validity.RowIsValid(c)) {
continue;
}
for (idx_t k = 0; k < list_entries[c].length; k++) {
sel.set_index(index++, list_entries[c].offset + k);
}
}
result.Slice(sel, total_length);
result.Flatten(total_length);
return total_length;
}
void ListColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto list_data = FlatVector::GetData<list_entry_t>(vector);
auto &validity = FlatVector::Validity(vector);
// write definition levels and repeats
idx_t start = 0;
idx_t vcount = parent ? parent->definition_levels.size() - state.parent_index : count;
idx_t vector_index = 0;
for (idx_t i = start; i < vcount; i++) {
idx_t parent_index = state.parent_index + i;
if (parent && !parent->is_empty.empty() && parent->is_empty[parent_index]) {
state.definition_levels.push_back(parent->definition_levels[parent_index]);
state.repetition_levels.push_back(parent->repetition_levels[parent_index]);
state.is_empty.push_back(true);
continue;
}
auto first_repeat_level =
parent && !parent->repetition_levels.empty() ? parent->repetition_levels[parent_index] : MaxRepeat();
if (parent && parent->definition_levels[parent_index] != PARQUET_DEFINE_VALID) {
state.definition_levels.push_back(parent->definition_levels[parent_index]);
state.repetition_levels.push_back(first_repeat_level);
state.is_empty.push_back(true);
} else if (validity.RowIsValid(vector_index)) {
// push the repetition levels
if (list_data[vector_index].length == 0) {
state.definition_levels.push_back(MaxDefine());
state.is_empty.push_back(true);
} else {
state.definition_levels.push_back(PARQUET_DEFINE_VALID);
state.is_empty.push_back(false);
}
state.repetition_levels.push_back(first_repeat_level);
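// remaining list elements repeat at MaxRepeat() + 1 so the reader knows they belong to the same list entry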
for (idx_t k = 1; k < list_data[vector_index].length; k++) {
state.repetition_levels.push_back(MaxRepeat() + 1);
state.definition_levels.push_back(PARQUET_DEFINE_VALID);
state.is_empty.push_back(false);
}
} else {
if (!can_have_nulls) {
throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
}
state.definition_levels.push_back(MaxDefine() - 1);
state.repetition_levels.push_back(first_repeat_level);
state.is_empty.push_back(true);
}
vector_index++;
}
state.parent_index += vcount;
auto &list_child = ListVector::GetEntry(vector);
Vector child_list(list_child);
auto child_length = GetConsecutiveChildList(vector, child_list, 0, count);
// The elements of a single list should not span multiple Parquet pages
// So, we force the entire vector to fit on a single page by setting "vector_can_span_multiple_pages=false"
GetChildWriter().Prepare(*state.child_state, &state_p, child_list, child_length, false);
}
void ListColumnWriter::BeginWrite(ColumnWriterState &state_p) {
auto &state = state_p.Cast<ListColumnWriterState>();
GetChildWriter().BeginWrite(*state.child_state);
}
void ListColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto &list_child = ListVector::GetEntry(vector);
Vector child_list(list_child);
auto child_length = GetConsecutiveChildList(vector, child_list, 0, count);
GetChildWriter().Write(*state.child_state, child_list, child_length);
}
void ListColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
auto &state = state_p.Cast<ListColumnWriterState>();
GetChildWriter().FinalizeWrite(*state.child_state);
}
ColumnWriter &ListColumnWriter::GetChildWriter() {
D_ASSERT(child_writers.size() == 1);
return *child_writers[0];
}
} // namespace duckdb


@@ -0,0 +1,435 @@
#include "writer/primitive_column_writer.hpp"
#include "parquet_rle_bp_decoder.hpp"
#include "parquet_rle_bp_encoder.hpp"
#include "parquet_writer.hpp"
namespace duckdb {
using duckdb_parquet::Encoding;
using duckdb_parquet::PageType;
constexpr const idx_t PrimitiveColumnWriter::MAX_UNCOMPRESSED_PAGE_SIZE;
constexpr const idx_t PrimitiveColumnWriter::MAX_UNCOMPRESSED_DICT_PAGE_SIZE;
PrimitiveColumnWriter::PrimitiveColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
vector<string> schema_path, bool can_have_nulls)
: ColumnWriter(writer, column_schema, std::move(schema_path), can_have_nulls) {
}
unique_ptr<ColumnWriterState> PrimitiveColumnWriter::InitializeWriteState(duckdb_parquet::RowGroup &row_group) {
auto result = make_uniq<PrimitiveColumnWriterState>(writer, row_group, row_group.columns.size());
RegisterToRowGroup(row_group);
return std::move(result);
}
void PrimitiveColumnWriter::RegisterToRowGroup(duckdb_parquet::RowGroup &row_group) {
duckdb_parquet::ColumnChunk column_chunk;
column_chunk.__isset.meta_data = true;
column_chunk.meta_data.codec = writer.GetCodec();
column_chunk.meta_data.path_in_schema = schema_path;
column_chunk.meta_data.num_values = 0;
column_chunk.meta_data.type = writer.GetType(SchemaIndex());
row_group.columns.push_back(std::move(column_chunk));
}
unique_ptr<ColumnWriterPageState> PrimitiveColumnWriter::InitializePageState(PrimitiveColumnWriterState &state,
idx_t page_idx) {
return nullptr;
}
void PrimitiveColumnWriter::FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state) {
}
void PrimitiveColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) {
auto &state = state_p.Cast<PrimitiveColumnWriterState>();
auto &col_chunk = state.row_group.columns[state.col_idx];
idx_t vcount = parent ? parent->definition_levels.size() - state.definition_levels.size() : count;
idx_t parent_index = state.definition_levels.size();
auto &validity = FlatVector::Validity(vector);
HandleRepeatLevels(state, parent, count);
HandleDefineLevels(state, parent, validity, count, MaxDefine(), MaxDefine() - 1);
idx_t vector_index = 0;
reference<PageInformation> page_info_ref = state.page_info.back();
col_chunk.meta_data.num_values += NumericCast<int64_t>(vcount);
const bool check_parent_empty = parent && !parent->is_empty.empty();
if (!check_parent_empty && validity.AllValid() && TypeIsConstantSize(vector.GetType().InternalType()) &&
page_info_ref.get().estimated_page_size + GetRowSize(vector, vector_index, state) * vcount <
MAX_UNCOMPRESSED_PAGE_SIZE) {
// Fast path: fixed-size type, all valid, and it fits on the current page
auto &page_info = page_info_ref.get();
page_info.row_count += vcount;
page_info.estimated_page_size += GetRowSize(vector, vector_index, state) * vcount;
} else {
for (idx_t i = 0; i < vcount; i++) {
auto &page_info = page_info_ref.get();
page_info.row_count++;
if (check_parent_empty && parent->is_empty[parent_index + i]) {
page_info.empty_count++;
continue;
}
if (validity.RowIsValid(vector_index)) {
page_info.estimated_page_size += GetRowSize(vector, vector_index, state);
if (page_info.estimated_page_size >= MAX_UNCOMPRESSED_PAGE_SIZE) {
if (!vector_can_span_multiple_pages && i != 0) {
// Vector is not allowed to span multiple pages, and we already started writing it
continue;
}
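// current page is full: start a new page at the next row offset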
PageInformation new_info;
new_info.offset = page_info.offset + page_info.row_count;
state.page_info.push_back(new_info);
page_info_ref = state.page_info.back();
}
} else {
page_info.null_count++;
}
vector_index++;
}
}
}
duckdb_parquet::Encoding::type PrimitiveColumnWriter::GetEncoding(PrimitiveColumnWriterState &state) {
return Encoding::PLAIN;
}
void PrimitiveColumnWriter::BeginWrite(ColumnWriterState &state_p) {
auto &state = state_p.Cast<PrimitiveColumnWriterState>();
// set up the page write info
state.stats_state = InitializeStatsState();
for (idx_t page_idx = 0; page_idx < state.page_info.size(); page_idx++) {
auto &page_info = state.page_info[page_idx];
if (page_info.row_count == 0) {
D_ASSERT(page_idx + 1 == state.page_info.size());
state.page_info.erase_at(page_idx);
break;
}
PageWriteInformation write_info;
// set up the header
auto &hdr = write_info.page_header;
hdr.compressed_page_size = 0;
hdr.uncompressed_page_size = 0;
hdr.type = PageType::DATA_PAGE;
hdr.__isset.data_page_header = true;
hdr.data_page_header.num_values = NumericCast<int32_t>(page_info.row_count);
hdr.data_page_header.encoding = GetEncoding(state);
hdr.data_page_header.definition_level_encoding = Encoding::RLE;
hdr.data_page_header.repetition_level_encoding = Encoding::RLE;
write_info.temp_writer = make_uniq<MemoryStream>(
BufferAllocator::Get(writer.GetContext()),
MaxValue<idx_t>(NextPowerOfTwo(page_info.estimated_page_size), MemoryStream::DEFAULT_INITIAL_CAPACITY));
write_info.write_count = page_info.empty_count;
write_info.max_write_count = page_info.row_count;
write_info.page_state = InitializePageState(state, page_idx);
write_info.compressed_size = 0;
write_info.compressed_data = nullptr;
state.write_info.push_back(std::move(write_info));
}
// start writing the first page
NextPage(state);
}
void PrimitiveColumnWriter::WriteLevels(Allocator &allocator, WriteStream &temp_writer,
const unsafe_vector<uint16_t> &levels, idx_t max_value, idx_t offset,
idx_t count, optional_idx null_count) {
if (levels.empty() || count == 0) {
return;
}
// write the levels using the RLE-BP encoding
const auto bit_width = RleBpDecoder::ComputeBitWidth(max_value);
RleBpEncoder rle_encoder(bit_width);
// have to write to an intermediate stream first because we need to know the size
MemoryStream intermediate_stream(allocator);
rle_encoder.BeginWrite();
if (null_count.IsValid() && null_count.GetIndex() == 0) {
// Fast path: no nulls
rle_encoder.WriteMany(intermediate_stream, levels[0], count);
} else {
for (idx_t i = offset; i < offset + count; i++) {
rle_encoder.WriteValue(intermediate_stream, levels[i]);
}
}
rle_encoder.FinishWrite(intermediate_stream);
// start off by writing the byte count as a uint32_t
temp_writer.Write(NumericCast<uint32_t>(intermediate_stream.GetPosition()));
// copy over the written data
temp_writer.WriteData(intermediate_stream.GetData(), intermediate_stream.GetPosition());
}
void PrimitiveColumnWriter::NextPage(PrimitiveColumnWriterState &state) {
if (state.current_page > 0) {
// need to flush the current page
FlushPage(state);
}
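// no pages left: park current_page past the end so later FlushPage calls become no-ops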
if (state.current_page >= state.write_info.size()) {
state.current_page = state.write_info.size() + 1;
return;
}
auto &page_info = state.page_info[state.current_page];
auto &write_info = state.write_info[state.current_page];
state.current_page++;
auto &temp_writer = *write_info.temp_writer;
// write the repetition levels
auto &allocator = BufferAllocator::Get(writer.GetContext());
WriteLevels(allocator, temp_writer, state.repetition_levels, MaxRepeat(), page_info.offset, page_info.row_count);
// write the definition levels
WriteLevels(allocator, temp_writer, state.definition_levels, MaxDefine(), page_info.offset, page_info.row_count,
state.null_count + state.parent_null_count);
}
void PrimitiveColumnWriter::FlushPage(PrimitiveColumnWriterState &state) {
D_ASSERT(state.current_page > 0);
if (state.current_page > state.write_info.size()) {
return;
}
// compress the page info
auto &write_info = state.write_info[state.current_page - 1];
auto &temp_writer = *write_info.temp_writer;
auto &hdr = write_info.page_header;
FlushPageState(temp_writer, write_info.page_state.get());
// now that we have finished writing the data we know the uncompressed size
if (temp_writer.GetPosition() > idx_t(NumericLimits<int32_t>::Maximum())) {
throw InternalException("Parquet writer: %d uncompressed page size out of range for type integer",
temp_writer.GetPosition());
}
hdr.uncompressed_page_size = UnsafeNumericCast<int32_t>(temp_writer.GetPosition());
// compress the data
CompressPage(temp_writer, write_info.compressed_size, write_info.compressed_data, write_info.compressed_buf);
hdr.compressed_page_size = UnsafeNumericCast<int32_t>(write_info.compressed_size);
D_ASSERT(hdr.uncompressed_page_size > 0);
D_ASSERT(hdr.compressed_page_size > 0);
if (write_info.compressed_buf) {
// if the data has been compressed, we no longer need the uncompressed data
D_ASSERT(write_info.compressed_buf.get() == write_info.compressed_data);
write_info.temp_writer.reset();
}
}
unique_ptr<ColumnWriterStatistics> PrimitiveColumnWriter::InitializeStatsState() {
return make_uniq<ColumnWriterStatistics>();
}
idx_t PrimitiveColumnWriter::GetRowSize(const Vector &vector, const idx_t index,
const PrimitiveColumnWriterState &state) const {
throw InternalException("GetRowSize unsupported for struct/list column writers");
}
void PrimitiveColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
auto &state = state_p.Cast<PrimitiveColumnWriterState>();
idx_t remaining = count;
idx_t offset = 0;
while (remaining > 0) {
auto &write_info = state.write_info[state.current_page - 1];
if (!write_info.temp_writer) {
throw InternalException("Writes are not correctly aligned!?");
}
auto &temp_writer = *write_info.temp_writer;
idx_t write_count = MinValue<idx_t>(remaining, write_info.max_write_count - write_info.write_count);
D_ASSERT(write_count > 0);
WriteVector(temp_writer, state.stats_state.get(), write_info.page_state.get(), vector, offset,
offset + write_count);
write_info.write_count += write_count;
if (write_info.write_count == write_info.max_write_count) {
NextPage(state);
}
offset += write_count;
remaining -= write_count;
}
}
void PrimitiveColumnWriter::SetParquetStatistics(PrimitiveColumnWriterState &state,
duckdb_parquet::ColumnChunk &column_chunk) {
if (!state.stats_state) {
return;
}
if (MaxRepeat() == 0) {
column_chunk.meta_data.statistics.null_count = NumericCast<int64_t>(state.null_count);
column_chunk.meta_data.statistics.__isset.null_count = true;
column_chunk.meta_data.__isset.statistics = true;
}
// if we have NaN values - don't write the min/max here
if (!state.stats_state->HasNaN()) {
// set min/max/min_value/max_value
// this code is not going to win any beauty contests, but well
auto min = state.stats_state->GetMin();
if (!min.empty()) {
column_chunk.meta_data.statistics.min = std::move(min);
column_chunk.meta_data.statistics.__isset.min = true;
column_chunk.meta_data.__isset.statistics = true;
}
auto max = state.stats_state->GetMax();
if (!max.empty()) {
column_chunk.meta_data.statistics.max = std::move(max);
column_chunk.meta_data.statistics.__isset.max = true;
column_chunk.meta_data.__isset.statistics = true;
}
if (state.stats_state->HasStats()) {
column_chunk.meta_data.statistics.min_value = state.stats_state->GetMinValue();
column_chunk.meta_data.statistics.__isset.min_value = true;
column_chunk.meta_data.__isset.statistics = true;
column_chunk.meta_data.statistics.is_min_value_exact = state.stats_state->MinIsExact();
column_chunk.meta_data.statistics.__isset.is_min_value_exact = true;
column_chunk.meta_data.statistics.max_value = state.stats_state->GetMaxValue();
column_chunk.meta_data.statistics.__isset.max_value = true;
column_chunk.meta_data.__isset.statistics = true;
column_chunk.meta_data.statistics.is_max_value_exact = state.stats_state->MaxIsExact();
column_chunk.meta_data.statistics.__isset.is_max_value_exact = true;
}
}
if (HasDictionary(state)) {
column_chunk.meta_data.statistics.distinct_count = UnsafeNumericCast<int64_t>(DictionarySize(state));
column_chunk.meta_data.statistics.__isset.distinct_count = true;
column_chunk.meta_data.__isset.statistics = true;
}
if (state.stats_state->HasGeoStats()) {
auto gpq_version = writer.GetGeoParquetVersion();
const auto has_real_stats = gpq_version == GeoParquetVersion::NONE || gpq_version == GeoParquetVersion::BOTH ||
gpq_version == GeoParquetVersion::V2;
const auto has_json_stats = gpq_version == GeoParquetVersion::V1 || gpq_version == GeoParquetVersion::BOTH ||
gpq_version == GeoParquetVersion::V2;
if (has_real_stats) {
// Write the parquet native geospatial statistics
column_chunk.meta_data.__isset.geospatial_statistics = true;
state.stats_state->WriteGeoStats(column_chunk.meta_data.geospatial_statistics);
}
if (has_json_stats) {
// Add the geospatial statistics to the extra GeoParquet metadata
writer.GetGeoParquetData().AddGeoParquetStats(column_schema.name, column_schema.type,
*state.stats_state->GetGeoStats());
}
}
for (const auto &write_info : state.write_info) {
// only care about data page encodings, data_page_header.encoding is meaningless for dict
if (write_info.page_header.type != PageType::DATA_PAGE &&
write_info.page_header.type != PageType::DATA_PAGE_V2) {
continue;
}
column_chunk.meta_data.encodings.push_back(write_info.page_header.data_page_header.encoding);
}
}
void PrimitiveColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
auto &state = state_p.Cast<PrimitiveColumnWriterState>();
auto &column_chunk = state.row_group.columns[state.col_idx];
// flush the last page (if any remains)
FlushPage(state);
auto &column_writer = writer.GetWriter();
auto start_offset = column_writer.GetTotalWritten();
// flush the dictionary
if (HasDictionary(state)) {
column_chunk.meta_data.statistics.distinct_count = UnsafeNumericCast<int64_t>(DictionarySize(state));
column_chunk.meta_data.statistics.__isset.distinct_count = true;
column_chunk.meta_data.dictionary_page_offset = UnsafeNumericCast<int64_t>(column_writer.GetTotalWritten());
column_chunk.meta_data.__isset.dictionary_page_offset = true;
FlushDictionary(state, state.stats_state.get());
}
// record the start position of the pages for this column
column_chunk.meta_data.data_page_offset = 0;
SetParquetStatistics(state, column_chunk);
// write the individual pages to disk
idx_t total_uncompressed_size = 0;
for (auto &write_info : state.write_info) {
// set the data page offset whenever we see the *first* data page
if (column_chunk.meta_data.data_page_offset == 0 && (write_info.page_header.type == PageType::DATA_PAGE ||
write_info.page_header.type == PageType::DATA_PAGE_V2)) {
column_chunk.meta_data.data_page_offset = UnsafeNumericCast<int64_t>(column_writer.GetTotalWritten());
}
D_ASSERT(write_info.page_header.uncompressed_page_size > 0);
auto header_start_offset = column_writer.GetTotalWritten();
writer.Write(write_info.page_header);
// total uncompressed size in the column chunk includes the header size (!)
total_uncompressed_size += column_writer.GetTotalWritten() - header_start_offset;
total_uncompressed_size += write_info.page_header.uncompressed_page_size;
writer.WriteData(write_info.compressed_data, write_info.compressed_size);
}
column_chunk.meta_data.total_compressed_size =
UnsafeNumericCast<int64_t>(column_writer.GetTotalWritten() - start_offset);
column_chunk.meta_data.total_uncompressed_size = UnsafeNumericCast<int64_t>(total_uncompressed_size);
state.row_group.total_byte_size += column_chunk.meta_data.total_uncompressed_size;
if (state.bloom_filter) {
writer.BufferBloomFilter(state.col_idx, std::move(state.bloom_filter));
}
// finalize the stats
writer.FlushColumnStats(state.col_idx, column_chunk, state.stats_state.get());
}
void PrimitiveColumnWriter::FlushDictionary(PrimitiveColumnWriterState &state, ColumnWriterStatistics *stats) {
throw InternalException("This page does not have a dictionary");
}
idx_t PrimitiveColumnWriter::DictionarySize(PrimitiveColumnWriterState &state) {
throw InternalException("This page does not have a dictionary");
}
void PrimitiveColumnWriter::WriteDictionary(PrimitiveColumnWriterState &state, unique_ptr<MemoryStream> temp_writer,
idx_t row_count) {
D_ASSERT(temp_writer);
D_ASSERT(temp_writer->GetPosition() > 0);
// write the dictionary page header
PageWriteInformation write_info;
// set up the header
auto &hdr = write_info.page_header;
hdr.uncompressed_page_size = UnsafeNumericCast<int32_t>(temp_writer->GetPosition());
hdr.type = PageType::DICTIONARY_PAGE;
hdr.__isset.dictionary_page_header = true;
hdr.dictionary_page_header.encoding = Encoding::PLAIN;
hdr.dictionary_page_header.is_sorted = false;
hdr.dictionary_page_header.num_values = UnsafeNumericCast<int32_t>(row_count);
write_info.temp_writer = std::move(temp_writer);
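// the dictionary page carries no row values, so its write counters stay at zero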
write_info.write_count = 0;
write_info.max_write_count = 0;
// compress the contents of the dictionary page
CompressPage(*write_info.temp_writer, write_info.compressed_size, write_info.compressed_data,
write_info.compressed_buf);
hdr.compressed_page_size = UnsafeNumericCast<int32_t>(write_info.compressed_size);
if (write_info.compressed_buf) {
// if the data has been compressed, we no longer need the uncompressed data
D_ASSERT(write_info.compressed_buf.get() == write_info.compressed_data);
write_info.temp_writer.reset();
}
// insert the dictionary page as the first page to write for this column
state.write_info.insert(state.write_info.begin(), std::move(write_info));
}
} // namespace duckdb


@@ -0,0 +1,103 @@
#include "writer/struct_column_writer.hpp"
namespace duckdb {
class StructColumnWriterState : public ColumnWriterState {
public:
StructColumnWriterState(duckdb_parquet::RowGroup &row_group, idx_t col_idx)
: row_group(row_group), col_idx(col_idx) {
}
~StructColumnWriterState() override = default;
duckdb_parquet::RowGroup &row_group;
idx_t col_idx;
vector<unique_ptr<ColumnWriterState>> child_states;
};
unique_ptr<ColumnWriterState> StructColumnWriter::InitializeWriteState(duckdb_parquet::RowGroup &row_group) {
auto result = make_uniq<StructColumnWriterState>(row_group, row_group.columns.size());
result->child_states.reserve(child_writers.size());
for (auto &child_writer : child_writers) {
result->child_states.push_back(child_writer->InitializeWriteState(row_group));
}
return std::move(result);
}
bool StructColumnWriter::HasAnalyze() {
for (auto &child_writer : child_writers) {
if (child_writer->HasAnalyze()) {
return true;
}
}
return false;
}
void StructColumnWriter::Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
auto &state = state_p.Cast<StructColumnWriterState>();
auto &child_vectors = StructVector::GetEntries(vector);
for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
// Need to check again. It might be that just one child needs it but the rest not
if (child_writers[child_idx]->HasAnalyze()) {
child_writers[child_idx]->Analyze(*state.child_states[child_idx], &state_p, *child_vectors[child_idx],
count);
}
}
}
void StructColumnWriter::FinalizeAnalyze(ColumnWriterState &state_p) {
auto &state = state_p.Cast<StructColumnWriterState>();
for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
// Need to check again. It might be that just one child needs it but the rest not
if (child_writers[child_idx]->HasAnalyze()) {
child_writers[child_idx]->FinalizeAnalyze(*state.child_states[child_idx]);
}
}
}
void StructColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) {
auto &state = state_p.Cast<StructColumnWriterState>();
auto &validity = FlatVector::Validity(vector);
if (parent) {
// propagate empty entries from the parent
if (state.is_empty.size() < parent->is_empty.size()) {
state.is_empty.insert(state.is_empty.end(), parent->is_empty.begin() + state.is_empty.size(),
parent->is_empty.end());
}
}
HandleRepeatLevels(state_p, parent, count);
HandleDefineLevels(state_p, parent, validity, count, PARQUET_DEFINE_VALID, MaxDefine() - 1);
auto &child_vectors = StructVector::GetEntries(vector);
for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
child_writers[child_idx]->Prepare(*state.child_states[child_idx], &state_p, *child_vectors[child_idx], count,
vector_can_span_multiple_pages);
}
}
void StructColumnWriter::BeginWrite(ColumnWriterState &state_p) {
auto &state = state_p.Cast<StructColumnWriterState>();
for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
child_writers[child_idx]->BeginWrite(*state.child_states[child_idx]);
}
}
void StructColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
auto &state = state_p.Cast<StructColumnWriterState>();
auto &child_vectors = StructVector::GetEntries(vector);
for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
child_writers[child_idx]->Write(*state.child_states[child_idx], *child_vectors[child_idx], count);
}
}
void StructColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
auto &state = state_p.Cast<StructColumnWriterState>();
for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
// we add the null count of the struct to the null count of the children
state.child_states[child_idx]->null_count += state_p.null_count;
child_writers[child_idx]->FinalizeWrite(*state.child_states[child_idx]);
}
}
} // namespace duckdb


@@ -0,0 +1,5 @@
add_library_unity(duckdb_parquet_writer_variant OBJECT convert_variant.cpp)
set(PARQUET_EXTENSION_FILES
${PARQUET_EXTENSION_FILES} $<TARGET_OBJECTS:duckdb_parquet_writer_variant>
PARENT_SCOPE)

File diff suppressed because it is too large.