email-tracker/external/duckdb/extension/parquet/writer/variant/convert_variant.cpp

#include "writer/variant_column_writer.hpp"
#include "duckdb/common/types/variant.hpp"
#include "duckdb/planner/expression/bound_function_expression.hpp"
#include "duckdb/function/scalar/variant_utils.hpp"
#include "reader/variant/variant_binary_decoder.hpp"
#include "parquet_shredding.hpp"
#include "duckdb/common/types/decimal.hpp"
#include "duckdb/common/types/uuid.hpp"

namespace duckdb {

//! Returns the minimum number of bytes needed to store 'value' as an unsigned integer.
//! The result is always at least 1, a value of 0 still occupies one byte.
//! The width is derived arithmetically, so it does not depend on the byte order of the host.
static idx_t CalculateByteLength(idx_t value) {
	idx_t byte_length = 1;
	//! Drop the least significant byte, then count one extra byte for every step it takes to reach zero
	for (idx_t remaining = value >> 8; remaining != 0; remaining >>= 8) {
		byte_length++;
	}
	return byte_length;
}

//! Builds the header byte that starts a Parquet VARIANT 'metadata' blob.
//! 'byte_length' is the width (in bytes) of the fields that follow the header, it is expected to be at most 4.
static uint8_t EncodeMetadataHeader(idx_t byte_length) {
	D_ASSERT(byte_length <= 4);

	//! 'version' is 1, stored in the lowest bits
	const uint8_t version_bits = 1;
	//! 'sorted_strings' is always set, stored in bit 4
	const uint8_t sorted_strings_bit = static_cast<uint8_t>(1) << 4;
	//! 'offset_size_minus_one' is byte_length-1, stored from bit 6 upwards
	const int offset_size_bits = (static_cast<uint8_t>(byte_length) - 1) << 6;

	const auto encoded_header = static_cast<uint8_t>(version_bits | sorted_strings_bit | offset_size_bits);

#ifdef DEBUG
	//! Round-trip the byte through the reader to verify the encoding
	auto round_trip = VariantMetadataHeader::FromHeaderByte(encoded_header);
	D_ASSERT(round_trip.offset_size == byte_length);
#endif

	return encoded_header;
}

//! Writes the Parquet VARIANT 'metadata' blob for the first 'count' rows of 'variant' into 'metadata'.
//! Layout written per row:
//!   header byte | number of keys | (number of keys + 1) offsets | the bytes of all keys, back to back
//! The key count and every offset are 'byte_length' bytes wide.
//! Rows that are not valid get a blob describing an empty dictionary.
//! Throws an InvalidInputException if the dictionary can't be described with 4 byte (uint32_t) fields.
static void CreateMetadata(UnifiedVariantVectorData &variant, Vector &metadata, idx_t count) {
	auto &keys = variant.keys;
	auto keys_data = variant.keys_data;

	//! NOTE: the parquet variant is limited to a max dictionary size of NumericLimits<uint32_t>::Maximum()
	//! Whereas we can have NumericLimits<uint32_t>::Maximum() *per* string in DuckDB
	auto metadata_data = FlatVector::GetData<string_t>(metadata);
	for (idx_t row = 0; row < count; row++) {
		//! The amount of keys in the dictionary of this row
		uint64_t dictionary_count = 0;
		if (variant.RowIsValid(row)) {
			auto list_entry = keys_data[keys.sel->get_index(row)];
			dictionary_count = list_entry.length;
		}
		//! The combined length (in bytes) of all keys of this row
		idx_t dictionary_size = 0;
		for (idx_t i = 0; i < dictionary_count; i++) {
			auto &key = variant.GetKey(row, i);
			dictionary_size += key.GetSize();
		}

		//! The key count and the offsets are written with the same width, so the width has to fit the largest of
		//! them. The last offset equals 'dictionary_size', which makes it the largest offset.
		const auto largest_field = MaxValue<idx_t>(dictionary_size, dictionary_count);
		if (largest_field >= NumericLimits<uint32_t>::Maximum()) {
			throw InvalidInputException("The total length of the dictionary exceeds a 4 byte value (uint32_t), failed "
			                            "to export VARIANT to Parquet");
		}

		auto byte_length = CalculateByteLength(largest_field);
		//! header (1) + key count (1 field) + offsets (dictionary_count + 1 fields) + key bytes
		auto total_length = 1 + (byte_length * (dictionary_count + 2)) + dictionary_size;

		metadata_data[row] = StringVector::EmptyString(metadata, total_length);
		auto &metadata_blob = metadata_data[row];
		auto metadata_blob_data = metadata_blob.GetDataWriteable();

		metadata_blob_data[0] = EncodeMetadataHeader(byte_length);
		//! NOTE: copying the first 'byte_length' bytes of an integer yields its low-order bytes only on a
		//! little-endian host, the same holds for the offsets written below
		memcpy(metadata_blob_data + 1, reinterpret_cast<data_ptr_t>(&dictionary_count), byte_length);

		auto offset_ptr = metadata_blob_data + 1 + byte_length;
		auto string_ptr = metadata_blob_data + 1 + byte_length + ((dictionary_count + 1) * byte_length);
		idx_t total_offset = 0;
		for (idx_t i = 0; i < dictionary_count; i++) {
			//! The offset of a key is the combined length of all keys before it
			memcpy(offset_ptr + (i * byte_length), reinterpret_cast<data_ptr_t>(&total_offset), byte_length);
			auto &key = variant.GetKey(row, i);

			memcpy(string_ptr + total_offset, key.GetData(), key.GetSize());
			total_offset += key.GetSize();
		}
		//! The final offset marks the end of the last key
		memcpy(offset_ptr + (dictionary_count * byte_length), reinterpret_cast<data_ptr_t>(&total_offset), byte_length);
		D_ASSERT(offset_ptr + ((dictionary_count + 1) * byte_length) == string_ptr);
		D_ASSERT(string_ptr + total_offset == metadata_blob_data + total_length);
		metadata_blob.SetSizeAndFinalize(total_length, total_length);

#ifdef DEBUG
		//! Decode the blob again and verify that it produces the same keys
		auto decoded_metadata = VariantMetadata(metadata_blob);
		D_ASSERT(decoded_metadata.strings.size() == dictionary_count);
		for (idx_t i = 0; i < dictionary_count; i++) {
			D_ASSERT(decoded_metadata.strings[i] == variant.GetKey(row, i).GetString());
		}
#endif
	}
}

namespace {

//! Maps the type a field is shredded on to the set of VARIANT types that can be stored in that typed column.
//! ANY maps to the empty set, BOOLEAN is the only type that maps to more than one VARIANT type.
//! Throws a BinderException for types that have no VARIANT counterpart here.
static unordered_set<VariantLogicalType> GetVariantType(const LogicalType &type) {
	unordered_set<VariantLogicalType> matching_types;
	switch (type.id()) {
	case LogicalTypeId::ANY:
		//! Nothing is shredded on, leave the set empty
		break;
	case LogicalTypeId::STRUCT:
		matching_types.insert(VariantLogicalType::OBJECT);
		break;
	case LogicalTypeId::LIST:
		matching_types.insert(VariantLogicalType::ARRAY);
		break;
	case LogicalTypeId::BOOLEAN:
		matching_types.insert(VariantLogicalType::BOOL_TRUE);
		matching_types.insert(VariantLogicalType::BOOL_FALSE);
		break;
	case LogicalTypeId::TINYINT:
		matching_types.insert(VariantLogicalType::INT8);
		break;
	case LogicalTypeId::SMALLINT:
		matching_types.insert(VariantLogicalType::INT16);
		break;
	case LogicalTypeId::INTEGER:
		matching_types.insert(VariantLogicalType::INT32);
		break;
	case LogicalTypeId::BIGINT:
		matching_types.insert(VariantLogicalType::INT64);
		break;
	case LogicalTypeId::FLOAT:
		matching_types.insert(VariantLogicalType::FLOAT);
		break;
	case LogicalTypeId::DOUBLE:
		matching_types.insert(VariantLogicalType::DOUBLE);
		break;
	case LogicalTypeId::DECIMAL:
		matching_types.insert(VariantLogicalType::DECIMAL);
		break;
	case LogicalTypeId::DATE:
		matching_types.insert(VariantLogicalType::DATE);
		break;
	case LogicalTypeId::TIME:
		matching_types.insert(VariantLogicalType::TIME_MICROS);
		break;
	case LogicalTypeId::TIMESTAMP_TZ:
		matching_types.insert(VariantLogicalType::TIMESTAMP_MICROS_TZ);
		break;
	case LogicalTypeId::TIMESTAMP:
		matching_types.insert(VariantLogicalType::TIMESTAMP_MICROS);
		break;
	case LogicalTypeId::TIMESTAMP_NS:
		matching_types.insert(VariantLogicalType::TIMESTAMP_NANOS);
		break;
	case LogicalTypeId::BLOB:
		matching_types.insert(VariantLogicalType::BLOB);
		break;
	case LogicalTypeId::VARCHAR:
		matching_types.insert(VariantLogicalType::VARCHAR);
		break;
	case LogicalTypeId::UUID:
		matching_types.insert(VariantLogicalType::UUID);
		break;
	default:
		throw BinderException("Type '%s' can't be translated to a VARIANT type", type.ToString());
	}
	return matching_types;
}

struct ShreddingState {
public:
	explicit ShreddingState(const LogicalType &type, idx_t total_count)
	    : type(type), shredded_sel(total_count), values_index_sel(total_count), result_sel(total_count) {
		variant_types = GetVariantType(type);
	}

public:
	bool ValueIsShredded(UnifiedVariantVectorData &variant, idx_t row, idx_t values_index) {
		auto type_id = variant.GetTypeId(row, values_index);
		if (!variant_types.count(type_id)) {
			return false;
		}
		if (type_id == VariantLogicalType::DECIMAL) {
			auto physical_type = type.InternalType();
			auto decimal_data = VariantUtils::DecodeDecimalData(variant, row, values_index);
			auto decimal_physical_type = decimal_data.GetPhysicalType();
			return physical_type == decimal_physical_type;
		}
		return true;
	}
	void SetShredded(idx_t row, idx_t values_index, idx_t result_idx) {
		shredded_sel[count] = row;
		values_index_sel[count] = values_index;
		result_sel[count] = result_idx;
		count++;
	}
	case_insensitive_string_set_t ObjectFields() {
		D_ASSERT(type.id() == LogicalTypeId::STRUCT);
		case_insensitive_string_set_t res;
		auto &child_types = StructType::GetChildTypes(type);
		for (auto &entry : child_types) {
			auto &type = entry.first;
			res.emplace(string_t(type.c_str(), type.size()));
		}
		return res;
	}

public:
	//! The type the field is shredded on
	const LogicalType &type;
	unordered_set<VariantLogicalType> variant_types;
	//! row that is shredded
	SelectionVector shredded_sel;
	//! 'values_index' of the shredded value
	SelectionVector values_index_sel;
	//! result row of the shredded value
	SelectionVector result_sel;
	//! The amount of rows that are shredded on
	idx_t count = 0;
};

} // namespace

//! Returns the indices (relative to 'nested_data.children_idx') of the children that have to be written to the
//! 'value' blob, in their original order.
//! Without a shredding state, or when the shredded type is not a STRUCT, that is every child.
//! Otherwise the children whose key matches a field of the shredded STRUCT are left out.
vector<idx_t> GetChildIndices(const UnifiedVariantVectorData &variant, idx_t row, const VariantNestedData &nested_data,
                              optional_ptr<ShreddingState> shredding_state) {
	vector<idx_t> child_indices;
	//! 'child_count' is an upper bound for the result, allocate once instead of growing per push_back
	child_indices.reserve(nested_data.child_count);

	if (!shredding_state || shredding_state->type.id() != LogicalTypeId::STRUCT) {
		for (idx_t i = 0; i < nested_data.child_count; i++) {
			child_indices.push_back(i);
		}
		return child_indices;
	}
	//! FIXME: The variant spec says that field names should be case-sensitive, not insensitive
	case_insensitive_string_set_t shredded_fields = shredding_state->ObjectFields();

	for (idx_t i = 0; i < nested_data.child_count; i++) {
		auto keys_index = variant.GetKeysIndex(row, i + nested_data.children_idx);
		auto &key = variant.GetKey(row, keys_index);

		if (shredded_fields.count(key)) {
			//! This field is shredded on, omit it from the value
			continue;
		}
		child_indices.push_back(i);
	}
	return child_indices;
}

static idx_t AnalyzeValueData(const UnifiedVariantVectorData &variant, idx_t row, uint32_t values_index,
                              vector<uint32_t> &offsets, optional_ptr<ShreddingState> shredding_state) {
	idx_t total_size = 0;
	//! Every value has at least a value header
	total_size++;

	idx_t offset_size = offsets.size();
	VariantLogicalType type_id = VariantLogicalType::VARIANT_NULL;
	if (variant.RowIsValid(row)) {
		type_id = variant.GetTypeId(row, values_index);
	}
	switch (type_id) {
	case VariantLogicalType::OBJECT: {
		auto nested_data = VariantUtils::DecodeNestedData(variant, row, values_index);

		//! Calculate value and key offsets for all children
		idx_t total_offset = 0;
		uint32_t highest_keys_index = 0;

		auto child_indices = GetChildIndices(variant, row, nested_data, shredding_state);
		if (nested_data.child_count && child_indices.empty()) {
			//! All fields of the object are shredded, omit the object entirely
			return 0;
		}

		auto num_elements = child_indices.size();
		offsets.resize(offset_size + num_elements + 1);

		for (idx_t entry = 0; entry < child_indices.size(); entry++) {
			auto i = child_indices[entry];
			auto keys_index = variant.GetKeysIndex(row, i + nested_data.children_idx);
			auto values_index = variant.GetValuesIndex(row, i + nested_data.children_idx);
			offsets[offset_size + entry] = total_offset;

			total_offset += AnalyzeValueData(variant, row, values_index, offsets, nullptr);
			highest_keys_index = MaxValue(highest_keys_index, keys_index);
		}
		offsets[offset_size + num_elements] = total_offset;

		//! Calculate the sizes for the objects value data
		auto field_id_size = CalculateByteLength(highest_keys_index);
		auto field_offset_size = CalculateByteLength(total_offset);
		const bool is_large = num_elements > NumericLimits<uint8_t>::Maximum();

		//! Now add the sizes for the objects value data
		if (is_large) {
			total_size += sizeof(uint32_t);
		} else {
			total_size += sizeof(uint8_t);
		}
		total_size += num_elements * field_id_size;
		total_size += (num_elements + 1) * field_offset_size;
		total_size += total_offset;
		break;
	}
	case VariantLogicalType::ARRAY: {
		auto nested_data = VariantUtils::DecodeNestedData(variant, row, values_index);

		idx_t total_offset = 0;
		offsets.resize(offset_size + nested_data.child_count + 1);
		for (idx_t i = 0; i < nested_data.child_count; i++) {
			auto values_index = variant.GetValuesIndex(row, i + nested_data.children_idx);
			offsets[offset_size + i] = total_offset;

			total_offset += AnalyzeValueData(variant, row, values_index, offsets, nullptr);
		}
		offsets[offset_size + nested_data.child_count] = total_offset;

		auto field_offset_size = CalculateByteLength(total_offset);
		auto num_elements = nested_data.child_count;
		const bool is_large = num_elements > NumericLimits<uint8_t>::Maximum();

		if (is_large) {
			total_size += sizeof(uint32_t);
		} else {
			total_size += sizeof(uint8_t);
		}
		total_size += (num_elements + 1) * field_offset_size;
		total_size += total_offset;
		break;
	}
	case VariantLogicalType::BLOB:
	case VariantLogicalType::VARCHAR: {
		auto string_value = VariantUtils::DecodeStringData(variant, row, values_index);
		total_size += string_value.GetSize();
		if (type_id == VariantLogicalType::BLOB || string_value.GetSize() > 64) {
			//! Save as regular string value
			total_size += sizeof(uint32_t);
		}
		break;
	}
	case VariantLogicalType::VARIANT_NULL:
	case VariantLogicalType::BOOL_TRUE:
	case VariantLogicalType::BOOL_FALSE:
		break;
	case VariantLogicalType::INT8:
		total_size += sizeof(uint8_t);
		break;
	case VariantLogicalType::INT16:
		total_size += sizeof(uint16_t);
		break;
	case VariantLogicalType::INT32:
		total_size += sizeof(uint32_t);
		break;
	case VariantLogicalType::INT64:
		total_size += sizeof(uint64_t);
		break;
	case VariantLogicalType::FLOAT:
		total_size += sizeof(float);
		break;
	case VariantLogicalType::DOUBLE:
		total_size += sizeof(double);
		break;
	case VariantLogicalType::DECIMAL: {
		auto decimal_data = VariantUtils::DecodeDecimalData(variant, row, values_index);
		total_size += 1;
		if (decimal_data.width <= 9) {
			total_size += sizeof(int32_t);
		} else if (decimal_data.width <= 18) {
			total_size += sizeof(int64_t);
		} else if (decimal_data.width <= 38) {
			total_size += sizeof(uhugeint_t);
		} else {
			throw InvalidInputException("Can't convert VARIANT DECIMAL(%d, %d) to Parquet VARIANT", decimal_data.width,
			                            decimal_data.scale);
		}
		break;
	}
	case VariantLogicalType::UUID:
		total_size += sizeof(uhugeint_t);
		break;
	case VariantLogicalType::DATE:
		total_size += sizeof(uint32_t);
		break;
	case VariantLogicalType::TIME_MICROS:
	case VariantLogicalType::TIMESTAMP_MICROS:
	case VariantLogicalType::TIMESTAMP_NANOS:
	case VariantLogicalType::TIMESTAMP_MICROS_TZ:
		total_size += sizeof(uint64_t);
		break;
	case VariantLogicalType::INTERVAL:
	case VariantLogicalType::BIGNUM:
	case VariantLogicalType::BITSTRING:
	case VariantLogicalType::TIMESTAMP_MILIS:
	case VariantLogicalType::TIMESTAMP_SEC:
	case VariantLogicalType::TIME_MICROS_TZ:
	case VariantLogicalType::TIME_NANOS:
	case VariantLogicalType::UINT8:
	case VariantLogicalType::UINT16:
	case VariantLogicalType::UINT32:
	case VariantLogicalType::UINT64:
	case VariantLogicalType::UINT128:
	case VariantLogicalType::INT128:
	default:
		throw InvalidInputException("Can't convert VARIANT of type '%s' to Parquet VARIANT",
		                            EnumUtil::ToString(type_id));
	}

	return total_size;
}

template <VariantPrimitiveType TYPE_ID>
void WritePrimitiveTypeHeader(data_ptr_t &value_data) {
	uint8_t value_header = 0;
	value_header |= static_cast<uint8_t>(VariantBasicType::PRIMITIVE);
	value_header |= static_cast<uint8_t>(TYPE_ID) << 2;

	*value_data = value_header;
	value_data++;
}

template <class T>
void CopySimplePrimitiveData(const UnifiedVariantVectorData &variant, data_ptr_t &value_data, idx_t row,
                             uint32_t values_index) {
	auto byte_offset = variant.GetByteOffset(row, values_index);
	auto data = const_data_ptr_cast(variant.GetData(row).GetData());
	auto ptr = data + byte_offset;
	memcpy(value_data, ptr, sizeof(T));
	value_data += sizeof(T);
}

void CopyUUIDData(const UnifiedVariantVectorData &variant, data_ptr_t &value_data, idx_t row, uint32_t values_index) {

	auto byte_offset = variant.GetByteOffset(row, values_index);
	auto data = const_data_ptr_cast(variant.GetData(row).GetData());
	auto ptr = data + byte_offset;

	auto uuid = Load<uhugeint_t>(ptr);
	BaseUUID::ToBlob(uuid, value_data);
	value_data += sizeof(uhugeint_t);
}

static void WritePrimitiveValueData(const UnifiedVariantVectorData &variant, idx_t row, uint32_t values_index,
                                    data_ptr_t &value_data, const vector<uint32_t> &offsets, idx_t &offset_index) {
	VariantLogicalType type_id = VariantLogicalType::VARIANT_NULL;
	if (variant.RowIsValid(row)) {
		type_id = variant.GetTypeId(row, values_index);
	}

	D_ASSERT(type_id != VariantLogicalType::OBJECT && type_id != VariantLogicalType::ARRAY);
	switch (type_id) {
	case VariantLogicalType::BLOB:
	case VariantLogicalType::VARCHAR: {
		auto string_value = VariantUtils::DecodeStringData(variant, row, values_index);
		auto string_size = string_value.GetSize();
		if (type_id == VariantLogicalType::BLOB || string_size > 64) {
			if (type_id == VariantLogicalType::BLOB) {
				WritePrimitiveTypeHeader<VariantPrimitiveType::BINARY>(value_data);
			} else {
				WritePrimitiveTypeHeader<VariantPrimitiveType::STRING>(value_data);
			}
			Store<uint32_t>(string_size, value_data);
			value_data += sizeof(uint32_t);
		} else {
			uint8_t value_header = 0;
			value_header |= static_cast<uint8_t>(VariantBasicType::SHORT_STRING);
			value_header |= static_cast<uint8_t>(string_size) << 2;

			*value_data = value_header;
			value_data++;
		}
		memcpy(value_data, reinterpret_cast<const char *>(string_value.GetData()), string_size);
		value_data += string_size;
		break;
	}
	case VariantLogicalType::VARIANT_NULL:
		WritePrimitiveTypeHeader<VariantPrimitiveType::NULL_TYPE>(value_data);
		break;
	case VariantLogicalType::BOOL_TRUE:
		WritePrimitiveTypeHeader<VariantPrimitiveType::BOOLEAN_TRUE>(value_data);
		break;
	case VariantLogicalType::BOOL_FALSE:
		WritePrimitiveTypeHeader<VariantPrimitiveType::BOOLEAN_FALSE>(value_data);
		break;
	case VariantLogicalType::INT8:
		WritePrimitiveTypeHeader<VariantPrimitiveType::INT8>(value_data);
		CopySimplePrimitiveData<int8_t>(variant, value_data, row, values_index);
		break;
	case VariantLogicalType::INT16:
		WritePrimitiveTypeHeader<VariantPrimitiveType::INT16>(value_data);
		CopySimplePrimitiveData<int16_t>(variant, value_data, row, values_index);
		break;
	case VariantLogicalType::INT32:
		WritePrimitiveTypeHeader<VariantPrimitiveType::INT32>(value_data);
		CopySimplePrimitiveData<int32_t>(variant, value_data, row, values_index);
		break;
	case VariantLogicalType::INT64:
		WritePrimitiveTypeHeader<VariantPrimitiveType::INT64>(value_data);
		CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index);
		break;
	case VariantLogicalType::FLOAT:
		WritePrimitiveTypeHeader<VariantPrimitiveType::FLOAT>(value_data);
		CopySimplePrimitiveData<float>(variant, value_data, row, values_index);
		break;
	case VariantLogicalType::DOUBLE:
		WritePrimitiveTypeHeader<VariantPrimitiveType::DOUBLE>(value_data);
		CopySimplePrimitiveData<double>(variant, value_data, row, values_index);
		break;
	case VariantLogicalType::UUID:
		WritePrimitiveTypeHeader<VariantPrimitiveType::UUID>(value_data);
		CopyUUIDData(variant, value_data, row, values_index);
		break;
	case VariantLogicalType::DATE:
		WritePrimitiveTypeHeader<VariantPrimitiveType::DATE>(value_data);
		CopySimplePrimitiveData<int32_t>(variant, value_data, row, values_index);
		break;
	case VariantLogicalType::TIME_MICROS:
		WritePrimitiveTypeHeader<VariantPrimitiveType::TIME_NTZ_MICROS>(value_data);
		CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index);
		break;
	case VariantLogicalType::TIMESTAMP_MICROS:
		WritePrimitiveTypeHeader<VariantPrimitiveType::TIMESTAMP_NTZ_MICROS>(value_data);
		CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index);
		break;
	case VariantLogicalType::TIMESTAMP_NANOS:
		WritePrimitiveTypeHeader<VariantPrimitiveType::TIMESTAMP_NTZ_NANOS>(value_data);
		CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index);
		break;
	case VariantLogicalType::TIMESTAMP_MICROS_TZ:
		WritePrimitiveTypeHeader<VariantPrimitiveType::TIMESTAMP_MICROS>(value_data);
		CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index);
		break;
	case VariantLogicalType::DECIMAL: {
		auto decimal_data = VariantUtils::DecodeDecimalData(variant, row, values_index);

		if (decimal_data.width <= 4 || decimal_data.width > 38) {
			throw InvalidInputException("Can't convert VARIANT DECIMAL(%d, %d) to Parquet VARIANT", decimal_data.width,
			                            decimal_data.scale);
		} else if (decimal_data.width <= 9) {
			WritePrimitiveTypeHeader<VariantPrimitiveType::DECIMAL4>(value_data);
			Store<int8_t>(decimal_data.scale, value_data);
			value_data++;
			memcpy(value_data, decimal_data.value_ptr, sizeof(int32_t));
			value_data += sizeof(int32_t);
		} else if (decimal_data.width <= 18) {
			WritePrimitiveTypeHeader<VariantPrimitiveType::DECIMAL8>(value_data);
			Store<int8_t>(decimal_data.scale, value_data);
			value_data++;
			memcpy(value_data, decimal_data.value_ptr, sizeof(int64_t));
			value_data += sizeof(int64_t);
		} else if (decimal_data.width <= 38) {
			WritePrimitiveTypeHeader<VariantPrimitiveType::DECIMAL16>(value_data);
			Store<int8_t>(decimal_data.scale, value_data);
			value_data++;
			memcpy(value_data, decimal_data.value_ptr, sizeof(hugeint_t));
			value_data += sizeof(hugeint_t);
		} else {
			throw InternalException(
			    "Uncovered VARIANT(DECIMAL) -> Parquet VARIANT conversion for type 'DECIMAL(%d, %d)'",
			    decimal_data.width, decimal_data.scale);
		}
		break;
	}
	case VariantLogicalType::INTERVAL:
	case VariantLogicalType::BIGNUM:
	case VariantLogicalType::BITSTRING:
	case VariantLogicalType::TIMESTAMP_MILIS:
	case VariantLogicalType::TIMESTAMP_SEC:
	case VariantLogicalType::TIME_MICROS_TZ:
	case VariantLogicalType::TIME_NANOS:
	case VariantLogicalType::UINT8:
	case VariantLogicalType::UINT16:
	case VariantLogicalType::UINT32:
	case VariantLogicalType::UINT64:
	case VariantLogicalType::UINT128:
	case VariantLogicalType::INT128:
	default:
		throw InvalidInputException("Can't convert VARIANT of type '%s' to Parquet VARIANT",
		                            EnumUtil::ToString(type_id));
	}
}

//! Writes the Parquet VARIANT 'value' encoding of the value at (row, values_index) to 'value_data' and advances the
//! pointer to the first byte behind the written value.
//! 'offsets' has to be the vector that AnalyzeValueData filled for this same value, 'offset_index' is the position
//! of the next unread entry and is advanced by (number of children + 1) for every OBJECT/ARRAY that is written.
//! 'shredding_state' only affects this value itself, children are always written without a shredding state.
//! Rows that are not valid are written as a VARIANT_NULL.
static void WriteValueData(const UnifiedVariantVectorData &variant, idx_t row, uint32_t values_index,
                           data_ptr_t &value_data, const vector<uint32_t> &offsets, idx_t &offset_index,
                           optional_ptr<ShreddingState> shredding_state) {

	VariantLogicalType type_id = VariantLogicalType::VARIANT_NULL;
	if (variant.RowIsValid(row)) {
		type_id = variant.GetTypeId(row, values_index);
	}
	if (type_id == VariantLogicalType::OBJECT) {
		auto nested_data = VariantUtils::DecodeNestedData(variant, row, values_index);

		//! -- Object value header --

		//! Only the fields that are not shredded on are written
		auto child_indices = GetChildIndices(variant, row, nested_data, shredding_state);
		if (nested_data.child_count && child_indices.empty()) {
			throw InternalException(
			    "The entire object should be omitted, should have been handled by the Analyze step already");
		}
		auto num_elements = child_indices.size();

		//! Determine the 'field_id_size'
		uint32_t highest_keys_index = 0;
		for (auto &i : child_indices) {
			auto keys_index = variant.GetKeysIndex(row, i + nested_data.children_idx);
			highest_keys_index = MaxValue(highest_keys_index, keys_index);
		}
		auto field_id_size = CalculateByteLength(highest_keys_index);

		//! The last entry of this object holds the combined size of its children, it decides the offset width
		uint32_t last_offset = 0;
		if (num_elements) {
			last_offset = offsets[offset_index + num_elements];
		}
		//! Skip the entries of this object, the entries of the children follow directly behind them
		offset_index += num_elements + 1;
		auto field_offset_size = CalculateByteLength(last_offset);

		const bool is_large = num_elements > NumericLimits<uint8_t>::Maximum();

		//! basic type in the lowest bits, then 'field_offset_size - 1' (bit 2), 'field_id_size - 1' (bit 4) and
		//! 'is_large' (bit 6)
		uint8_t value_header = 0;
		value_header |= static_cast<uint8_t>(VariantBasicType::OBJECT);
		value_header |= static_cast<uint8_t>(is_large) << 6;
		value_header |= (static_cast<uint8_t>(field_id_size) - 1) << 4;
		value_header |= (static_cast<uint8_t>(field_offset_size) - 1) << 2;

#ifdef DEBUG
		auto object_value_header = VariantValueMetadata::FromHeaderByte(value_header);
		D_ASSERT(object_value_header.basic_type == VariantBasicType::OBJECT);
		D_ASSERT(object_value_header.is_large == is_large);
		D_ASSERT(object_value_header.field_offset_size == field_offset_size);
		D_ASSERT(object_value_header.field_id_size == field_id_size);
#endif

		*value_data = value_header;
		value_data++;

		//! Write the 'num_elements'
		if (is_large) {
			Store<uint32_t>(static_cast<uint32_t>(num_elements), value_data);
			value_data += sizeof(uint32_t);
		} else {
			Store<uint8_t>(static_cast<uint8_t>(num_elements), value_data);
			value_data += sizeof(uint8_t);
		}

		//! Write the 'field_id' entries
		for (auto &i : child_indices) {
			auto keys_index = variant.GetKeysIndex(row, i + nested_data.children_idx);
			memcpy(value_data, reinterpret_cast<data_ptr_t>(&keys_index), field_id_size);
			value_data += field_id_size;
		}

		//! Write the 'field_offset' entries and the child 'value's
		//! The children are written behind the (num_elements + 1) offsets, 'children_ptr' tracks that position
		auto children_ptr = value_data + ((num_elements + 1) * field_offset_size);
		idx_t total_offset = 0;
		for (auto &i : child_indices) {
			auto values_index = variant.GetValuesIndex(row, i + nested_data.children_idx);

			memcpy(value_data, reinterpret_cast<data_ptr_t>(&total_offset), field_offset_size);
			value_data += field_offset_size;
			auto start_ptr = children_ptr;
			WriteValueData(variant, row, values_index, children_ptr, offsets, offset_index, nullptr);
			total_offset += (children_ptr - start_ptr);
		}
		//! The final offset marks the end of the last child
		memcpy(value_data, reinterpret_cast<data_ptr_t>(&total_offset), field_offset_size);
		value_data += field_offset_size;
		D_ASSERT(children_ptr - total_offset == value_data);
		//! Continue behind the last child
		value_data = children_ptr;
	} else if (type_id == VariantLogicalType::ARRAY) {
		auto nested_data = VariantUtils::DecodeNestedData(variant, row, values_index);

		//! -- Array value header --

		//! The last entry of this array holds the combined size of its children, it decides the offset width
		uint32_t last_offset = 0;
		if (nested_data.child_count) {
			last_offset = offsets[offset_index + nested_data.child_count];
		}
		//! Skip the entries of this array, the entries of the children follow directly behind them
		offset_index += nested_data.child_count + 1;
		auto field_offset_size = CalculateByteLength(last_offset);

		auto num_elements = nested_data.child_count;
		const bool is_large = num_elements > NumericLimits<uint8_t>::Maximum();

		//! basic type in the lowest bits, then 'field_offset_size - 1' (bit 2) and 'is_large' (bit 4)
		uint8_t value_header = 0;
		value_header |= static_cast<uint8_t>(VariantBasicType::ARRAY);
		value_header |= static_cast<uint8_t>(is_large) << 4;
		value_header |= (static_cast<uint8_t>(field_offset_size) - 1) << 2;

#ifdef DEBUG
		auto array_value_header = VariantValueMetadata::FromHeaderByte(value_header);
		D_ASSERT(array_value_header.basic_type == VariantBasicType::ARRAY);
		D_ASSERT(array_value_header.is_large == is_large);
		D_ASSERT(array_value_header.field_offset_size == field_offset_size);
#endif

		*value_data = value_header;
		value_data++;

		//! Write the 'num_elements'
		if (is_large) {
			Store<uint32_t>(static_cast<uint32_t>(num_elements), value_data);
			value_data += sizeof(uint32_t);
		} else {
			Store<uint8_t>(static_cast<uint8_t>(num_elements), value_data);
			value_data += sizeof(uint8_t);
		}

		//! Write the 'field_offset' entries and the child 'value's
		//! The children are written behind the (num_elements + 1) offsets, 'children_ptr' tracks that position
		auto children_ptr = value_data + ((num_elements + 1) * field_offset_size);
		idx_t total_offset = 0;
		for (idx_t i = 0; i < nested_data.child_count; i++) {
			auto values_index = variant.GetValuesIndex(row, i + nested_data.children_idx);

			memcpy(value_data, reinterpret_cast<data_ptr_t>(&total_offset), field_offset_size);
			value_data += field_offset_size;
			auto start_ptr = children_ptr;
			WriteValueData(variant, row, values_index, children_ptr, offsets, offset_index, nullptr);
			total_offset += (children_ptr - start_ptr);
		}
		//! The final offset marks the end of the last child
		memcpy(value_data, reinterpret_cast<data_ptr_t>(&total_offset), field_offset_size);
		value_data += field_offset_size;
		D_ASSERT(children_ptr - total_offset == value_data);
		//! Continue behind the last child
		value_data = children_ptr;
	} else {
		WritePrimitiveValueData(variant, row, values_index, value_data, offsets, offset_index);
	}
}

//! Writes the 'value' blob of 'count' values into the 'value' vector.
//! For entry i: 'sel' yields the variant row (defaults to i), 'value_index_sel' the index of the value inside
//! that row (defaults to 0) and 'result_sel' the row of 'value' to write to (defaults to i).
//! Values that can be shredded are registered in 'shredding_state', their 'value' is set to NULL unless the
//! shredded type is a STRUCT, in which case the fields that are not shredded on still have to be written.
static void CreateValues(UnifiedVariantVectorData &variant, Vector &value, optional_ptr<const SelectionVector> sel,
                         optional_ptr<const SelectionVector> value_index_sel,
                         optional_ptr<const SelectionVector> result_sel, optional_ptr<ShreddingState> shredding_state,
                         idx_t count) {
	auto blobs = FlatVector::GetData<string_t>(value);
	auto &result_validity = FlatVector::Validity(value);

	for (idx_t entry = 0; entry < count; entry++) {
		//! Resolve the three indices, falling back to the defaults when a selection vector is absent
		const idx_t value_index = value_index_sel ? value_index_sel->get_index(entry) : idx_t(0);
		const idx_t row = sel ? sel->get_index(entry) : entry;
		const idx_t result_index = result_sel ? result_sel->get_index(entry) : entry;

		bool is_shredded = false;
		if (variant.RowIsValid(row) && shredding_state) {
			is_shredded = shredding_state->ValueIsShredded(variant, row, value_index);
		}
		if (is_shredded) {
			shredding_state->SetShredded(row, value_index, result_index);
			if (shredding_state->type.id() != LogicalTypeId::STRUCT) {
				//! Shredded on something other than an OBJECT: nothing is left over for the 'value', it becomes NULL
				//! For an OBJECT the fields that are not shredded on are still written below
				result_validity.SetInvalid(result_index);
				continue;
			}
		}

		//! Filled by the analyze step with the (relative) offsets of nested values, consumed by the write step
		vector<uint32_t> offsets;
		const idx_t blob_length = AnalyzeValueData(variant, row, value_index, offsets, shredding_state);
		if (blob_length == 0) {
			//! Only a shredded OBJECT without any excess fields is allowed to end up here
			(void)is_shredded;
			D_ASSERT(is_shredded);
			result_validity.SetInvalid(result_index);
			continue;
		}

		//! Allocate the blob at its final size and fill it in
		blobs[result_index] = StringVector::EmptyString(value, blob_length);
		auto &blob = blobs[result_index];
		auto write_ptr = reinterpret_cast<data_ptr_t>(blob.GetDataWriteable());

		idx_t offset_index = 0;
		WriteValueData(variant, row, value_index, write_ptr, offsets, offset_index, shredding_state);
		//! The write step has to produce exactly the amount of bytes that the analyze step computed
		D_ASSERT(data_ptr_cast(blob.GetDataWriteable() + blob_length) == write_ptr);
		blob.SetSizeAndFinalize(blob_length, blob_length);
	}
}

//! Forward declaration of a static method that is defined elsewhere in this file.
//! WriteTypedObjectValues and WriteTypedArrayValues below call it to write their children.
//! The selection vectors are passed as optional pointers, 'count' is the amount of values to write
static void WriteVariantValues(UnifiedVariantVectorData &variant, Vector &result,
                               optional_ptr<const SelectionVector> sel,
                               optional_ptr<const SelectionVector> value_index_sel,
                               optional_ptr<const SelectionVector> result_sel, idx_t count);

//! Writes the shredded fields of 'count' OBJECT values into the STRUCT vector 'result'.
//! For entry i: sel[i] is the variant row, value_index_sel[i] the index of the OBJECT inside that row and
//! result_sel[i] the row of 'result' to write to.
//! Every field of the STRUCT is looked up by key in each OBJECT: where it is missing, the first (and if present
//! the second) child vector of that field is set to NULL, the values that were found are written through
//! WriteVariantValues.
static void WriteTypedObjectValues(UnifiedVariantVectorData &variant, Vector &result, const SelectionVector &sel,
                                   const SelectionVector &value_index_sel, const SelectionVector &result_sel,
                                   idx_t count) {
	auto &struct_type = result.GetType();
	D_ASSERT(struct_type.id() == LogicalTypeId::STRUCT);

	auto &validity = FlatVector::Validity(result);
	(void)validity;

	//! Decode the nested data of every object once, it is reused for the lookup of each field
	auto object_data = make_unsafe_uniq_array_uninitialized<VariantNestedData>(count);
	for (idx_t i = 0; i < count; i++) {
		const auto row = sel[i];
		const auto value_index = value_index_sel[i];
		//! When we're shredding an object, the top-level struct of it should always be valid
		D_ASSERT(validity.RowIsValid(result_sel[i]));
		D_ASSERT(variant.GetTypeId(row, value_index) == VariantLogicalType::OBJECT);
		object_data[i] = VariantUtils::DecodeNestedData(variant, row, value_index);
	}

	auto &field_types = StructType::GetChildTypes(struct_type);
	auto &field_vectors = StructVector::GetEntries(result);
	D_ASSERT(field_types.size() == field_vectors.size());

	//! Receives the 'values_index' of the looked up field for every object
	SelectionVector found_values_indexes(count);
	//! Only used when a field is missing in some of the objects
	SelectionVector present_row_sel(count);
	SelectionVector present_result_sel(count);

	for (idx_t field_idx = 0; field_idx < field_types.size(); field_idx++) {
		auto &field_vector = *field_vectors[field_idx];
		D_ASSERT(field_vector.GetType() == field_types[field_idx].second);

		//! Look the field up by its name
		VariantPathComponent lookup;
		lookup.lookup_mode = VariantChildLookupMode::BY_KEY;
		lookup.key = field_types[field_idx].first;

		ValidityMask found(count);
		VariantUtils::FindChildValues(variant, lookup, sel, found_values_indexes, found, object_data.get(), count);

		if (found.AllValid()) {
			//! Every object has this field, the incoming selection vectors can be used as-is
			WriteVariantValues(variant, field_vector, &sel, found_values_indexes, result_sel, count);
			continue;
		}

		//! The field is missing for some of the rows, compact the selection vectors down to the rows that have it
		auto &field_entries = StructVector::GetEntries(field_vector);
		idx_t present_count = 0;
		for (idx_t i = 0; i < count; i++) {
			if (found.RowIsValid(i)) {
				present_row_sel[present_count] = sel[i];
				//! Compacted in place, 'present_count' never runs ahead of 'i'
				found_values_indexes[present_count] = found_values_indexes[i];
				present_result_sel[present_count] = result_sel[i];
				present_count++;
				continue;
			}
			//! The field is missing, set it to null
			FlatVector::SetNull(*field_entries[0], result_sel[i], true);
			if (field_entries.size() >= 2) {
				FlatVector::SetNull(*field_entries[1], result_sel[i], true);
			}
		}

		if (present_count) {
			//! If not all rows are missing this field, write the values for it
			WriteVariantValues(variant, field_vector, present_row_sel, found_values_indexes, present_result_sel,
			                   present_count);
		}
	}
}

//! Writes the shredded ARRAY values selected by (sel, value_index_sel) into the LIST vector 'result'.
//! First pass: decode every array and assign it a contiguous [offset, offset + length) range in the list child.
//! Second pass: build flattened selection vectors covering all array elements and write them into the list child.
static void WriteTypedArrayValues(UnifiedVariantVectorData &variant, Vector &result, const SelectionVector &sel,
                                  const SelectionVector &value_index_sel, const SelectionVector &result_sel,
                                  idx_t count) {
	auto list_entries = FlatVector::GetData<list_entry_t>(result);
	auto arrays = make_unsafe_uniq_array_uninitialized<VariantNestedData>(count);

	//! Decode the nested data of every array and lay the list entries out back-to-back
	idx_t child_total = 0;
	for (idx_t i = 0; i < count; i++) {
		D_ASSERT(variant.GetTypeId(sel[i], value_index_sel[i]) == VariantLogicalType::ARRAY);
		auto &array_info = arrays[i];
		array_info = VariantUtils::DecodeNestedData(variant, sel[i], value_index_sel[i]);

		auto &entry = list_entries[result_sel[i]];
		entry.offset = child_total;
		entry.length = array_info.child_count;

		child_total += array_info.child_count;
	}
	ListVector::Reserve(result, child_total);
	ListVector::SetListSize(result, child_total);

	//! One slot per array element: the variant row it lives on, its values index, and the child row to write to
	SelectionVector flat_row_sel;
	SelectionVector flat_value_index_sel;
	SelectionVector flat_result_sel;
	flat_row_sel.Initialize(child_total);
	flat_value_index_sel.Initialize(child_total);
	flat_result_sel.Initialize(child_total);

	for (idx_t i = 0; i < count; i++) {
		const auto source_row = sel[i];
		const auto &entry = list_entries[result_sel[i]];
		const auto first_child = arrays[i].children_idx;

		for (idx_t child = 0; child < entry.length; child++) {
			const auto target = entry.offset + child;
			//! All elements of an array live on the same VARIANT row as the array itself
			flat_row_sel[target] = source_row;
			flat_value_index_sel[target] = variant.GetValuesIndex(source_row, first_child + child);
			//! The list child is written densely, so the result row is the flattened position
			flat_result_sel[target] = target;
		}
	}

	auto &list_child = ListVector::GetEntry(result);
	WriteVariantValues(variant, list_child, flat_row_sel, flat_value_index_sel, flat_result_sel, child_total);
}

//! Copies fixed-size primitive values out of the variant's data blob into the flat 'result' vector.
//! For entry i: 'sel' gives the VARIANT row, 'value_index_sel' the values index on that row, and 'result_sel' the
//! row of 'result' to write to. The separate 'result_sel' is needed because e.g. all elements of a list are stored
//! on the same VARIANT row, while they are written to distinct rows of the flattened child vector.
//! 'type_size' is the number of bytes copied per value.
static void WriteShreddedPrimitive(UnifiedVariantVectorData &variant, Vector &result, const SelectionVector &sel,
                                   const SelectionVector &value_index_sel, const SelectionVector &result_sel,
                                   idx_t count, idx_t type_size) {
	const auto target = FlatVector::GetData(result);
	for (idx_t i = 0; i < count; i++) {
		const auto row = sel[i];
		D_ASSERT(variant.RowIsValid(row));

		//! Locate the value inside the data blob of this row
		const auto blob_offset = variant.GetByteOffset(row, value_index_sel[i]);
		const auto source = variant.GetData(row).GetData() + blob_offset;

		//! Raw byte copy into the slot of the destination row
		memcpy(target + type_size * result_sel[i], source, type_size);
	}
}

//! Copies DECIMAL values with physical storage type T from the variant into the flat 'result' vector.
//! Entry i reads (sel[i], value_index_sel[i]) and writes sizeof(T) bytes to row result_sel[i] of 'result'.
template <class T>
static void WriteShreddedDecimal(UnifiedVariantVectorData &variant, Vector &result, const SelectionVector &sel,
                                 const SelectionVector &value_index_sel, const SelectionVector &result_sel,
                                 idx_t count) {
	const auto target = FlatVector::GetData(result);
	for (idx_t i = 0; i < count; i++) {
		const auto row = sel[i];
		const auto value_index = value_index_sel[i];
		D_ASSERT(variant.RowIsValid(row) && variant.GetTypeId(row, value_index) == VariantLogicalType::DECIMAL);

		const auto decimal = VariantUtils::DecodeDecimalData(variant, row, value_index);
		//! The decimal's width has to fit in the physical type we are writing
		D_ASSERT(decimal.width <= DecimalWidth<T>::max);
		memcpy(target + sizeof(T) * result_sel[i], decimal.value_ptr, sizeof(T));
	}
}

//! Copies VARCHAR / BLOB values from the variant into the 'result' vector.
//! Entry i reads (sel[i], value_index_sel[i]) and writes to row result_sel[i]; the string is added to 'result'
//! through StringVector::AddStringOrBlob.
static void WriteShreddedString(UnifiedVariantVectorData &variant, Vector &result, const SelectionVector &sel,
                                const SelectionVector &value_index_sel, const SelectionVector &result_sel,
                                idx_t count) {
	auto strings = FlatVector::GetData<string_t>(result);
	for (idx_t i = 0; i < count; i++) {
		const auto row = sel[i];
		const auto value_index = value_index_sel[i];
		D_ASSERT(variant.RowIsValid(row) && (variant.GetTypeId(row, value_index) == VariantLogicalType::VARCHAR ||
		                                     variant.GetTypeId(row, value_index) == VariantLogicalType::BLOB));

		const auto decoded = VariantUtils::DecodeStringData(variant, row, value_index);
		strings[result_sel[i]] = StringVector::AddStringOrBlob(result, decoded);
	}
}

//! Writes BOOLEAN values into the flat 'result' vector.
//! Booleans are read from the type id alone (BOOL_TRUE / BOOL_FALSE); the data blob is not consulted here.
static void WriteShreddedBoolean(UnifiedVariantVectorData &variant, Vector &result, const SelectionVector &sel,
                                 const SelectionVector &value_index_sel, const SelectionVector &result_sel,
                                 idx_t count) {
	auto flags = FlatVector::GetData<bool>(result);
	for (idx_t i = 0; i < count; i++) {
		const auto row = sel[i];
		D_ASSERT(variant.RowIsValid(row));

		const auto type_id = variant.GetTypeId(row, value_index_sel[i]);
		D_ASSERT(type_id == VariantLogicalType::BOOL_FALSE || type_id == VariantLogicalType::BOOL_TRUE);

		flags[result_sel[i]] = (type_id == VariantLogicalType::BOOL_TRUE);
	}
}

//! Dispatches on the (non-nested) type of 'result' to the matching shredded writer.
//! Throws an InvalidInputException for types that can not be shredded on.
static void WriteTypedPrimitiveValues(UnifiedVariantVectorData &variant, Vector &result, const SelectionVector &sel,
                                      const SelectionVector &value_index_sel, const SelectionVector &result_sel,
                                      idx_t count) {
	auto &target_type = result.GetType();
	D_ASSERT(!target_type.IsNested());

	switch (target_type.id()) {
	case LogicalTypeId::BOOLEAN:
		WriteShreddedBoolean(variant, result, sel, value_index_sel, result_sel, count);
		return;
	case LogicalTypeId::VARCHAR:
	case LogicalTypeId::BLOB:
		WriteShreddedString(variant, result, sel, value_index_sel, result_sel, count);
		return;
	case LogicalTypeId::DECIMAL: {
		//! The physical storage type decides which decimal writer is used
		const auto storage_type = target_type.InternalType();
		if (storage_type == PhysicalType::INT32) {
			//! DECIMAL4
			WriteShreddedDecimal<int32_t>(variant, result, sel, value_index_sel, result_sel, count);
			return;
		}
		if (storage_type == PhysicalType::INT64) {
			//! DECIMAL8
			WriteShreddedDecimal<int64_t>(variant, result, sel, value_index_sel, result_sel, count);
			return;
		}
		if (storage_type == PhysicalType::INT128) {
			//! DECIMAL16
			WriteShreddedDecimal<hugeint_t>(variant, result, sel, value_index_sel, result_sel, count);
			return;
		}
		throw InvalidInputException("Can't shred on column of type '%s'", target_type.ToString());
	}
	case LogicalTypeId::TINYINT:
	case LogicalTypeId::SMALLINT:
	case LogicalTypeId::INTEGER:
	case LogicalTypeId::BIGINT:
	case LogicalTypeId::FLOAT:
	case LogicalTypeId::DOUBLE:
	case LogicalTypeId::DATE:
	case LogicalTypeId::TIME:
	case LogicalTypeId::TIMESTAMP_TZ:
	case LogicalTypeId::TIMESTAMP:
	case LogicalTypeId::TIMESTAMP_NS:
	case LogicalTypeId::UUID:
		//! Fixed-size values are copied byte-for-byte, sized by the physical type
		WriteShreddedPrimitive(variant, result, sel, value_index_sel, result_sel, count,
		                       GetTypeIdSize(target_type.InternalType()));
		return;
	default:
		throw InvalidInputException("Can't shred on type: %s", target_type.ToString());
	}
}

//! Writes the 'typed_value' vector: STRUCT is handled as a shredded OBJECT, LIST as a shredded ARRAY,
//! and every other type as a primitive.
static void WriteTypedValues(UnifiedVariantVectorData &variant, Vector &result, const SelectionVector &sel,
                             const SelectionVector &value_index_sel, const SelectionVector &result_sel, idx_t count) {
	switch (result.GetType().id()) {
	case LogicalTypeId::STRUCT:
		//! Shredded OBJECT
		WriteTypedObjectValues(variant, result, sel, value_index_sel, result_sel, count);
		break;
	case LogicalTypeId::LIST:
		//! Shredded ARRAY
		WriteTypedArrayValues(variant, result, sel, value_index_sel, result_sel, count);
		break;
	default:
		//! Primitive types
		WriteTypedPrimitiveValues(variant, result, sel, value_index_sel, result_sel, count);
		break;
	}
}

//! Writes the 'value' (and, if present, 'typed_value') children of the STRUCT vector 'result'.
//!
//! \param variant          The unified format of the input VARIANT vector
//! \param result           STRUCT vector; its children are looked up by the names "value" and "typed_value"
//! \param sel              Optional selection of VARIANT rows to read from
//! \param value_index_sel  Optional selection of the values index to read for each entry
//! \param result_sel       Optional selection of the rows of 'result' to write to (identity when not set)
//! \param count            Number of entries to process
//!
//! When 'typed_value' is present, a ShreddingState is handed to CreateValues; afterwards the rows it selected are
//! written to 'typed_value' and every other processed row of 'typed_value' is set to NULL.
static void WriteVariantValues(UnifiedVariantVectorData &variant, Vector &result,
                               optional_ptr<const SelectionVector> sel,
                               optional_ptr<const SelectionVector> value_index_sel,
                               optional_ptr<const SelectionVector> result_sel, idx_t count) {
	optional_ptr<Vector> value;
	optional_ptr<Vector> typed_value;

	//! Locate the 'value' and 'typed_value' children by name
	auto &result_type = result.GetType();
	D_ASSERT(result_type.id() == LogicalTypeId::STRUCT);
	auto &child_types = StructType::GetChildTypes(result_type);
	auto &child_vectors = StructVector::GetEntries(result);
	D_ASSERT(child_types.size() == child_vectors.size());
	for (idx_t i = 0; i < child_types.size(); i++) {
		auto &name = child_types[i].first;
		if (name == "value") {
			value = child_vectors[i].get();
		} else if (name == "typed_value") {
			typed_value = child_vectors[i].get();
		}
	}

	if (!typed_value) {
		//! Nothing to shred on, everything goes into 'value'
		CreateValues(variant, *value, sel, value_index_sel, result_sel, nullptr, count);
		return;
	}

	ShreddingState shredding_state(typed_value->GetType(), count);
	CreateValues(variant, *value, sel, value_index_sel, result_sel, &shredding_state, count);

	if (shredding_state.count) {
		WriteTypedValues(variant, *typed_value, shredding_state.shredded_sel, shredding_state.values_index_sel,
		                 shredding_state.result_sel, shredding_state.count);
	}

	//! 'shredding_state.result_sel' will always be a subset of 'result_sel', set the rows not in the subset to NULL.
	//! The single forward cursor relies on both selections listing rows in the same order.
	//! If nothing was shredded ('shredding_state.count' is 0), this sets every processed row to NULL.
	idx_t sel_idx = 0;
	for (idx_t i = 0; i < count; i++) {
		auto original_index = result_sel ? result_sel->get_index(i) : i;
		if (sel_idx < shredding_state.count && shredding_state.result_sel[sel_idx] == original_index) {
			sel_idx++;
			continue;
		}
		FlatVector::SetNull(*typed_value, original_index, true);
	}
}

//! Scalar function implementation: converts a DuckDB VARIANT column into the Parquet VARIANT layout.
//!
//! DuckDB Variant:
//! - keys = VARCHAR[]
//! - children = STRUCT(keys_index UINTEGER, values_index UINTEGER)[]
//! - values = STRUCT(type_id UTINYINT, byte_offset UINTEGER)[]
//! - data = BLOB
//!
//! Parquet VARIANT:
//! - metadata = BLOB
//! - value = BLOB
static void ToParquetVariant(DataChunk &input, ExpressionState &state, Vector &result) {
	const auto row_count = input.size();
	auto &source = input.data[0];

	//! Bring the (nested) input vector into unified format so it can be read uniformly
	RecursiveUnifiedVectorFormat unified_format;
	Vector::RecursiveToUnifiedFormat(source, row_count, unified_format);
	UnifiedVariantVectorData variant(unified_format);

	//! The first child of the result struct is the 'metadata' column
	auto &result_children = StructVector::GetEntries(result);
	CreateMetadata(variant, *result_children[0], row_count);

	//! No selection vectors: read and write every row in place
	WriteVariantValues(variant, result, nullptr, nullptr, nullptr, row_count);

	if (input.AllConstant()) {
		result.SetVectorType(VectorType::CONSTANT_VECTOR);
	}
}

//! Transforms a user-provided shredding type into the type of the 'typed_value' column.
//! - STRUCT: every field is wrapped in STRUCT(value BLOB [, typed_value <transformed field type>])
//! - LIST:   the element is wrapped in STRUCT(value BLOB [, typed_value <transformed element type>])
//! - VARIANT as a direct field / element type only produces the 'value' field (no 'typed_value')
//! - UNION, MAP, ARRAY and (top-level) VARIANT are rejected with a BinderException
//! - any other type is returned unchanged
LogicalType VariantColumnWriter::TransformTypedValueRecursive(const LogicalType &type) {
	switch (type.id()) {
	case LogicalTypeId::STRUCT: {
		//! Wrap all fields of the struct in a struct with 'value' and 'typed_value' fields
		auto &child_types = StructType::GetChildTypes(type);
		child_list_t<LogicalType> replaced_types;
		for (auto &entry : child_types) {
			child_list_t<LogicalType> child_children;
			child_children.emplace_back("value", LogicalType::BLOB);
			if (entry.second.id() != LogicalTypeId::VARIANT) {
				child_children.emplace_back("typed_value", TransformTypedValueRecursive(entry.second));
			}
			//! The temporary lists are not used afterwards, move them instead of copying
			replaced_types.emplace_back(entry.first, LogicalType::STRUCT(std::move(child_children)));
		}
		return LogicalType::STRUCT(std::move(replaced_types));
	}
	case LogicalTypeId::LIST: {
		//! Wrap the list element in a struct with 'value' and 'typed_value' fields
		auto &child_type = ListType::GetChildType(type);
		child_list_t<LogicalType> replaced_types;
		replaced_types.emplace_back("value", LogicalType::BLOB);
		if (child_type.id() != LogicalTypeId::VARIANT) {
			replaced_types.emplace_back("typed_value", TransformTypedValueRecursive(child_type));
		}
		return LogicalType::LIST(LogicalType::STRUCT(std::move(replaced_types)));
	}
	case LogicalTypeId::UNION:
	case LogicalTypeId::MAP:
	case LogicalTypeId::VARIANT:
	case LogicalTypeId::ARRAY:
		throw BinderException("'%s' can't appear inside a 'typed_value' shredded type!", type.ToString());
	default:
		return type;
	}
}

//! Builds the STRUCT type (aliased "PARQUET_VARIANT") that the transform function returns:
//! 'metadata' and 'value' BLOBs, plus a 'typed_value' field when a shredding type is provided.
static LogicalType GetParquetVariantType(optional_ptr<LogicalType> shredding = nullptr) {
	child_list_t<LogicalType> fields;
	fields.emplace_back("metadata", LogicalType::BLOB);
	fields.emplace_back("value", LogicalType::BLOB);
	if (shredding) {
		auto typed_value_type = VariantColumnWriter::TransformTypedValueRecursive(*shredding);
		fields.emplace_back("typed_value", typed_value_type);
	}

	auto parquet_variant = LogicalType::STRUCT(std::move(fields));
	parquet_variant.SetAlias("PARQUET_VARIANT");
	return parquet_variant;
}

//! Bind callback of 'variant_to_parquet_variant': determines the return type of the function.
//!
//! With a single argument the return type is the unshredded PARQUET_VARIANT struct.
//! With two arguments, the second ('shredding') must be a constant, non-NULL VARCHAR expression containing a type
//! string (i.e. 'STRUCT(my_field BOOLEAN)'); it is parsed and used to add the 'typed_value' column.
//!
//! Throws a BinderException if 'shredding' is not of type VARCHAR, is not foldable, or evaluates to NULL.
//! No bind data is produced, the function always returns nullptr.
static unique_ptr<FunctionData> BindTransform(ClientContext &context, ScalarFunction &bound_function,
                                              vector<unique_ptr<Expression>> &arguments) {
	if (arguments.empty()) {
		return nullptr;
	}

	if (arguments.size() == 2) {
		auto &shredding = *arguments[1];
		auto expr_return_type = ExpressionBinder::GetExpressionReturnType(shredding);
		expr_return_type = LogicalType::NormalizeType(expr_return_type);
		if (expr_return_type.id() != LogicalTypeId::VARCHAR) {
			throw BinderException("Optional second argument 'shredding' has to be of type VARCHAR, i.e: "
			                      "'STRUCT(my_field BOOLEAN)', found type: '%s' instead",
			                      expr_return_type);
		}
		if (!shredding.IsFoldable()) {
			throw BinderException("Optional second argument 'shredding' has to be a constant expression");
		}
		//! The expression is constant, evaluate it now to obtain the type string
		Value type_str = ExpressionExecutor::EvaluateScalar(context, shredding);
		if (type_str.IsNull()) {
			throw BinderException("Optional second argument 'shredding' can not be NULL");
		}
		auto shredded_type = TransformStringToLogicalType(type_str.GetValue<string>());
		bound_function.return_type = GetParquetVariantType(shredded_type);
	} else {
		bound_function.return_type = GetParquetVariantType();
	}

	return nullptr;
}

//! Creates the 'variant_to_parquet_variant' scalar function.
//! The declared return type is ANY; the actual return type is set by BindTransform.
ScalarFunction VariantColumnWriter::GetTransformFunction() {
	vector<LogicalType> argument_types {LogicalType::VARIANT()};
	ScalarFunction function("variant_to_parquet_variant", argument_types, LogicalType::ANY, ToParquetVariant,
	                        BindTransform);
	//! Opt out of the default NULL handling
	function.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
	return function;
}

} // namespace duckdb