should be it
This commit is contained in:
7
external/duckdb/extension/parquet/reader/variant/CMakeLists.txt
vendored
Normal file
7
external/duckdb/extension/parquet/reader/variant/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
# Builds the three Variant reader sources listed below into the OBJECT library
# 'duckdb_parquet_reader_variant'.
# NOTE(review): add_library_unity is a project-provided helper that is not defined here - confirm its semantics.
add_library_unity(
|
||||
duckdb_parquet_reader_variant OBJECT variant_binary_decoder.cpp
|
||||
variant_value.cpp variant_shredded_conversion.cpp)
|
||||
|
||||
# Appends the object files of that library to PARQUET_EXTENSION_FILES and
# publishes the updated list to the parent directory scope.
set(PARQUET_EXTENSION_FILES
|
||||
${PARQUET_EXTENSION_FILES} $<TARGET_OBJECTS:duckdb_parquet_reader_variant>
|
||||
PARENT_SCOPE)
|
||||
365
external/duckdb/extension/parquet/reader/variant/variant_binary_decoder.cpp
vendored
Normal file
365
external/duckdb/extension/parquet/reader/variant/variant_binary_decoder.cpp
vendored
Normal file
@@ -0,0 +1,365 @@
|
||||
#include "reader/variant/variant_binary_decoder.hpp"
|
||||
#include "duckdb/common/printer.hpp"
|
||||
#include "utf8proc_wrapper.hpp"
|
||||
|
||||
#include "reader/uuid_column_reader.hpp"
|
||||
|
||||
#include "duckdb/common/types/timestamp.hpp"
|
||||
#include "duckdb/common/types/decimal.hpp"
|
||||
#include "duckdb/common/types/uuid.hpp"
|
||||
#include "duckdb/common/types/time.hpp"
|
||||
#include "duckdb/common/types/date.hpp"
|
||||
#include "duckdb/common/types/blob.hpp"
|
||||
|
||||
//! Metadata header byte: version in bits 0-3, 'sorted_strings' in bit 4, 'offset_size - 1' in bits 6-7
static constexpr uint8_t VERSION_MASK = 0x0F;
static constexpr uint8_t SORTED_STRINGS_MASK = 0x01;
static constexpr uint8_t SORTED_STRINGS_SHIFT = 4;
static constexpr uint8_t OFFSET_SIZE_MINUS_ONE_MASK = 0x03;
static constexpr uint8_t OFFSET_SIZE_MINUS_ONE_SHIFT = 6;

//! Value header byte: basic type in bits 0-1, the type specific header in the remaining bits
static constexpr uint8_t BASIC_TYPE_MASK = 0x03;
static constexpr uint8_t VALUE_HEADER_SHIFT = 2;

//! Object and Array header ('field_offset_size - 1' lives in the lowest two bits)
static constexpr uint8_t FIELD_OFFSET_SIZE_MINUS_ONE_MASK = 0x03;

//! Object header
static constexpr uint8_t FIELD_ID_SIZE_MINUS_ONE_MASK = 0x03;
static constexpr uint8_t FIELD_ID_SIZE_MINUS_ONE_SHIFT = 2;

static constexpr uint8_t OBJECT_IS_LARGE_MASK = 0x01;
static constexpr uint8_t OBJECT_IS_LARGE_SHIFT = 4;

//! Array header
static constexpr uint8_t ARRAY_IS_LARGE_MASK = 0x01;
static constexpr uint8_t ARRAY_IS_LARGE_SHIFT = 2;
|
||||
|
||||
using namespace duckdb_yyjson;
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
namespace {
|
||||
|
||||
//! Reads an unsigned little-endian integer that is 'length_in_bytes' bytes wide from 'ptr'
//! and advances 'ptr' past the bytes that were consumed.
//! The value is assembled byte by byte, so the result does not depend on the byte order of the host
//! (copying the bytes straight into an idx_t only gives the right value on little-endian machines).
//! Throws NotImplementedException when the requested width does not fit in an idx_t.
static idx_t ReadVariableLengthLittleEndian(idx_t length_in_bytes, const_data_ptr_t &ptr) {
	if (length_in_bytes > sizeof(idx_t)) {
		throw NotImplementedException("Can't read little-endian value of %d bytes", length_in_bytes);
	}
	idx_t result = 0;
	for (idx_t byte_idx = 0; byte_idx < length_in_bytes; byte_idx++) {
		//! Byte 'byte_idx' holds bits [8 * byte_idx, 8 * byte_idx + 7] of the encoded value
		result |= static_cast<idx_t>(ptr[byte_idx]) << (8 * byte_idx);
	}
	ptr += length_in_bytes;
	return result;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
//! Unpacks the first byte of a Variant metadata blob into its version, 'sorted_strings' flag and offset size.
//! Throws NotImplementedException for any version other than 1.
VariantMetadataHeader VariantMetadataHeader::FromHeaderByte(uint8_t byte) {
	//! The header stores 'offset_size - 1', so the real width is one more than the encoded bits
	const auto offset_size_minus_one = (byte >> OFFSET_SIZE_MINUS_ONE_SHIFT) & OFFSET_SIZE_MINUS_ONE_MASK;

	VariantMetadataHeader result;
	result.version = byte & VERSION_MASK;
	result.sorted_strings = (byte >> SORTED_STRINGS_SHIFT) & SORTED_STRINGS_MASK;
	result.offset_size = offset_size_minus_one + 1;

	if (result.version == 1) {
		return result;
	}
	throw NotImplementedException("Only version 1 of the Variant encoding scheme is supported, found version: %d",
	                              result.version);
}
|
||||
|
||||
//! Parses the metadata blob of a Variant value and collects its dictionary strings in 'strings'.
//! The blob consists of a 1-byte header, the dictionary size, 'dictionary_size + 1' offsets and the bytes of
//! the dictionary strings; the dictionary size and the offsets are each 'header.offset_size' bytes wide.
//! The blob originates from the file that is being read, so every read is validated against its size.
//! Throws InvalidInputException when the blob is truncated or its offsets are inconsistent,
//! and NotImplementedException (from the header decoding) for unsupported versions.
VariantMetadata::VariantMetadata(const string_t &metadata) : metadata(metadata) {
	auto metadata_data = metadata.GetData();
	const idx_t metadata_size = metadata.GetSize();
	if (metadata_size < sizeof(uint8_t)) {
		throw InvalidInputException("Variant metadata is empty, expected at least a header byte");
	}

	header = VariantMetadataHeader::FromHeaderByte(metadata_data[0]);

	const idx_t offset_size = header.offset_size;
	//! Number of bytes that have not been consumed yet
	idx_t remaining = metadata_size - sizeof(uint8_t);
	if (remaining < offset_size) {
		throw InvalidInputException("Variant metadata is truncated, the dictionary size is missing");
	}

	const_data_ptr_t ptr = reinterpret_cast<const_data_ptr_t>(metadata_data + sizeof(uint8_t));
	idx_t dictionary_size = ReadVariableLengthLittleEndian(offset_size, ptr);
	remaining -= offset_size;

	//! There is one more offset than there are strings: the last offset marks the end of the last string
	const idx_t offsets_size = (dictionary_size + 1) * offset_size;
	if (remaining < offsets_size) {
		throw InvalidInputException("Variant metadata is truncated, expected %d dictionary offsets",
		                            dictionary_size + 1);
	}
	//! Whatever follows the offsets is available for the dictionary strings
	const idx_t bytes_size = remaining - offsets_size;

	auto offsets = ptr;
	auto bytes = offsets + offsets_size;
	idx_t last_offset = ReadVariableLengthLittleEndian(offset_size, ptr);
	if (last_offset > bytes_size) {
		throw InvalidInputException("Variant metadata dictionary offset (%d) is out of bounds", last_offset);
	}
	for (idx_t i = 0; i < dictionary_size; i++) {
		auto next_offset = ReadVariableLengthLittleEndian(offset_size, ptr);
		//! A decreasing offset would make the string length below wrap around
		if (next_offset < last_offset || next_offset > bytes_size) {
			throw InvalidInputException("Variant metadata dictionary offset (%d) is out of bounds", next_offset);
		}
		strings.emplace_back(reinterpret_cast<const char *>(bytes + last_offset), next_offset - last_offset);
		last_offset = next_offset;
	}
}
|
||||
|
||||
//! Unpacks the header byte of a Variant value: the lowest two bits select the basic type,
//! the remaining six bits are interpreted depending on that basic type.
//! Throws InternalException for a basic type that is not handled here.
VariantValueMetadata VariantValueMetadata::FromHeaderByte(uint8_t byte) {
	const uint8_t value_header = byte >> VALUE_HEADER_SHIFT;

	VariantValueMetadata result;
	result.basic_type = VariantBasicTypeFromByte(byte & BASIC_TYPE_MASK);

	const auto basic_type = result.basic_type;
	if (basic_type == VariantBasicType::PRIMITIVE) {
		//! The type specific header is the id of the primitive type
		result.primitive_type = VariantPrimitiveTypeFromByte(value_header);
	} else if (basic_type == VariantBasicType::SHORT_STRING) {
		//! The type specific header is the length of the string
		result.string_size = value_header;
	} else if (basic_type == VariantBasicType::OBJECT) {
		result.field_offset_size = (value_header & FIELD_OFFSET_SIZE_MINUS_ONE_MASK) + 1;
		result.field_id_size = ((value_header >> FIELD_ID_SIZE_MINUS_ONE_SHIFT) & FIELD_ID_SIZE_MINUS_ONE_MASK) + 1;
		result.is_large = (value_header >> OBJECT_IS_LARGE_SHIFT) & OBJECT_IS_LARGE_MASK;
	} else if (basic_type == VariantBasicType::ARRAY) {
		result.field_offset_size = (value_header & FIELD_OFFSET_SIZE_MINUS_ONE_MASK) + 1;
		result.is_large = (value_header >> ARRAY_IS_LARGE_SHIFT) & ARRAY_IS_LARGE_MASK;
	} else {
		throw InternalException("VariantBasicType (%d) not handled", static_cast<uint8_t>(result.basic_type));
	}
	return result;
}
|
||||
|
||||
template <class T>
|
||||
static T DecodeDecimal(const_data_ptr_t data, uint8_t &scale, uint8_t &width) {
|
||||
scale = Load<uint8_t>(data);
|
||||
data++;
|
||||
|
||||
auto result = Load<T>(data);
|
||||
//! FIXME: The spec says:
|
||||
//! The implied precision of a decimal value is `floor(log_10(val)) + 1`
|
||||
width = DecimalWidth<T>::max;
|
||||
return result;
|
||||
}
|
||||
|
||||
template <>
|
||||
hugeint_t DecodeDecimal(const_data_ptr_t data, uint8_t &scale, uint8_t &width) {
|
||||
scale = Load<uint8_t>(data);
|
||||
data++;
|
||||
|
||||
hugeint_t result;
|
||||
result.lower = Load<uint64_t>(data);
|
||||
result.upper = Load<int64_t>(data + sizeof(uint64_t));
|
||||
//! FIXME: The spec says:
|
||||
//! The implied precision of a decimal value is `floor(log_10(val)) + 1`
|
||||
width = DecimalWidth<hugeint_t>::max;
|
||||
return result;
|
||||
}
|
||||
|
||||
//! Decodes a Variant value whose basic type is PRIMITIVE.
//! 'data' points at the first byte after the value header and 'value_metadata.primitive_type' selects how
//! the bytes are interpreted.
//! Decimals, timestamps without time zone and UUIDs are returned as a Value built from their string
//! representation, BINARY as a Value holding the Base64 text; the other types use the matching Value factory.
//! Throws InternalException when a STRING is not valid UTF8 and NotImplementedException for primitive
//! types that are not handled here.
VariantValue VariantBinaryDecoder::PrimitiveTypeDecode(const VariantValueMetadata &value_metadata,
                                                       const_data_ptr_t data) {
	switch (value_metadata.primitive_type) {
	case VariantPrimitiveType::NULL_TYPE: {
		return VariantValue(Value());
	}
	case VariantPrimitiveType::BOOLEAN_TRUE: {
		return VariantValue(Value::BOOLEAN(true));
	}
	case VariantPrimitiveType::BOOLEAN_FALSE: {
		return VariantValue(Value::BOOLEAN(false));
	}
	case VariantPrimitiveType::INT8: {
		auto value = Load<int8_t>(data);
		return VariantValue(Value::TINYINT(value));
	}
	case VariantPrimitiveType::INT16: {
		auto value = Load<int16_t>(data);
		return VariantValue(Value::SMALLINT(value));
	}
	case VariantPrimitiveType::INT32: {
		auto value = Load<int32_t>(data);
		return VariantValue(Value::INTEGER(value));
	}
	case VariantPrimitiveType::INT64: {
		auto value = Load<int64_t>(data);
		return VariantValue(Value::BIGINT(value));
	}
	case VariantPrimitiveType::DOUBLE: {
		double value = Load<double>(data);
		return VariantValue(Value::DOUBLE(value));
	}
	case VariantPrimitiveType::FLOAT: {
		float value = Load<float>(data);
		return VariantValue(Value::FLOAT(value));
	}
	case VariantPrimitiveType::DECIMAL4: {
		uint8_t scale;
		uint8_t width;

		auto value = DecodeDecimal<int32_t>(data, scale, width);
		auto value_str = Decimal::ToString(value, width, scale);
		return VariantValue(Value(value_str));
	}
	case VariantPrimitiveType::DECIMAL8: {
		uint8_t scale;
		uint8_t width;

		auto value = DecodeDecimal<int64_t>(data, scale, width);
		auto value_str = Decimal::ToString(value, width, scale);
		return VariantValue(Value(value_str));
	}
	case VariantPrimitiveType::DECIMAL16: {
		uint8_t scale;
		uint8_t width;

		auto value = DecodeDecimal<hugeint_t>(data, scale, width);
		auto value_str = Decimal::ToString(value, width, scale);
		return VariantValue(Value(value_str));
	}
	case VariantPrimitiveType::DATE: {
		date_t value;
		value.days = Load<int32_t>(data);
		return VariantValue(Value::DATE(value));
	}
	case VariantPrimitiveType::TIMESTAMP_MICROS: {
		timestamp_tz_t micros_ts_tz;
		micros_ts_tz.value = Load<int64_t>(data);
		return VariantValue(Value::TIMESTAMPTZ(micros_ts_tz));
	}
	case VariantPrimitiveType::TIMESTAMP_NTZ_MICROS: {
		timestamp_t micros_ts;
		micros_ts.value = Load<int64_t>(data);

		//! Returned as the string representation of the timestamp
		auto value = Value::TIMESTAMP(micros_ts);
		auto value_str = value.ToString();
		return VariantValue(Value(value_str));
	}
	case VariantPrimitiveType::BINARY: {
		//! Follow the JSON serialization guide by converting BINARY to Base64:
		//! For example: `"dmFyaWFudAo="`
		//! The payload is a 4-byte length followed by that many bytes
		auto size = Load<uint32_t>(data);
		auto string_data = reinterpret_cast<const char *>(data + sizeof(uint32_t));
		auto base64_string = Blob::ToBase64(string_t(string_data, size));
		return VariantValue(Value(base64_string));
	}
	case VariantPrimitiveType::STRING: {
		//! The payload is a 4-byte length followed by that many bytes
		auto size = Load<uint32_t>(data);
		auto string_data = reinterpret_cast<const char *>(data + sizeof(uint32_t));
		if (!Utf8Proc::IsValid(string_data, size)) {
			//! This is the primitive STRING type, not the SHORT_STRING basic type
			throw InternalException("Can't decode Variant string, string isn't valid UTF8");
		}
		return VariantValue(Value(string(string_data, size)));
	}
	case VariantPrimitiveType::TIME_NTZ_MICROS: {
		dtime_t micros_time;
		micros_time.micros = Load<int64_t>(data);
		return VariantValue(Value::TIME(micros_time));
	}
	case VariantPrimitiveType::TIMESTAMP_NANOS: {
		timestamp_ns_t nanos_ts;
		nanos_ts.value = Load<int64_t>(data);

		//! Convert the nanos timestamp to a micros timestamp (not lossless)
		auto micros_ts = Timestamp::FromEpochNanoSeconds(nanos_ts.value);
		return VariantValue(Value::TIMESTAMPTZ(timestamp_tz_t(micros_ts)));
	}
	case VariantPrimitiveType::TIMESTAMP_NTZ_NANOS: {
		timestamp_ns_t nanos_ts;
		nanos_ts.value = Load<int64_t>(data);

		//! Returned as the string representation of the timestamp
		auto value = Value::TIMESTAMPNS(nanos_ts);
		auto value_str = value.ToString();
		return VariantValue(Value(value_str));
	}
	case VariantPrimitiveType::UUID: {
		auto uuid_value = UUIDValueConversion::ReadParquetUUID(data);
		auto value_str = UUID::ToString(uuid_value);
		return VariantValue(Value(value_str));
	}
	default:
		throw NotImplementedException("Variant PrimitiveTypeDecode not implemented for type (%d)",
		                              static_cast<uint8_t>(value_metadata.primitive_type));
	}
}
|
||||
|
||||
//! Decodes a Variant value whose basic type is SHORT_STRING.
//! The length comes from the value header ('value_metadata.string_size'), 'data' points at the characters.
//! Throws InternalException when the bytes are not valid UTF8.
VariantValue VariantBinaryDecoder::ShortStringDecode(const VariantValueMetadata &value_metadata,
                                                     const_data_ptr_t data) {
	D_ASSERT(value_metadata.string_size < 64);
	auto characters = reinterpret_cast<const char *>(data);
	if (Utf8Proc::IsValid(characters, value_metadata.string_size)) {
		return VariantValue(Value(string(characters, value_metadata.string_size)));
	}
	throw InternalException("Can't decode Variant short-string, string isn't valid UTF8");
}
|
||||
|
||||
//! Decodes a Variant value whose basic type is OBJECT.
//! 'data' points at the first byte after the value header. The payload consists of the number of elements
//! (4 bytes when 'is_large' is set, 1 byte otherwise), one field id per element, 'num_elements + 1' field
//! offsets and finally the encoded child values.
//! Each field id is an index into the dictionary of 'metadata' and provides the key of the child.
//! Throws InvalidInputException when a field id does not refer to an entry of that dictionary.
VariantValue VariantBinaryDecoder::ObjectDecode(const VariantMetadata &metadata,
                                                const VariantValueMetadata &value_metadata, const_data_ptr_t data) {
	VariantValue ret(VariantValueType::OBJECT);

	auto field_offset_size = value_metadata.field_offset_size;
	auto field_id_size = value_metadata.field_id_size;
	auto is_large = value_metadata.is_large;

	idx_t num_elements;
	if (is_large) {
		num_elements = Load<uint32_t>(data);
		data += sizeof(uint32_t);
	} else {
		num_elements = Load<uint8_t>(data);
		data += sizeof(uint8_t);
	}

	auto field_ids = data;
	auto field_offsets = data + (num_elements * field_id_size);
	auto values = field_offsets + ((num_elements + 1) * field_offset_size);

	//! 'last_offset' is always the offset of the child that is decoded next
	idx_t last_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets);
	for (idx_t i = 0; i < num_elements; i++) {
		auto field_id = ReadVariableLengthLittleEndian(field_id_size, field_ids);
		auto next_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets);

		auto value = Decode(metadata, values + last_offset);
		//! The field id comes from the file, make sure it refers to an existing dictionary entry
		if (field_id >= metadata.strings.size()) {
			throw InvalidInputException(
			    "Variant object refers to field id %d, but the metadata dictionary only has %d entries", field_id,
			    metadata.strings.size());
		}
		auto &key = metadata.strings[field_id];

		ret.AddChild(key, std::move(value));
		last_offset = next_offset;
	}
	return ret;
}
|
||||
|
||||
//! Decodes a Variant value whose basic type is ARRAY.
//! 'data' points at the first byte after the value header. The payload consists of the number of elements
//! (4 bytes when 'is_large' is set, 1 byte otherwise), 'num_elements + 1' field offsets and finally the
//! encoded items.
//! The element count is kept in an idx_t (like ObjectDecode does) so that computing the size of the offset
//! list cannot wrap around in 32-bit arithmetic for very large counts.
VariantValue VariantBinaryDecoder::ArrayDecode(const VariantMetadata &metadata,
                                               const VariantValueMetadata &value_metadata, const_data_ptr_t data) {
	VariantValue ret(VariantValueType::ARRAY);

	auto field_offset_size = value_metadata.field_offset_size;
	auto is_large = value_metadata.is_large;

	idx_t num_elements;
	if (is_large) {
		num_elements = Load<uint32_t>(data);
		data += sizeof(uint32_t);
	} else {
		num_elements = Load<uint8_t>(data);
		data += sizeof(uint8_t);
	}

	auto field_offsets = data;
	auto values = field_offsets + ((num_elements + 1) * field_offset_size);

	//! 'last_offset' is always the offset of the item that is decoded next
	idx_t last_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets);
	for (idx_t i = 0; i < num_elements; i++) {
		auto next_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets);

		ret.AddItem(Decode(metadata, values + last_offset));
		last_offset = next_offset;
	}
	return ret;
}
|
||||
|
||||
//! Decodes the Variant value that starts at 'data': the first byte is the value header, the bytes after it
//! are handed to the decoder that belongs to the basic type found in that header.
//! Throws InternalException for an unexpected basic type.
VariantValue VariantBinaryDecoder::Decode(const VariantMetadata &variant_metadata, const_data_ptr_t data) {
	auto header = VariantValueMetadata::FromHeaderByte(*data);
	//! Everything after the header byte belongs to the value itself
	const_data_ptr_t payload = data + 1;

	switch (header.basic_type) {
	case VariantBasicType::PRIMITIVE:
		return PrimitiveTypeDecode(header, payload);
	case VariantBasicType::SHORT_STRING:
		return ShortStringDecode(header, payload);
	case VariantBasicType::OBJECT:
		return ObjectDecode(variant_metadata, header, payload);
	case VariantBasicType::ARRAY:
		return ArrayDecode(variant_metadata, header, payload);
	default:
		throw InternalException("Unexpected value for VariantBasicType");
	}
}
|
||||
|
||||
} // namespace duckdb
|
||||
577
external/duckdb/extension/parquet/reader/variant/variant_shredded_conversion.cpp
vendored
Normal file
577
external/duckdb/extension/parquet/reader/variant/variant_shredded_conversion.cpp
vendored
Normal file
@@ -0,0 +1,577 @@
|
||||
#include "reader/variant/variant_shredded_conversion.hpp"
|
||||
#include "column_reader.hpp"
|
||||
#include "utf8proc_wrapper.hpp"
|
||||
|
||||
#include "duckdb/common/types/timestamp.hpp"
|
||||
#include "duckdb/common/types/decimal.hpp"
|
||||
#include "duckdb/common/types/uuid.hpp"
|
||||
#include "duckdb/common/types/time.hpp"
|
||||
#include "duckdb/common/types/date.hpp"
|
||||
#include "duckdb/common/types/blob.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
template <class T>
|
||||
struct ConvertShreddedValue {
|
||||
static VariantValue Convert(T val);
|
||||
static VariantValue ConvertDecimal(T val, uint8_t width, uint8_t scale) {
|
||||
throw InternalException("ConvertShreddedValue::ConvertDecimal not implemented for type");
|
||||
}
|
||||
static VariantValue ConvertBlob(T val) {
|
||||
throw InternalException("ConvertShreddedValue::ConvertBlob not implemented for type");
|
||||
}
|
||||
};
|
||||
|
||||
//! boolean
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<bool>::Convert(bool val) {
|
||||
return VariantValue(Value::BOOLEAN(val));
|
||||
}
|
||||
//! int8
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<int8_t>::Convert(int8_t val) {
|
||||
return VariantValue(Value::TINYINT(val));
|
||||
}
|
||||
//! int16
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<int16_t>::Convert(int16_t val) {
|
||||
return VariantValue(Value::SMALLINT(val));
|
||||
}
|
||||
//! int32
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<int32_t>::Convert(int32_t val) {
|
||||
return VariantValue(Value::INTEGER(val));
|
||||
}
|
||||
//! int64
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<int64_t>::Convert(int64_t val) {
|
||||
return VariantValue(Value::BIGINT(val));
|
||||
}
|
||||
//! float
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<float>::Convert(float val) {
|
||||
return VariantValue(Value::FLOAT(val));
|
||||
}
|
||||
//! double
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<double>::Convert(double val) {
|
||||
return VariantValue(Value::DOUBLE(val));
|
||||
}
|
||||
//! decimal4/decimal8/decimal16
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<int32_t>::ConvertDecimal(int32_t val, uint8_t width, uint8_t scale) {
|
||||
auto value_str = Decimal::ToString(val, width, scale);
|
||||
return VariantValue(Value(value_str));
|
||||
}
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<int64_t>::ConvertDecimal(int64_t val, uint8_t width, uint8_t scale) {
|
||||
auto value_str = Decimal::ToString(val, width, scale);
|
||||
return VariantValue(Value(value_str));
|
||||
}
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<hugeint_t>::ConvertDecimal(hugeint_t val, uint8_t width, uint8_t scale) {
|
||||
auto value_str = Decimal::ToString(val, width, scale);
|
||||
return VariantValue(Value(value_str));
|
||||
}
|
||||
//! date
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<date_t>::Convert(date_t val) {
|
||||
return VariantValue(Value::DATE(val));
|
||||
}
|
||||
//! time
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<dtime_t>::Convert(dtime_t val) {
|
||||
return VariantValue(Value::TIME(val));
|
||||
}
|
||||
//! timestamptz(6)
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<timestamp_tz_t>::Convert(timestamp_tz_t val) {
|
||||
return VariantValue(Value::TIMESTAMPTZ(val));
|
||||
}
|
||||
////! timestamptz(9)
|
||||
// template <>
|
||||
// VariantValue ConvertShreddedValue<timestamp_ns_tz_t>::Convert(timestamp_ns_tz_t val) {
|
||||
// return VariantValue(Value::TIMESTAMPNS_TZ(val));
|
||||
//}
|
||||
//! timestampntz(6)
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<timestamp_t>::Convert(timestamp_t val) {
|
||||
return VariantValue(Value::TIMESTAMP(val));
|
||||
}
|
||||
//! timestampntz(9)
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<timestamp_ns_t>::Convert(timestamp_ns_t val) {
|
||||
return VariantValue(Value::TIMESTAMPNS(val));
|
||||
}
|
||||
//! binary
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<string_t>::ConvertBlob(string_t val) {
|
||||
return VariantValue(Value(Blob::ToBase64(val)));
|
||||
}
|
||||
//! string
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<string_t>::Convert(string_t val) {
|
||||
if (!Utf8Proc::IsValid(val.GetData(), val.GetSize())) {
|
||||
throw InternalException("Can't decode Variant string, it isn't valid UTF8");
|
||||
}
|
||||
return VariantValue(Value(val.GetString()));
|
||||
}
|
||||
//! uuid
|
||||
template <>
|
||||
VariantValue ConvertShreddedValue<hugeint_t>::Convert(hugeint_t val) {
|
||||
return VariantValue(Value(UUID::ToString(val)));
|
||||
}
|
||||
|
||||
template <class T, class OP, LogicalTypeId TYPE_ID>
|
||||
vector<VariantValue> ConvertTypedValues(Vector &vec, Vector &metadata, Vector &blob, idx_t offset, idx_t length,
|
||||
idx_t total_size, const bool is_field) {
|
||||
UnifiedVectorFormat metadata_format;
|
||||
metadata.ToUnifiedFormat(length, metadata_format);
|
||||
auto metadata_data = metadata_format.GetData<string_t>(metadata_format);
|
||||
|
||||
UnifiedVectorFormat typed_format;
|
||||
vec.ToUnifiedFormat(total_size, typed_format);
|
||||
auto data = typed_format.GetData<T>(typed_format);
|
||||
|
||||
UnifiedVectorFormat value_format;
|
||||
blob.ToUnifiedFormat(total_size, value_format);
|
||||
auto value_data = value_format.GetData<string_t>(value_format);
|
||||
|
||||
auto &validity = typed_format.validity;
|
||||
auto &value_validity = value_format.validity;
|
||||
auto &type = vec.GetType();
|
||||
|
||||
//! Values only used for Decimal conversion
|
||||
uint8_t width;
|
||||
uint8_t scale;
|
||||
if (TYPE_ID == LogicalTypeId::DECIMAL) {
|
||||
type.GetDecimalProperties(width, scale);
|
||||
}
|
||||
|
||||
vector<VariantValue> ret(length);
|
||||
if (validity.AllValid()) {
|
||||
for (idx_t i = 0; i < length; i++) {
|
||||
auto index = typed_format.sel->get_index(i + offset);
|
||||
if (TYPE_ID == LogicalTypeId::DECIMAL) {
|
||||
ret[i] = OP::ConvertDecimal(data[index], width, scale);
|
||||
} else if (TYPE_ID == LogicalTypeId::BLOB) {
|
||||
ret[i] = OP::ConvertBlob(data[index]);
|
||||
} else {
|
||||
ret[i] = OP::Convert(data[index]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (idx_t i = 0; i < length; i++) {
|
||||
auto typed_index = typed_format.sel->get_index(i + offset);
|
||||
auto value_index = value_format.sel->get_index(i + offset);
|
||||
if (validity.RowIsValid(typed_index)) {
|
||||
//! This is a leaf, partially shredded values aren't possible here
|
||||
D_ASSERT(!value_validity.RowIsValid(value_index));
|
||||
if (TYPE_ID == LogicalTypeId::DECIMAL) {
|
||||
ret[i] = OP::ConvertDecimal(data[typed_index], width, scale);
|
||||
} else if (TYPE_ID == LogicalTypeId::BLOB) {
|
||||
ret[i] = OP::ConvertBlob(data[typed_index]);
|
||||
} else {
|
||||
ret[i] = OP::Convert(data[typed_index]);
|
||||
}
|
||||
} else {
|
||||
if (is_field && !value_validity.RowIsValid(value_index)) {
|
||||
//! Value is missing for this field
|
||||
continue;
|
||||
}
|
||||
D_ASSERT(value_validity.RowIsValid(value_index));
|
||||
auto metadata_value = metadata_data[metadata_format.sel->get_index(i)];
|
||||
VariantMetadata variant_metadata(metadata_value);
|
||||
ret[i] = VariantBinaryDecoder::Decode(variant_metadata,
|
||||
const_data_ptr_cast(value_data[value_index].GetData()));
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
//! Converts 'length' rows of a shredded leaf (a 'typed_value' column that is not nested) into VariantValues.
//! The logical type of 'typed_value' selects the ConvertTypedValues instantiation that does the work; all
//! other arguments are forwarded unchanged. Decimals are dispatched further on their physical type.
//! Throws NotImplementedException for types (or decimal physical types) that are not handled here.
vector<VariantValue> VariantShreddedConversion::ConvertShreddedLeaf(Vector &metadata, Vector &value,
                                                                    Vector &typed_value, idx_t offset, idx_t length,
                                                                    idx_t total_size, const bool is_field) {
	D_ASSERT(!typed_value.GetType().IsNested());

	auto &type = typed_value.GetType();
	switch (type.id()) {
	//! boolean
	case LogicalTypeId::BOOLEAN: {
		return ConvertTypedValues<bool, ConvertShreddedValue<bool>, LogicalTypeId::BOOLEAN>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! int8
	case LogicalTypeId::TINYINT: {
		return ConvertTypedValues<int8_t, ConvertShreddedValue<int8_t>, LogicalTypeId::TINYINT>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! int16
	case LogicalTypeId::SMALLINT: {
		return ConvertTypedValues<int16_t, ConvertShreddedValue<int16_t>, LogicalTypeId::SMALLINT>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! int32
	case LogicalTypeId::INTEGER: {
		return ConvertTypedValues<int32_t, ConvertShreddedValue<int32_t>, LogicalTypeId::INTEGER>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! int64
	case LogicalTypeId::BIGINT: {
		return ConvertTypedValues<int64_t, ConvertShreddedValue<int64_t>, LogicalTypeId::BIGINT>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! float
	case LogicalTypeId::FLOAT: {
		return ConvertTypedValues<float, ConvertShreddedValue<float>, LogicalTypeId::FLOAT>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! double
	case LogicalTypeId::DOUBLE: {
		return ConvertTypedValues<double, ConvertShreddedValue<double>, LogicalTypeId::DOUBLE>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! decimal4/decimal8/decimal16
	case LogicalTypeId::DECIMAL: {
		//! The physical type determines which integer type backs the decimal
		auto physical_type = type.InternalType();
		switch (physical_type) {
		case PhysicalType::INT32: {
			return ConvertTypedValues<int32_t, ConvertShreddedValue<int32_t>, LogicalTypeId::DECIMAL>(
			    typed_value, metadata, value, offset, length, total_size, is_field);
		}
		case PhysicalType::INT64: {
			return ConvertTypedValues<int64_t, ConvertShreddedValue<int64_t>, LogicalTypeId::DECIMAL>(
			    typed_value, metadata, value, offset, length, total_size, is_field);
		}
		case PhysicalType::INT128: {
			return ConvertTypedValues<hugeint_t, ConvertShreddedValue<hugeint_t>, LogicalTypeId::DECIMAL>(
			    typed_value, metadata, value, offset, length, total_size, is_field);
		}
		default:
			throw NotImplementedException("Decimal with PhysicalType (%s) not implemented for shredded Variant",
			                              EnumUtil::ToString(physical_type));
		}
	}
	//! date
	case LogicalTypeId::DATE: {
		return ConvertTypedValues<date_t, ConvertShreddedValue<date_t>, LogicalTypeId::DATE>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! time
	case LogicalTypeId::TIME: {
		return ConvertTypedValues<dtime_t, ConvertShreddedValue<dtime_t>, LogicalTypeId::TIME>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! timestamptz(6) (timestamptz(9) not implemented in DuckDB)
	case LogicalTypeId::TIMESTAMP_TZ: {
		return ConvertTypedValues<timestamp_tz_t, ConvertShreddedValue<timestamp_tz_t>, LogicalTypeId::TIMESTAMP_TZ>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! timestampntz(6)
	case LogicalTypeId::TIMESTAMP: {
		return ConvertTypedValues<timestamp_t, ConvertShreddedValue<timestamp_t>, LogicalTypeId::TIMESTAMP>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! timestampntz(9)
	case LogicalTypeId::TIMESTAMP_NS: {
		return ConvertTypedValues<timestamp_ns_t, ConvertShreddedValue<timestamp_ns_t>, LogicalTypeId::TIMESTAMP_NS>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! binary
	case LogicalTypeId::BLOB: {
		return ConvertTypedValues<string_t, ConvertShreddedValue<string_t>, LogicalTypeId::BLOB>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! string
	case LogicalTypeId::VARCHAR: {
		return ConvertTypedValues<string_t, ConvertShreddedValue<string_t>, LogicalTypeId::VARCHAR>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	//! uuid
	case LogicalTypeId::UUID: {
		return ConvertTypedValues<hugeint_t, ConvertShreddedValue<hugeint_t>, LogicalTypeId::UUID>(
		    typed_value, metadata, value, offset, length, total_size, is_field);
	}
	default:
		throw NotImplementedException("Variant shredding on type: '%s' is not implemented", type.ToString());
	}
}
|
||||
|
||||
namespace {
|
||||
|
||||
struct ShreddedVariantField {
|
||||
public:
|
||||
explicit ShreddedVariantField(const string &field_name) : field_name(field_name) {
|
||||
}
|
||||
|
||||
public:
|
||||
string field_name;
|
||||
//! Values for the field, for all rows
|
||||
vector<VariantValue> values;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
template <bool IS_REQUIRED>
|
||||
static vector<VariantValue> ConvertBinaryEncoding(Vector &metadata, Vector &value, idx_t offset, idx_t length,
|
||||
idx_t total_size) {
|
||||
UnifiedVectorFormat value_format;
|
||||
value.ToUnifiedFormat(total_size, value_format);
|
||||
auto value_data = value_format.GetData<string_t>(value_format);
|
||||
auto &validity = value_format.validity;
|
||||
|
||||
UnifiedVectorFormat metadata_format;
|
||||
metadata.ToUnifiedFormat(length, metadata_format);
|
||||
auto metadata_data = metadata_format.GetData<string_t>(metadata_format);
|
||||
auto metadata_validity = metadata_format.validity;
|
||||
|
||||
vector<VariantValue> ret(length);
|
||||
if (IS_REQUIRED) {
|
||||
for (idx_t i = 0; i < length; i++) {
|
||||
auto index = value_format.sel->get_index(i + offset);
|
||||
|
||||
// Variant itself is NULL
|
||||
if (!validity.RowIsValid(index) && !metadata_validity.RowIsValid(metadata_format.sel->get_index(i))) {
|
||||
ret[i] = VariantValue(Value());
|
||||
continue;
|
||||
}
|
||||
|
||||
D_ASSERT(validity.RowIsValid(index));
|
||||
auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)];
|
||||
VariantMetadata variant_metadata(metadata_value);
|
||||
auto binary_value = value_data[index].GetData();
|
||||
ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value));
|
||||
}
|
||||
} else {
|
||||
//! Even though 'typed_value' is not present, 'value' is allowed to contain NULLs because we're scanning an
|
||||
//! Object's shredded field.
|
||||
//! When 'value' is null for a row, that means the Object does not contain this field
|
||||
//! for that row.
|
||||
for (idx_t i = 0; i < length; i++) {
|
||||
auto index = value_format.sel->get_index(i + offset);
|
||||
if (validity.RowIsValid(index)) {
|
||||
auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)];
|
||||
VariantMetadata variant_metadata(metadata_value);
|
||||
auto binary_value = value_data[index].GetData();
|
||||
ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value));
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
//! Builds the Object for row 'i' out of its shredded fields and, when present, its binary encoded remainder.
//! The values of the shredded fields are moved out of 'shredded_fields'; fields that are missing for this
//! row are skipped. 'value_format' is read at 'i + offset', 'metadata_format' at 'i'.
//! Throws InvalidInputException when the binary encoded remainder does not decode to an Object.
static VariantValue ConvertPartiallyShreddedObject(vector<ShreddedVariantField> &shredded_fields,
                                                   const UnifiedVectorFormat &metadata_format,
                                                   const UnifiedVectorFormat &value_format, idx_t i, idx_t offset) {
	VariantValue ret(VariantValueType::OBJECT);

	for (auto &shredded_field : shredded_fields) {
		auto &field_value = shredded_field.values[i];
		//! Fields that are missing from the value are skipped
		if (!field_value.IsMissing()) {
			ret.AddChild(shredded_field.field_name, std::move(field_value));
		}
	}

	auto index = value_format.sel->get_index(i + offset);
	if (!value_format.validity.RowIsValid(index)) {
		//! There is no binary encoded remainder: the shredded fields are the whole Object
		return ret;
	}

	//! Object is partially shredded, decode the object and merge the values
	auto value_data = value_format.GetData<string_t>(value_format);
	auto metadata_data = metadata_format.GetData<string_t>(metadata_format);
	auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)];
	VariantMetadata variant_metadata(metadata_value);
	auto unshredded =
	    VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(value_data[index].GetData()));
	if (unshredded.value_type != VariantValueType::OBJECT) {
		throw InvalidInputException("Partially shredded objects have to encode Object Variants in the 'value'");
	}
	for (auto &item : unshredded.object_children) {
		ret.AddChild(item.first, std::move(item.second));
	}
	return ret;
}
|
||||
|
||||
//! Converts 'length' rows (starting at 'offset') of a variant group whose 'typed_value' is a STRUCT.
//! Each STRUCT child is itself converted through Convert() with is_field = true.
//! Per row:
//! - 'typed_value' is non-NULL: the row is an (optionally partially) shredded object.
//! - 'typed_value' is NULL: the row is decoded from the binary 'value'; decoding to an Object is rejected
//!   with an InvalidInputException. When 'is_field' is set and 'value' is NULL too, the row keeps its
//!   default-constructed VariantValue.
//! 'value' and 'typed_value' are unified over 'total_size' rows and addressed with 'i + offset';
//! 'metadata' is unified over 'length' rows and addressed with 'i'.
vector<VariantValue> VariantShreddedConversion::ConvertShreddedObject(Vector &metadata, Vector &value,
                                                                      Vector &typed_value, idx_t offset, idx_t length,
                                                                      idx_t total_size, const bool is_field) {
	auto &object_type = typed_value.GetType();
	D_ASSERT(object_type.id() == LogicalTypeId::STRUCT);
	auto &child_types = StructType::GetChildTypes(object_type);
	auto &child_vectors = StructVector::GetEntries(typed_value);
	D_ASSERT(child_vectors.size() == child_types.size());

	//! 'value'
	UnifiedVectorFormat value_format;
	value.ToUnifiedFormat(total_size, value_format);
	const auto value_data = UnifiedVectorFormat::GetData<string_t>(value_format);
	auto &value_validity = value_format.validity;

	//! 'metadata'
	UnifiedVectorFormat metadata_format;
	metadata.ToUnifiedFormat(length, metadata_format);
	const auto metadata_data = UnifiedVectorFormat::GetData<string_t>(metadata_format);

	//! 'typed_value'
	UnifiedVectorFormat typed_format;
	typed_value.ToUnifiedFormat(total_size, typed_format);
	auto &typed_validity = typed_format.validity;

	//! Convert every shredded field up front, so each one holds one VariantValue per row
	vector<ShreddedVariantField> shredded_fields;
	shredded_fields.reserve(child_types.size());
	for (idx_t field_idx = 0; field_idx < child_types.size(); field_idx++) {
		shredded_fields.emplace_back(child_types[field_idx].first);
		shredded_fields.back().values = Convert(metadata, *child_vectors[field_idx], offset, length, total_size, true);
	}

	vector<VariantValue> result(length);
	//! When no 'typed_value' row is NULL, the per-row validity lookup can be skipped entirely
	const bool every_row_is_object = typed_validity.AllValid();
	for (idx_t row = 0; row < length; row++) {
		if (every_row_is_object || typed_validity.RowIsValid(typed_format.sel->get_index(row + offset))) {
			result[row] = ConvertPartiallyShreddedObject(shredded_fields, metadata_format, value_format, row, offset);
			continue;
		}

		//! For this row, the value is not an object
		const auto value_index = value_format.sel->get_index(row + offset);
		if (is_field && !value_validity.RowIsValid(value_index)) {
			//! This object is a field in the parent object, the value is missing, skip it
			continue;
		}
		D_ASSERT(value_validity.RowIsValid(value_index));

		auto &row_metadata = metadata_data[metadata_format.sel->get_index(row)];
		VariantMetadata variant_metadata(row_metadata);
		result[row] =
		    VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(value_data[value_index].GetData()));
		if (result[row].value_type == VariantValueType::OBJECT) {
			throw InvalidInputException(
			    "When 'typed_value' for a shredded Object is NULL, 'value' can not contain an Object value");
		}
	}
	return result;
}
|
||||
|
||||
//! Converts 'length' rows (starting at 'offset') of a variant group whose 'typed_value' is a LIST.
//! Per row:
//! - 'typed_value' is non-NULL: the list entry's elements are converted recursively through Convert(),
//!   using a single-value metadata vector built from this row's metadata, and stored as an ARRAY.
//! - 'typed_value' is NULL: the row is decoded from the binary 'value'. When 'is_field' is set and
//!   'value' is NULL too, the row keeps its default-constructed VariantValue.
//! 'value' and 'typed_value' are unified over 'total_size' rows and addressed with 'i + offset';
//! 'metadata' is unified over 'length' rows and addressed with 'i'.
vector<VariantValue> VariantShreddedConversion::ConvertShreddedArray(Vector &metadata, Vector &value,
                                                                     Vector &typed_value, idx_t offset, idx_t length,
                                                                     idx_t total_size, const bool is_field) {
	auto &element_vector = ListVector::GetEntry(typed_value);
	const auto element_count = ListVector::GetListSize(typed_value);

	//! 'value'
	UnifiedVectorFormat value_format;
	value.ToUnifiedFormat(total_size, value_format);
	const auto value_data = UnifiedVectorFormat::GetData<string_t>(value_format);
	auto &value_validity = value_format.validity;

	//! 'metadata'
	UnifiedVectorFormat metadata_format;
	metadata.ToUnifiedFormat(length, metadata_format);
	const auto metadata_data = UnifiedVectorFormat::GetData<string_t>(metadata_format);

	//! 'typed_value'
	UnifiedVectorFormat list_format;
	typed_value.ToUnifiedFormat(total_size, list_format);
	const auto list_data = UnifiedVectorFormat::GetData<list_entry_t>(list_format);
	auto &list_validity = list_format.validity;

	vector<VariantValue> result(length);
	//! When no list row is NULL, we can be sure that none of the values are binary encoded
	const bool every_row_is_array = list_validity.AllValid();
	for (idx_t row = 0; row < length; row++) {
		const auto list_index = list_format.sel->get_index(row + offset);
		if (every_row_is_array || list_validity.RowIsValid(list_index)) {
			const auto entry = list_data[list_index];
			//! All elements of this list share the metadata of the row they belong to
			Vector child_metadata(metadata.GetValue(row));
			result[row] = VariantValue(VariantValueType::ARRAY);
			result[row].array_items =
			    Convert(child_metadata, element_vector, entry.offset, entry.length, element_count, false);
			continue;
		}

		const auto value_index = value_format.sel->get_index(row + offset);
		if (is_field && !value_validity.RowIsValid(value_index)) {
			//! Value is missing for this field
			continue;
		}
		D_ASSERT(value_validity.RowIsValid(value_index));

		const auto row_metadata = metadata_data[metadata_format.sel->get_index(row)];
		VariantMetadata variant_metadata(row_metadata);
		result[row] =
		    VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(value_data[value_index].GetData()));
	}
	return result;
}
|
||||
|
||||
//! Entry point for converting one variant group (a STRUCT holding 'value' and optionally 'typed_value')
//! into one VariantValue per row, for 'length' rows starting at 'offset'.
//! The group's children are looked up by name; any child other than 'value'/'typed_value', or a missing
//! 'value' child, raises an InvalidInputException.
//! Dispatch:
//! - no 'typed_value'      -> ConvertBinaryEncoding (template argument is the negation of 'is_field')
//! - 'typed_value' STRUCT  -> ConvertShreddedObject
//! - 'typed_value' LIST    -> ConvertShreddedArray
//! - anything else         -> ConvertShreddedLeaf
vector<VariantValue> VariantShreddedConversion::Convert(Vector &metadata, Vector &group, idx_t offset, idx_t length,
                                                        idx_t total_size, bool is_field) {
	D_ASSERT(group.GetType().id() == LogicalTypeId::STRUCT);

	auto &group_entries = StructVector::GetEntries(group);
	auto &group_type_children = StructType::GetChildTypes(group.GetType());
	D_ASSERT(group_type_children.size() == group_entries.size());

	//! From the spec:
	//! The Parquet columns used to store variant metadata and values must be accessed by name, not by position.
	optional_ptr<Vector> value;
	optional_ptr<Vector> typed_value;
	for (idx_t i = 0; i < group_entries.size(); i++) {
		auto &name = group_type_children[i].first;
		auto &vec = group_entries[i];
		if (name == "value") {
			value = vec.get();
		} else if (name == "typed_value") {
			typed_value = vec.get();
		} else {
			throw InvalidInputException("Variant group can only contain 'value'/'typed_value', not: %s", name);
		}
	}
	if (!value) {
		throw InvalidInputException("Required column 'value' not found in Variant group");
	}

	if (!typed_value) {
		//! The template argument has to be a compile-time constant, hence the two separate calls
		if (is_field) {
			return ConvertBinaryEncoding<false>(metadata, *value, offset, length, total_size);
		}
		//! Only 'value' is present, we can assume this to be 'required', so it can't contain NULLs
		return ConvertBinaryEncoding<true>(metadata, *value, offset, length, total_size);
	}

	switch (typed_value->GetType().id()) {
	case LogicalTypeId::STRUCT:
		return ConvertShreddedObject(metadata, *value, *typed_value, offset, length, total_size, is_field);
	case LogicalTypeId::LIST:
		return ConvertShreddedArray(metadata, *value, *typed_value, offset, length, total_size, is_field);
	default:
		return ConvertShreddedLeaf(metadata, *value, *typed_value, offset, length, total_size, is_field);
	}
}
|
||||
|
||||
} // namespace duckdb
|
||||
85
external/duckdb/extension/parquet/reader/variant/variant_value.cpp
vendored
Normal file
85
external/duckdb/extension/parquet/reader/variant/variant_value.cpp
vendored
Normal file
@@ -0,0 +1,85 @@
|
||||
#include "reader/variant/variant_value.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
//! Adds the entry (key, val) to the children of this OBJECT value; 'val' is moved from.
//! Only valid when this value is of type OBJECT (asserted).
void VariantValue::AddChild(const string &key, VariantValue &&val) {
	D_ASSERT(value_type == VariantValueType::OBJECT);
	object_children.insert(std::make_pair(key, std::move(val)));
}
|
||||
|
||||
//! Appends 'val' (moved from) as the last item of this ARRAY value.
//! Only valid when this value is of type ARRAY (asserted).
void VariantValue::AddItem(VariantValue &&val) {
	D_ASSERT(value_type == VariantValueType::ARRAY);
	array_items.emplace_back(std::move(val));
}
|
||||
|
||||
//! Recursively converts this VariantValue into a yyjson mutable value created in 'doc'.
//! - PRIMITIVE: NULL becomes JSON null, BOOLEAN becomes true/false, integer types become JSON integers,
//!   FLOAT/DOUBLE become JSON reals, and the date/time/string types become JSON strings.
//!   Any other primitive type raises an InternalException.
//! - OBJECT: a JSON object with one member per child.
//! - ARRAY: a JSON array with one element per item.
//! Any other value type raises an InternalException.
yyjson_mut_val *VariantValue::ToJSON(ClientContext &context, yyjson_mut_doc *doc) const {
	switch (value_type) {
	case VariantValueType::PRIMITIVE: {
		if (primitive_value.IsNull()) {
			return yyjson_mut_null(doc);
		}
		switch (primitive_value.type().id()) {
		case LogicalTypeId::BOOLEAN:
			return primitive_value.GetValue<bool>() ? yyjson_mut_true(doc) : yyjson_mut_false(doc);
		case LogicalTypeId::TINYINT:
			return yyjson_mut_int(doc, primitive_value.GetValue<int8_t>());
		case LogicalTypeId::SMALLINT:
			return yyjson_mut_int(doc, primitive_value.GetValue<int16_t>());
		case LogicalTypeId::INTEGER:
			return yyjson_mut_int(doc, primitive_value.GetValue<int32_t>());
		case LogicalTypeId::BIGINT:
			return yyjson_mut_int(doc, primitive_value.GetValue<int64_t>());
		case LogicalTypeId::FLOAT:
			return yyjson_mut_real(doc, primitive_value.GetValue<float>());
		case LogicalTypeId::DOUBLE:
			return yyjson_mut_real(doc, primitive_value.GetValue<double>());
		case LogicalTypeId::DATE:
		case LogicalTypeId::TIME:
		case LogicalTypeId::VARCHAR:
		case LogicalTypeId::TIMESTAMP: {
			//! These types are rendered through Value::ToString and copied into the document
			const auto text = primitive_value.ToString();
			return yyjson_mut_strncpy(doc, text.c_str(), text.size());
		}
		case LogicalTypeId::TIMESTAMP_TZ:
		case LogicalTypeId::TIMESTAMP_NS: {
			//! These types go through a VARCHAR cast that takes the client context
			const auto text = primitive_value.CastAs(context, LogicalType::VARCHAR).GetValue<string>();
			return yyjson_mut_strncpy(doc, text.c_str(), text.size());
		}
		default:
			throw InternalException("Unexpected primitive type: %s", primitive_value.type().ToString());
		}
	}
	case VariantValueType::OBJECT: {
		auto json_object = yyjson_mut_obj(doc);
		for (const auto &child : object_children) {
			auto json_child = child.second.ToJSON(context, doc);
			//! NOTE(review): the key is handed over as a raw pointer into 'object_children'; yyjson
			//! presumably does not copy it, so this VariantValue should outlive 'doc' - confirm
			yyjson_mut_obj_add_val(doc, json_object, child.first.c_str(), json_child);
		}
		return json_object;
	}
	case VariantValueType::ARRAY: {
		auto json_array = yyjson_mut_arr(doc);
		for (const auto &element : array_items) {
			yyjson_mut_arr_add_val(json_array, element.ToJSON(context, doc));
		}
		return json_array;
	}
	default:
		throw InternalException("Can't serialize this VariantValue type to JSON");
	}
}
|
||||
|
||||
} // namespace duckdb
|
||||
Reference in New Issue
Block a user