should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion

View File

@@ -0,0 +1,70 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/boolean_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
struct BooleanParquetValueConversion;
//! Reads Parquet BOOLEAN columns. Plain-encoded booleans are bit-packed (8 values per byte),
//! so this reader tracks which bit of the current byte is consumed next.
class BooleanColumnReader : public TemplatedColumnReader<bool, BooleanParquetValueConversion> {
public:
static constexpr const PhysicalType TYPE = PhysicalType::BOOL;
public:
BooleanColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
: TemplatedColumnReader<bool, BooleanParquetValueConversion>(reader, schema), byte_pos(0) {
}
//! Bit offset (0-7) of the next boolean within the current byte of plain data;
//! mutated by BooleanParquetValueConversion::PlainRead
uint8_t byte_pos;
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override {
// restart bit-unpacking at a byte boundary for the new row group
byte_pos = 0;
TemplatedColumnReader<bool, BooleanParquetValueConversion>::InitializeRead(row_group_idx_p, columns,
protocol_p);
}
void ResetPage() override {
// a new page starts on a fresh byte
byte_pos = 0;
}
};
//! Value conversion for bit-packed booleans: consumes one bit per value, advancing
//! the underlying buffer only when a full byte has been used up.
struct BooleanParquetValueConversion {
	template <bool CHECKED>
	static bool PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
		auto &bit_offset = reader.Cast<BooleanColumnReader>().byte_pos;
		const bool value = ((*plain_data.ptr >> bit_offset) & 1) != 0;
		bit_offset++;
		if (bit_offset == 8) {
			// current byte fully consumed - advance to the next one
			bit_offset = 0;
			if (CHECKED) {
				plain_data.inc(1);
			} else {
				plain_data.unsafe_inc(1);
			}
		}
		return value;
	}
	template <bool CHECKED>
	static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
		// skipping must still advance the shared bit position, so just read and discard
		PlainRead<CHECKED>(plain_data, reader);
	}
	static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
		// count booleans occupy ceil(count / 8) bytes
		return plain_data.check_available((count + 7) / 8);
	}
	static idx_t PlainConstantSize() {
		return 0;
	}
};
} // namespace duckdb

View File

@@ -0,0 +1,46 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/callback_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
#include "parquet_reader.hpp"
namespace duckdb {
//! Column reader that reads PARQUET_PHYSICAL_TYPE values and converts each one to
//! DUCKDB_PHYSICAL_TYPE through the compile-time callback FUNC.
template <class PARQUET_PHYSICAL_TYPE, class DUCKDB_PHYSICAL_TYPE,
DUCKDB_PHYSICAL_TYPE (*FUNC)(const PARQUET_PHYSICAL_TYPE &input)>
class CallbackColumnReader
: public TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
CallbackParquetValueConversion<PARQUET_PHYSICAL_TYPE, DUCKDB_PHYSICAL_TYPE, FUNC>> {
using BaseType =
TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
CallbackParquetValueConversion<PARQUET_PHYSICAL_TYPE, DUCKDB_PHYSICAL_TYPE, FUNC>>;
public:
static constexpr const PhysicalType TYPE = PhysicalType::INVALID;
public:
CallbackColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
: TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
CallbackParquetValueConversion<PARQUET_PHYSICAL_TYPE, DUCKDB_PHYSICAL_TYPE, FUNC>>(
reader, schema) {
}
protected:
//! Materialize the dictionary page: every raw entry is converted through FUNC up front,
//! so dictionary-encoded reads can copy converted values directly.
void Dictionary(shared_ptr<ResizeableBuffer> dictionary_data, idx_t num_entries) {
BaseType::AllocateDict(num_entries * sizeof(DUCKDB_PHYSICAL_TYPE));
auto dict_ptr = (DUCKDB_PHYSICAL_TYPE *)this->dict->ptr;
for (idx_t i = 0; i < num_entries; i++) {
dict_ptr[i] = FUNC(dictionary_data->read<PARQUET_PHYSICAL_TYPE>());
}
}
};
} // namespace duckdb

View File

@@ -0,0 +1,65 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/decimal_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
#include "parquet_reader.hpp"
#include "parquet_decimal_utils.hpp"
namespace duckdb {
template <class DUCKDB_PHYSICAL_TYPE, bool FIXED_LENGTH>
struct DecimalParquetValueConversion {
template <bool CHECKED>
static DUCKDB_PHYSICAL_TYPE PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
idx_t byte_len;
if (FIXED_LENGTH) {
byte_len = reader.Schema().type_length;
} else {
byte_len = plain_data.read<uint32_t>();
}
plain_data.available(byte_len);
auto res = ParquetDecimalUtils::ReadDecimalValue<DUCKDB_PHYSICAL_TYPE>(const_data_ptr_cast(plain_data.ptr),
byte_len, reader.Schema());
plain_data.inc(byte_len);
return res;
}
template <bool CHECKED>
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
uint32_t decimal_len = FIXED_LENGTH ? reader.Schema().type_length : plain_data.read<uint32_t>();
plain_data.inc(decimal_len);
}
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
return true;
}
static idx_t PlainConstantSize() {
return 0;
}
};
//! Reads Parquet decimal columns into DUCKDB_PHYSICAL_TYPE, for both fixed- and
//! variable-length physical encodings.
template <class DUCKDB_PHYSICAL_TYPE, bool FIXED_LENGTH>
class DecimalColumnReader
    : public TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
                                   DecimalParquetValueConversion<DUCKDB_PHYSICAL_TYPE, FIXED_LENGTH>> {
	using BaseType =
	    TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE, DecimalParquetValueConversion<DUCKDB_PHYSICAL_TYPE, FIXED_LENGTH>>;

public:
	DecimalColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema) : BaseType(reader, schema) {
	}
};
} // namespace duckdb

View File

@@ -0,0 +1,56 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/expression_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "duckdb/execution/expression_executor.hpp"
namespace duckdb {
//! A column reader that executes an expression over a child reader
class ExpressionColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::INVALID;
public:
//! Construct with a schema owned elsewhere
ExpressionColumnReader(ClientContext &context, unique_ptr<ColumnReader> child_reader, unique_ptr<Expression> expr,
const ParquetColumnSchema &schema);
//! Construct with a schema this reader takes ownership of (see owned_schema below)
ExpressionColumnReader(ClientContext &context, unique_ptr<ColumnReader> child_reader, unique_ptr<Expression> expr,
unique_ptr<ParquetColumnSchema> owned_schema);
//! The reader whose output feeds the expression
unique_ptr<ColumnReader> child_reader;
//! Scratch chunk holding the child's output before expression evaluation
DataChunk intermediate_chunk;
//! The expression applied to the child reader's values
unique_ptr<Expression> expr;
ExpressionExecutor executor;
// If this reader was created on top of a child reader, after-the-fact, the schema needs to live somewhere
unique_ptr<ParquetColumnSchema> owned_schema;
public:
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override;
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) override;
void Skip(idx_t num_values) override;
idx_t GroupRowsAvailable() override;
//! Size/offset/prefetch queries are forwarded to the child; this reader adds no I/O of its own
uint64_t TotalCompressedSize() override {
return child_reader->TotalCompressedSize();
}
idx_t FileOffset() const override {
return child_reader->FileOffset();
}
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override {
child_reader->RegisterPrefetch(transport, allow_merge);
}
};
} // namespace duckdb

View File

@@ -0,0 +1,67 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/interval_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
#include "parquet_reader.hpp"
namespace duckdb {
//===--------------------------------------------------------------------===//
// Interval Column Reader
//===--------------------------------------------------------------------===//
//! Value conversion for Parquet INTERVAL values: 12 bytes holding months, days and
//! milliseconds, converted to DuckDB's interval_t (which stores microseconds).
struct IntervalValueConversion {
	static constexpr const idx_t PARQUET_INTERVAL_SIZE = 12;

	static interval_t ReadParquetInterval(const_data_ptr_t input) {
		interval_t interval;
		interval.months = Load<int32_t>(input);
		interval.days = Load<int32_t>(input + sizeof(uint32_t));
		// Parquet stores milliseconds; widen to int64 before converting to micros to avoid overflow
		interval.micros = int64_t(Load<uint32_t>(input + 2 * sizeof(uint32_t))) * 1000;
		return interval;
	}
	template <bool CHECKED>
	static interval_t PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
		if (CHECKED) {
			plain_data.available(PARQUET_INTERVAL_SIZE);
		}
		auto interval = ReadParquetInterval(const_data_ptr_cast(plain_data.ptr));
		// availability was verified above (or guaranteed by the caller), so unchecked advance is safe
		plain_data.unsafe_inc(PARQUET_INTERVAL_SIZE);
		return interval;
	}
	template <bool CHECKED>
	static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
		if (!CHECKED) {
			plain_data.unsafe_inc(PARQUET_INTERVAL_SIZE);
		} else {
			plain_data.inc(PARQUET_INTERVAL_SIZE);
		}
	}
	static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
		return plain_data.check_available(count * PARQUET_INTERVAL_SIZE);
	}
	static idx_t PlainConstantSize() {
		return 0;
	}
};
//! Column reader for Parquet INTERVAL columns.
class IntervalColumnReader : public TemplatedColumnReader<interval_t, IntervalValueConversion> {
	using BaseType = TemplatedColumnReader<interval_t, IntervalValueConversion>;

public:
	IntervalColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema) : BaseType(reader, schema) {
	}
};
} // namespace duckdb

View File

@@ -0,0 +1,62 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/list_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
//! Reads Parquet LIST columns by reading the child column together with its
//! repetition/definition levels and assembling list entries from them.
class ListColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::LIST;
public:
ListColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
unique_ptr<ColumnReader> child_column_reader_p);
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out) override;
void ApplyPendingSkips(data_ptr_t define_out, data_ptr_t repeat_out) override;
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override {
// the list itself stores no data; only the child needs initialization
child_column_reader->InitializeRead(row_group_idx_p, columns, protocol_p);
}
idx_t GroupRowsAvailable() override {
// include child values already read but not yet emitted as list entries
return child_column_reader->GroupRowsAvailable() + overflow_child_count;
}
uint64_t TotalCompressedSize() override {
return child_column_reader->TotalCompressedSize();
}
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override {
child_column_reader->RegisterPrefetch(transport, allow_merge);
}
protected:
//! Shared implementation for Read and skip paths; result_out may be empty when skipping
template <class OP>
idx_t ReadInternal(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out,
optional_ptr<Vector> result_out);
private:
unique_ptr<ColumnReader> child_column_reader;
//! Buffers for the child's definition and repetition levels
ResizeableBuffer child_defines;
ResizeableBuffer child_repeats;
uint8_t *child_defines_ptr;
uint8_t *child_repeats_ptr;
//! Cache and vector used to read child values before slicing them into lists
VectorCache read_cache;
Vector read_vector;
//! Child values read in a previous call that belong to a list not yet emitted
idx_t overflow_child_count;
};
} // namespace duckdb

View File

@@ -0,0 +1,38 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/null_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "duckdb/common/helper.hpp"
namespace duckdb {
class NullColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::INVALID;
public:
NullColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema) : ColumnReader(reader, schema) {};
shared_ptr<ResizeableBuffer> dict;
public:
void Plain(ByteBuffer &plain_data, uint8_t *defines, uint64_t num_values, idx_t result_offset,
Vector &result) override {
(void)defines;
(void)plain_data;
auto &result_mask = FlatVector::Validity(result);
for (idx_t row_idx = 0; row_idx < num_values; row_idx++) {
result_mask.SetInvalid(row_idx + result_offset);
}
}
};
} // namespace duckdb

View File

@@ -0,0 +1,52 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/row_number_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb/common/limits.hpp"
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
//! Reads a file-absolute row number as a virtual column that's not actually stored in the file
class RowNumberColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::INT64;
public:
RowNumberColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema);
public:
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) override;
void Filter(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out,
const TableFilter &filter, TableFilterState &filter_state, SelectionVector &sel,
idx_t &approved_tuple_count, bool is_first_filter) override;
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override;
void Skip(idx_t num_values) override {
// skipping just advances the synthesized row counter
row_group_offset += num_values;
}
idx_t GroupRowsAvailable() override {
// row numbers are generated, not stored, so they never run out
return NumericLimits<idx_t>::Maximum();
};
//! No file data backs this column, so there is nothing to size, locate or prefetch
uint64_t TotalCompressedSize() override {
return 0;
}
idx_t FileOffset() const override {
return 0;
}
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override {
}
private:
//! Offset of the next row to emit within the current row group
idx_t row_group_offset;
};
} // namespace duckdb

View File

@@ -0,0 +1,91 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/string_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
//! Reads Parquet string/binary columns; values are referenced from the shared page buffer
//! rather than copied, so plain reads require the shared-buffer overloads below.
class StringColumnReader : public ColumnReader {
enum class StringColumnType : uint8_t { VARCHAR, JSON, OTHER };
//! Classify the target type: JSON and VARCHAR get UTF-8 verification, OTHER (e.g. BLOB) does not
static StringColumnType GetStringColumnType(const LogicalType &type) {
if (type.IsJSONType()) {
return StringColumnType::JSON;
}
if (type.id() == LogicalTypeId::VARCHAR) {
return StringColumnType::VARCHAR;
}
return StringColumnType::OTHER;
}
public:
static constexpr const PhysicalType TYPE = PhysicalType::VARCHAR;
public:
StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema);
//! Byte width for FIXED_LEN_BYTE_ARRAY columns; 0 means variable-length (length-prefixed) strings
idx_t fixed_width_string_length;
const StringColumnType string_column_type;
public:
static void VerifyString(const char *str_data, uint32_t str_len, const bool isVarchar);
void VerifyString(const char *str_data, uint32_t str_len);
//! Make the result vector keep the page buffer alive, since string_t values point into it
static void ReferenceBlock(Vector &result, shared_ptr<ResizeableBuffer> &block);
protected:
//! Plain reads from a raw buffer are unsupported: values must reference an owning shared buffer
void Plain(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values, idx_t result_offset,
Vector &result) override {
throw NotImplementedException("StringColumnReader can only read plain data from a shared buffer");
}
void Plain(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values, idx_t result_offset,
Vector &result) override;
void PlainSkip(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values) override;
void PlainSelect(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values, Vector &result,
const SelectionVector &sel, idx_t count) override;
bool SupportsDirectFilter() const override {
return true;
}
bool SupportsDirectSelect() const override {
return true;
}
};
//! Value conversion for strings: variable-length values carry a 4-byte length prefix,
//! fixed-width values take their length from the reader's schema.
struct StringParquetValueConversion {
	template <bool CHECKED>
	static string_t PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
		auto &string_reader = reader.Cast<StringColumnReader>();
		const uint32_t length = string_reader.fixed_width_string_length == 0
		                            ? plain_data.read<uint32_t>()
		                            : string_reader.fixed_width_string_length;
		plain_data.available(length);
		auto str_data = char_ptr_cast(plain_data.ptr);
		string_reader.VerifyString(str_data, length);
		// the string_t references the page buffer directly - no copy is made here
		string_t result(str_data, length);
		plain_data.inc(length);
		return result;
	}
	template <bool CHECKED>
	static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
		auto &string_reader = reader.Cast<StringColumnReader>();
		uint32_t length = string_reader.fixed_width_string_length;
		if (length == 0) {
			length = plain_data.read<uint32_t>();
		}
		plain_data.inc(length);
	}
	static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
		// string lengths are data-dependent, so availability cannot be verified up front
		return false;
	}
	static idx_t PlainConstantSize() {
		return 0;
	}
};
} // namespace duckdb

View File

@@ -0,0 +1,39 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/struct_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
//! Reads Parquet group (struct) columns by delegating to one child reader per field.
class StructColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::STRUCT;
public:
StructColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
vector<unique_ptr<ColumnReader>> child_readers_p);
//! One reader per struct field, in schema order
vector<unique_ptr<ColumnReader>> child_readers;
public:
ColumnReader &GetChildReader(idx_t child_idx);
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override;
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) override;
void Skip(idx_t num_values) override;
idx_t GroupRowsAvailable() override;
uint64_t TotalCompressedSize() override;
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override;
};
} // namespace duckdb

View File

@@ -0,0 +1,110 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/templated_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "duckdb/common/helper.hpp"
namespace duckdb {
//! Default conversion for values stored verbatim in the Parquet plain encoding:
//! each value occupies exactly sizeof(VALUE_TYPE) bytes.
template <class VALUE_TYPE>
struct TemplatedParquetValueConversion {
	template <bool CHECKED>
	static VALUE_TYPE PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
		return CHECKED ? plain_data.read<VALUE_TYPE>() : plain_data.unsafe_read<VALUE_TYPE>();
	}
	template <bool CHECKED>
	static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
		if (!CHECKED) {
			plain_data.unsafe_inc(sizeof(VALUE_TYPE));
			return;
		}
		plain_data.inc(sizeof(VALUE_TYPE));
	}
	static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
		// fixed-size values allow verifying the whole batch up front
		return plain_data.check_available(sizeof(VALUE_TYPE) * count);
	}
	static idx_t PlainConstantSize() {
		return sizeof(VALUE_TYPE);
	}
};
//! Generic column reader: plain decoding is delegated to VALUE_CONVERSION, and a
//! dictionary buffer of VALUE_TYPE entries is kept for dictionary-encoded pages.
template <class VALUE_TYPE, class VALUE_CONVERSION>
class TemplatedColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::INVALID;
public:
TemplatedColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema) : ColumnReader(reader, schema) {
}
//! Decoded dictionary page, if the column uses dictionary encoding
shared_ptr<ResizeableBuffer> dict;
public:
//! Allocate (or grow) the dictionary buffer to hold `size` bytes
void AllocateDict(idx_t size) {
if (!dict) {
dict = make_shared_ptr<ResizeableBuffer>(GetAllocator(), size);
} else {
dict->resize(GetAllocator(), size);
}
}
void Plain(ByteBuffer &plain_data, uint8_t *defines, uint64_t num_values, idx_t result_offset,
Vector &result) override {
PlainTemplated<VALUE_TYPE, VALUE_CONVERSION>(plain_data, defines, num_values, result_offset, result);
}
void PlainSkip(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values) override {
PlainSkipTemplated<VALUE_CONVERSION>(plain_data, defines, num_values);
}
bool SupportsDirectFilter() const override {
return true;
}
};
//! Conversion that reads a PARQUET_PHYSICAL_TYPE from plain data and converts it to
//! DUCKDB_PHYSICAL_TYPE through the compile-time callback FUNC.
template <class PARQUET_PHYSICAL_TYPE, class DUCKDB_PHYSICAL_TYPE,
          DUCKDB_PHYSICAL_TYPE (*FUNC)(const PARQUET_PHYSICAL_TYPE &input)>
struct CallbackParquetValueConversion {
	template <bool CHECKED>
	static DUCKDB_PHYSICAL_TYPE PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
		return CHECKED ? FUNC(plain_data.read<PARQUET_PHYSICAL_TYPE>())
		               : FUNC(plain_data.unsafe_read<PARQUET_PHYSICAL_TYPE>());
	}
	template <bool CHECKED>
	static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
		if (!CHECKED) {
			plain_data.unsafe_inc(sizeof(PARQUET_PHYSICAL_TYPE));
			return;
		}
		plain_data.inc(sizeof(PARQUET_PHYSICAL_TYPE));
	}
	static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
		// availability depends on the stored (source) type size, not the converted size
		return plain_data.check_available(sizeof(PARQUET_PHYSICAL_TYPE) * count);
	}
	static idx_t PlainConstantSize() {
		return 0;
	}
};
} // namespace duckdb

View File

@@ -0,0 +1,60 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/uuid_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "templated_column_reader.hpp"
#include "parquet_reader.hpp"
#include "duckdb/common/types/uuid.hpp"
namespace duckdb {
//! Value conversion for Parquet UUID values: 16 stored bytes decoded into a hugeint_t
//! via BaseUUID::FromBlob.
struct UUIDValueConversion {
	static hugeint_t ReadParquetUUID(const_data_ptr_t input) {
		// decoding of the 16-byte blob is delegated to the shared UUID utility
		return BaseUUID::FromBlob(input);
	}
	template <bool CHECKED>
	static hugeint_t PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
		if (CHECKED) {
			plain_data.available(sizeof(hugeint_t));
		}
		auto uuid = ReadParquetUUID(const_data_ptr_cast(plain_data.ptr));
		// availability was verified above (or guaranteed by the caller), so unchecked advance is safe
		plain_data.unsafe_inc(sizeof(hugeint_t));
		return uuid;
	}
	template <bool CHECKED>
	static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
		if (!CHECKED) {
			plain_data.unsafe_inc(sizeof(hugeint_t));
		} else {
			plain_data.inc(sizeof(hugeint_t));
		}
	}
	static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
		return plain_data.check_available(sizeof(hugeint_t) * count);
	}
	static idx_t PlainConstantSize() {
		return 0;
	}
};
//! Column reader for Parquet UUID columns.
class UUIDColumnReader : public TemplatedColumnReader<hugeint_t, UUIDValueConversion> {
	using BaseType = TemplatedColumnReader<hugeint_t, UUIDValueConversion>;

public:
	UUIDColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema) : BaseType(reader, schema) {
	}
};
} // namespace duckdb

View File

@@ -0,0 +1,148 @@
#pragma once
#include "duckdb/common/types/string_type.hpp"
#include "duckdb/common/types/value.hpp"
#include "reader/variant/variant_value.hpp"
using namespace duckdb_yyjson;
namespace duckdb {
//! ------------ Metadata ------------
//! Decoded form of the single header byte that starts a Variant metadata blob.
struct VariantMetadataHeader {
public:
static VariantMetadataHeader FromHeaderByte(uint8_t byte);
public:
//! The version of the protocol used (only '1' supported for now)
uint8_t version;
//! Number of bytes per dictionary size and offset field
uint8_t offset_size;
//! Whether dictionary strings are sorted and unique
bool sorted_strings = false;
};
//! Parsed view of a Variant metadata blob (header + string dictionary).
struct VariantMetadata {
public:
explicit VariantMetadata(const string_t &metadata);
public:
//! The raw metadata blob; NOTE(review): members below appear to point into it, so it must outlive this struct
const string_t &metadata;
public:
VariantMetadataHeader header;
//! Start of the dictionary offset entries (header.offset_size bytes each)
const_data_ptr_t offsets;
//! Start of the dictionary string bytes
const_data_ptr_t bytes;
//! The json object keys have to be null-terminated
//! But we don't receive them null-terminated
vector<string> strings;
};
//! ------------ Value ------------
//! Top-level type category of an encoded Variant value; INVALID is a sentinel used for range checks
enum class VariantBasicType : uint8_t { PRIMITIVE = 0, SHORT_STRING = 1, OBJECT = 2, ARRAY = 3, INVALID };
enum class VariantPrimitiveType : uint8_t {
NULL_TYPE = 0,
BOOLEAN_TRUE = 1,
BOOLEAN_FALSE = 2,
INT8 = 3,
INT16 = 4,
INT32 = 5,
INT64 = 6,
DOUBLE = 7,
DECIMAL4 = 8,
DECIMAL8 = 9,
DECIMAL16 = 10,
DATE = 11,
TIMESTAMP_MICROS = 12,
TIMESTAMP_NTZ_MICROS = 13,
FLOAT = 14,
BINARY = 15,
STRING = 16,
TIME_NTZ_MICROS = 17,
TIMESTAMP_NANOS = 18,
TIMESTAMP_NTZ_NANOS = 19,
UUID = 20,
INVALID
};
//! Decoded form of a Variant value header byte; which fields are meaningful
//! depends on basic_type (primitive / short string / object / array).
struct VariantValueMetadata {
public:
VariantValueMetadata() {
}
public:
static VariantValueMetadata FromHeaderByte(uint8_t byte);
//! Validate and convert a raw byte to a basic type; throws on out-of-range values
static VariantBasicType VariantBasicTypeFromByte(uint8_t byte) {
if (byte >= static_cast<uint8_t>(VariantBasicType::INVALID)) {
throw NotImplementedException("Variant BasicType (%d) is not supported", byte);
}
return static_cast<VariantBasicType>(byte);
}
//! Validate and convert a raw byte to a primitive type; throws on out-of-range values
static VariantPrimitiveType VariantPrimitiveTypeFromByte(uint8_t byte) {
if (byte >= static_cast<uint8_t>(VariantPrimitiveType::INVALID)) {
throw NotImplementedException("Variant PrimitiveType (%d) is not supported", byte);
}
return static_cast<VariantPrimitiveType>(byte);
}
public:
VariantBasicType basic_type;
public:
//! Primitive Type header
VariantPrimitiveType primitive_type;
public:
//! Short String header
uint8_t string_size;
public:
//! Object header | Array header
//! Size in bytes for each 'field_offset' entry
uint32_t field_offset_size;
//! Size in bytes for each 'field_id' entry
uint32_t field_id_size;
//! Whether the number of elements is encoded in 1 byte (false) or 4 bytes (true)
bool is_large;
};
//! RAII holder for the outputs of a Variant-to-JSON decode: owns the yyjson document
//! and the malloc-allocated serialized string, releasing both on destruction.
struct VariantDecodeResult {
public:
VariantDecodeResult() = default;
~VariantDecodeResult() {
if (doc) {
yyjson_mut_doc_free(doc);
}
if (data) {
// allocated with malloc (by yyjson's serializer), hence free() rather than delete
free(data);
}
}
public:
yyjson_mut_doc *doc = nullptr;
char *data = nullptr;
};
//! Static decoder for the Variant binary format: dispatches on the value's basic type
//! to the specialized decode routines below.
class VariantBinaryDecoder {
public:
VariantBinaryDecoder() = delete;
public:
//! Decode a single value; `metadata` supplies the string dictionary for object keys
static VariantValue Decode(const VariantMetadata &metadata, const_data_ptr_t data);
public:
static VariantValue PrimitiveTypeDecode(const VariantValueMetadata &value_metadata, const_data_ptr_t data);
static VariantValue ShortStringDecode(const VariantValueMetadata &value_metadata, const_data_ptr_t data);
static VariantValue ObjectDecode(const VariantMetadata &metadata, const VariantValueMetadata &value_metadata,
const_data_ptr_t data);
static VariantValue ArrayDecode(const VariantMetadata &metadata, const VariantValueMetadata &value_metadata,
const_data_ptr_t data);
};
} // namespace duckdb

View File

@@ -0,0 +1,24 @@
#pragma once
#include "reader/variant/variant_value.hpp"
#include "reader/variant/variant_binary_decoder.hpp"
namespace duckdb {
//! Static helpers that reassemble VariantValues from shredded Variant columns,
//! dispatching per shape (leaf / array / object) of the typed_value vector.
class VariantShreddedConversion {
public:
VariantShreddedConversion() = delete;
public:
//! Entry point: converts `length` rows starting at `offset`; `is_field` marks conversion of an object field
static vector<VariantValue> Convert(Vector &metadata, Vector &group, idx_t offset, idx_t length, idx_t total_size,
bool is_field);
static vector<VariantValue> ConvertShreddedLeaf(Vector &metadata, Vector &value, Vector &typed_value, idx_t offset,
idx_t length, idx_t total_size, const bool is_field);
static vector<VariantValue> ConvertShreddedArray(Vector &metadata, Vector &value, Vector &typed_value, idx_t offset,
idx_t length, idx_t total_size, const bool is_field);
static vector<VariantValue> ConvertShreddedObject(Vector &metadata, Vector &value, Vector &typed_value,
idx_t offset, idx_t length, idx_t total_size,
const bool is_field);
};
} // namespace duckdb

View File

@@ -0,0 +1,54 @@
#pragma once
#include "duckdb/common/map.hpp"
#include "duckdb/common/vector.hpp"
#include "duckdb/common/types/value.hpp"
#include "yyjson.hpp"
using namespace duckdb_yyjson;
namespace duckdb {
//! Kind of a decoded VariantValue; MISSING means the value was absent (distinct from a present NULL)
enum class VariantValueType : uint8_t { PRIMITIVE, OBJECT, ARRAY, MISSING };
//! In-memory representation of a decoded Variant value: a primitive, an object
//! (string-keyed children), or an array. Move-only to avoid deep copies.
struct VariantValue {
public:
VariantValue() : value_type(VariantValueType::MISSING) {
}
explicit VariantValue(VariantValueType type) : value_type(type) {
}
explicit VariantValue(Value &&val) : value_type(VariantValueType::PRIMITIVE), primitive_value(std::move(val)) {
}
// Delete copy constructor and copy assignment operator
VariantValue(const VariantValue &) = delete;
VariantValue &operator=(const VariantValue &) = delete;
// Default move constructor and move assignment operator
VariantValue(VariantValue &&) noexcept = default;
VariantValue &operator=(VariantValue &&) noexcept = default;
public:
//! True for a present-but-NULL primitive (MISSING values are not considered NULL)
bool IsNull() const {
return value_type == VariantValueType::PRIMITIVE && primitive_value.IsNull();
}
bool IsMissing() const {
return value_type == VariantValueType::MISSING;
}
public:
//! Add a key/value pair to an object value
void AddChild(const string &key, VariantValue &&val);
//! Append an element to an array value
void AddItem(VariantValue &&val);
public:
//! Serialize this value into the given mutable yyjson document
yyjson_mut_val *ToJSON(ClientContext &context, yyjson_mut_doc *doc) const;
public:
VariantValueType value_type;
//! FIXME: how can we get a deterministic child order for a partially shredded object?
map<string, VariantValue> object_children;
vector<VariantValue> array_items;
Value primitive_value;
};
} // namespace duckdb

View File

@@ -0,0 +1,44 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/variant_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
class VariantColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::VARCHAR;
public:
VariantColumnReader(ClientContext &context, ParquetReader &reader, const ParquetColumnSchema &schema,
vector<unique_ptr<ColumnReader>> child_readers_p);
ClientContext &context;
vector<unique_ptr<ColumnReader>> child_readers;
public:
ColumnReader &GetChildReader(idx_t child_idx);
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override;
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) override;
void Skip(idx_t num_values) override;
idx_t GroupRowsAvailable() override;
uint64_t TotalCompressedSize() override;
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override;
protected:
idx_t metadata_reader_idx;
idx_t value_reader_idx;
};
} // namespace duckdb