should be it
This commit is contained in:
70
external/duckdb/extension/parquet/include/reader/boolean_column_reader.hpp
vendored
Normal file
70
external/duckdb/extension/parquet/include/reader/boolean_column_reader.hpp
vendored
Normal file
@@ -0,0 +1,70 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// reader/boolean_column_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "column_reader.hpp"
|
||||
#include "reader/templated_column_reader.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
struct BooleanParquetValueConversion;
|
||||
|
||||
//! Column reader for BOOL columns.
//! Keeps track of the bit position inside the byte that is currently being decoded (see byte_pos).
class BooleanColumnReader : public TemplatedColumnReader<bool, BooleanParquetValueConversion> {
public:
	static constexpr const PhysicalType TYPE = PhysicalType::BOOL;

public:
	BooleanColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
	    : TemplatedColumnReader(reader, schema), byte_pos(0) {
	}

	//! Index of the next bit to read within the current byte; the value conversion wraps it back to 0 at 8
	uint8_t byte_pos;

	//! Restarts bit tracking, then forwards to the base class initialization
	void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override {
		byte_pos = 0;
		TemplatedColumnReader::InitializeRead(row_group_idx_p, columns, protocol_p);
	}

	//! Restarts bit tracking when a page is reset
	void ResetPage() override {
		byte_pos = 0;
	}
};
|
||||
|
||||
struct BooleanParquetValueConversion {
|
||||
template <bool CHECKED>
|
||||
static bool PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
auto &byte_pos = reader.Cast<BooleanColumnReader>().byte_pos;
|
||||
bool ret = (*plain_data.ptr >> byte_pos) & 1;
|
||||
if (++byte_pos == 8) {
|
||||
byte_pos = 0;
|
||||
if (CHECKED) {
|
||||
plain_data.inc(1);
|
||||
} else {
|
||||
plain_data.unsafe_inc(1);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <bool CHECKED>
|
||||
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
PlainRead<CHECKED>(plain_data, reader);
|
||||
}
|
||||
|
||||
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
|
||||
return plain_data.check_available((count + 7) / 8);
|
||||
}
|
||||
|
||||
static idx_t PlainConstantSize() {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
46
external/duckdb/extension/parquet/include/reader/callback_column_reader.hpp
vendored
Normal file
46
external/duckdb/extension/parquet/include/reader/callback_column_reader.hpp
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// reader/callback_column_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "column_reader.hpp"
|
||||
#include "reader/templated_column_reader.hpp"
|
||||
#include "parquet_reader.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
template <class PARQUET_PHYSICAL_TYPE, class DUCKDB_PHYSICAL_TYPE,
|
||||
DUCKDB_PHYSICAL_TYPE (*FUNC)(const PARQUET_PHYSICAL_TYPE &input)>
|
||||
class CallbackColumnReader
|
||||
: public TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
|
||||
CallbackParquetValueConversion<PARQUET_PHYSICAL_TYPE, DUCKDB_PHYSICAL_TYPE, FUNC>> {
|
||||
using BaseType =
|
||||
TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
|
||||
CallbackParquetValueConversion<PARQUET_PHYSICAL_TYPE, DUCKDB_PHYSICAL_TYPE, FUNC>>;
|
||||
|
||||
public:
|
||||
static constexpr const PhysicalType TYPE = PhysicalType::INVALID;
|
||||
|
||||
public:
|
||||
CallbackColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
|
||||
: TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
|
||||
CallbackParquetValueConversion<PARQUET_PHYSICAL_TYPE, DUCKDB_PHYSICAL_TYPE, FUNC>>(
|
||||
reader, schema) {
|
||||
}
|
||||
|
||||
protected:
|
||||
void Dictionary(shared_ptr<ResizeableBuffer> dictionary_data, idx_t num_entries) {
|
||||
BaseType::AllocateDict(num_entries * sizeof(DUCKDB_PHYSICAL_TYPE));
|
||||
auto dict_ptr = (DUCKDB_PHYSICAL_TYPE *)this->dict->ptr;
|
||||
for (idx_t i = 0; i < num_entries; i++) {
|
||||
dict_ptr[i] = FUNC(dictionary_data->read<PARQUET_PHYSICAL_TYPE>());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
65
external/duckdb/extension/parquet/include/reader/decimal_column_reader.hpp
vendored
Normal file
65
external/duckdb/extension/parquet/include/reader/decimal_column_reader.hpp
vendored
Normal file
@@ -0,0 +1,65 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// reader/decimal_column_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "column_reader.hpp"
|
||||
#include "reader/templated_column_reader.hpp"
|
||||
#include "parquet_reader.hpp"
|
||||
#include "parquet_decimal_utils.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
template <class DUCKDB_PHYSICAL_TYPE, bool FIXED_LENGTH>
|
||||
struct DecimalParquetValueConversion {
|
||||
template <bool CHECKED>
|
||||
static DUCKDB_PHYSICAL_TYPE PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
idx_t byte_len;
|
||||
if (FIXED_LENGTH) {
|
||||
byte_len = reader.Schema().type_length;
|
||||
} else {
|
||||
byte_len = plain_data.read<uint32_t>();
|
||||
}
|
||||
plain_data.available(byte_len);
|
||||
auto res = ParquetDecimalUtils::ReadDecimalValue<DUCKDB_PHYSICAL_TYPE>(const_data_ptr_cast(plain_data.ptr),
|
||||
byte_len, reader.Schema());
|
||||
|
||||
plain_data.inc(byte_len);
|
||||
return res;
|
||||
}
|
||||
|
||||
template <bool CHECKED>
|
||||
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
uint32_t decimal_len = FIXED_LENGTH ? reader.Schema().type_length : plain_data.read<uint32_t>();
|
||||
plain_data.inc(decimal_len);
|
||||
}
|
||||
|
||||
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
|
||||
return true;
|
||||
}
|
||||
|
||||
static idx_t PlainConstantSize() {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
template <class DUCKDB_PHYSICAL_TYPE, bool FIXED_LENGTH>
|
||||
class DecimalColumnReader
|
||||
: public TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
|
||||
DecimalParquetValueConversion<DUCKDB_PHYSICAL_TYPE, FIXED_LENGTH>> {
|
||||
using BaseType =
|
||||
TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE, DecimalParquetValueConversion<DUCKDB_PHYSICAL_TYPE, FIXED_LENGTH>>;
|
||||
|
||||
public:
|
||||
DecimalColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
|
||||
: TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
|
||||
DecimalParquetValueConversion<DUCKDB_PHYSICAL_TYPE, FIXED_LENGTH>>(reader, schema) {
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
56
external/duckdb/extension/parquet/include/reader/expression_column_reader.hpp
vendored
Normal file
56
external/duckdb/extension/parquet/include/reader/expression_column_reader.hpp
vendored
Normal file
@@ -0,0 +1,56 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// reader/expression_column_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "column_reader.hpp"
|
||||
#include "duckdb/execution/expression_executor.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
//! A column reader that executes an expression over a child reader
|
||||
class ExpressionColumnReader : public ColumnReader {
|
||||
public:
|
||||
static constexpr const PhysicalType TYPE = PhysicalType::INVALID;
|
||||
|
||||
public:
|
||||
ExpressionColumnReader(ClientContext &context, unique_ptr<ColumnReader> child_reader, unique_ptr<Expression> expr,
|
||||
const ParquetColumnSchema &schema);
|
||||
ExpressionColumnReader(ClientContext &context, unique_ptr<ColumnReader> child_reader, unique_ptr<Expression> expr,
|
||||
unique_ptr<ParquetColumnSchema> owned_schema);
|
||||
|
||||
unique_ptr<ColumnReader> child_reader;
|
||||
DataChunk intermediate_chunk;
|
||||
unique_ptr<Expression> expr;
|
||||
ExpressionExecutor executor;
|
||||
|
||||
// If this reader was created on top of a child reader, after-the-fact, the schema needs to live somewhere
|
||||
unique_ptr<ParquetColumnSchema> owned_schema;
|
||||
|
||||
public:
|
||||
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override;
|
||||
|
||||
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) override;
|
||||
|
||||
void Skip(idx_t num_values) override;
|
||||
idx_t GroupRowsAvailable() override;
|
||||
|
||||
uint64_t TotalCompressedSize() override {
|
||||
return child_reader->TotalCompressedSize();
|
||||
}
|
||||
|
||||
idx_t FileOffset() const override {
|
||||
return child_reader->FileOffset();
|
||||
}
|
||||
|
||||
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override {
|
||||
child_reader->RegisterPrefetch(transport, allow_merge);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
67
external/duckdb/extension/parquet/include/reader/interval_column_reader.hpp
vendored
Normal file
67
external/duckdb/extension/parquet/include/reader/interval_column_reader.hpp
vendored
Normal file
@@ -0,0 +1,67 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// reader/interval_column_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "column_reader.hpp"
|
||||
#include "reader/templated_column_reader.hpp"
|
||||
#include "parquet_reader.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
//===--------------------------------------------------------------------===//
|
||||
// Interval Column Reader
|
||||
//===--------------------------------------------------------------------===//
|
||||
struct IntervalValueConversion {
|
||||
static constexpr const idx_t PARQUET_INTERVAL_SIZE = 12;
|
||||
|
||||
static interval_t ReadParquetInterval(const_data_ptr_t input) {
|
||||
interval_t result;
|
||||
result.months = Load<int32_t>(input);
|
||||
result.days = Load<int32_t>(input + sizeof(uint32_t));
|
||||
result.micros = int64_t(Load<uint32_t>(input + sizeof(uint32_t) * 2)) * 1000;
|
||||
return result;
|
||||
}
|
||||
|
||||
template <bool CHECKED>
|
||||
static interval_t PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
if (CHECKED) {
|
||||
plain_data.available(PARQUET_INTERVAL_SIZE);
|
||||
}
|
||||
auto res = ReadParquetInterval(const_data_ptr_cast(plain_data.ptr));
|
||||
plain_data.unsafe_inc(PARQUET_INTERVAL_SIZE);
|
||||
return res;
|
||||
}
|
||||
|
||||
template <bool CHECKED>
|
||||
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
if (CHECKED) {
|
||||
plain_data.inc(PARQUET_INTERVAL_SIZE);
|
||||
} else {
|
||||
plain_data.unsafe_inc(PARQUET_INTERVAL_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
|
||||
return plain_data.check_available(count * PARQUET_INTERVAL_SIZE);
|
||||
}
|
||||
|
||||
static idx_t PlainConstantSize() {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
//! Column reader for intervals; all decoding is done by IntervalValueConversion
class IntervalColumnReader : public TemplatedColumnReader<interval_t, IntervalValueConversion> {
public:
	IntervalColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
	    : TemplatedColumnReader(reader, schema) {
	}
};
|
||||
|
||||
} // namespace duckdb
|
||||
62
external/duckdb/extension/parquet/include/reader/list_column_reader.hpp
vendored
Normal file
62
external/duckdb/extension/parquet/include/reader/list_column_reader.hpp
vendored
Normal file
@@ -0,0 +1,62 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// reader/list_column_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "column_reader.hpp"
|
||||
#include "reader/templated_column_reader.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
class ListColumnReader : public ColumnReader {
|
||||
public:
|
||||
static constexpr const PhysicalType TYPE = PhysicalType::LIST;
|
||||
|
||||
public:
|
||||
ListColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
|
||||
unique_ptr<ColumnReader> child_column_reader_p);
|
||||
|
||||
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out) override;
|
||||
|
||||
void ApplyPendingSkips(data_ptr_t define_out, data_ptr_t repeat_out) override;
|
||||
|
||||
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override {
|
||||
child_column_reader->InitializeRead(row_group_idx_p, columns, protocol_p);
|
||||
}
|
||||
|
||||
idx_t GroupRowsAvailable() override {
|
||||
return child_column_reader->GroupRowsAvailable() + overflow_child_count;
|
||||
}
|
||||
|
||||
uint64_t TotalCompressedSize() override {
|
||||
return child_column_reader->TotalCompressedSize();
|
||||
}
|
||||
|
||||
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override {
|
||||
child_column_reader->RegisterPrefetch(transport, allow_merge);
|
||||
}
|
||||
|
||||
protected:
|
||||
template <class OP>
|
||||
idx_t ReadInternal(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out,
|
||||
optional_ptr<Vector> result_out);
|
||||
|
||||
private:
|
||||
unique_ptr<ColumnReader> child_column_reader;
|
||||
ResizeableBuffer child_defines;
|
||||
ResizeableBuffer child_repeats;
|
||||
uint8_t *child_defines_ptr;
|
||||
uint8_t *child_repeats_ptr;
|
||||
|
||||
VectorCache read_cache;
|
||||
Vector read_vector;
|
||||
|
||||
idx_t overflow_child_count;
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
38
external/duckdb/extension/parquet/include/reader/null_column_reader.hpp
vendored
Normal file
38
external/duckdb/extension/parquet/include/reader/null_column_reader.hpp
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// reader/null_column_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "column_reader.hpp"
|
||||
#include "duckdb/common/helper.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
class NullColumnReader : public ColumnReader {
|
||||
public:
|
||||
static constexpr const PhysicalType TYPE = PhysicalType::INVALID;
|
||||
|
||||
public:
|
||||
NullColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema) : ColumnReader(reader, schema) {};
|
||||
|
||||
shared_ptr<ResizeableBuffer> dict;
|
||||
|
||||
public:
|
||||
void Plain(ByteBuffer &plain_data, uint8_t *defines, uint64_t num_values, idx_t result_offset,
|
||||
Vector &result) override {
|
||||
(void)defines;
|
||||
(void)plain_data;
|
||||
|
||||
auto &result_mask = FlatVector::Validity(result);
|
||||
for (idx_t row_idx = 0; row_idx < num_values; row_idx++) {
|
||||
result_mask.SetInvalid(row_idx + result_offset);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
52
external/duckdb/extension/parquet/include/reader/row_number_column_reader.hpp
vendored
Normal file
52
external/duckdb/extension/parquet/include/reader/row_number_column_reader.hpp
vendored
Normal file
@@ -0,0 +1,52 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// reader/row_number_column_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "duckdb/common/limits.hpp"
|
||||
#include "column_reader.hpp"
|
||||
#include "reader/templated_column_reader.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
//! Reads a file-absolute row number as a virtual column that's not actually stored in the file
|
||||
class RowNumberColumnReader : public ColumnReader {
|
||||
public:
|
||||
static constexpr const PhysicalType TYPE = PhysicalType::INT64;
|
||||
|
||||
public:
|
||||
RowNumberColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema);
|
||||
|
||||
public:
|
||||
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) override;
|
||||
void Filter(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out,
|
||||
const TableFilter &filter, TableFilterState &filter_state, SelectionVector &sel,
|
||||
idx_t &approved_tuple_count, bool is_first_filter) override;
|
||||
|
||||
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override;
|
||||
|
||||
void Skip(idx_t num_values) override {
|
||||
row_group_offset += num_values;
|
||||
}
|
||||
idx_t GroupRowsAvailable() override {
|
||||
return NumericLimits<idx_t>::Maximum();
|
||||
};
|
||||
uint64_t TotalCompressedSize() override {
|
||||
return 0;
|
||||
}
|
||||
idx_t FileOffset() const override {
|
||||
return 0;
|
||||
}
|
||||
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override {
|
||||
}
|
||||
|
||||
private:
|
||||
idx_t row_group_offset;
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
91
external/duckdb/extension/parquet/include/reader/string_column_reader.hpp
vendored
Normal file
91
external/duckdb/extension/parquet/include/reader/string_column_reader.hpp
vendored
Normal file
@@ -0,0 +1,91 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// reader/string_column_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "column_reader.hpp"
|
||||
#include "reader/templated_column_reader.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
class StringColumnReader : public ColumnReader {
|
||||
enum class StringColumnType : uint8_t { VARCHAR, JSON, OTHER };
|
||||
|
||||
static StringColumnType GetStringColumnType(const LogicalType &type) {
|
||||
if (type.IsJSONType()) {
|
||||
return StringColumnType::JSON;
|
||||
}
|
||||
if (type.id() == LogicalTypeId::VARCHAR) {
|
||||
return StringColumnType::VARCHAR;
|
||||
}
|
||||
return StringColumnType::OTHER;
|
||||
}
|
||||
|
||||
public:
|
||||
static constexpr const PhysicalType TYPE = PhysicalType::VARCHAR;
|
||||
|
||||
public:
|
||||
StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema);
|
||||
idx_t fixed_width_string_length;
|
||||
const StringColumnType string_column_type;
|
||||
|
||||
public:
|
||||
static void VerifyString(const char *str_data, uint32_t str_len, const bool isVarchar);
|
||||
void VerifyString(const char *str_data, uint32_t str_len);
|
||||
|
||||
static void ReferenceBlock(Vector &result, shared_ptr<ResizeableBuffer> &block);
|
||||
|
||||
protected:
|
||||
void Plain(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values, idx_t result_offset,
|
||||
Vector &result) override {
|
||||
throw NotImplementedException("StringColumnReader can only read plain data from a shared buffer");
|
||||
}
|
||||
void Plain(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values, idx_t result_offset,
|
||||
Vector &result) override;
|
||||
void PlainSkip(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values) override;
|
||||
void PlainSelect(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values, Vector &result,
|
||||
const SelectionVector &sel, idx_t count) override;
|
||||
|
||||
bool SupportsDirectFilter() const override {
|
||||
return true;
|
||||
}
|
||||
bool SupportsDirectSelect() const override {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
struct StringParquetValueConversion {
|
||||
template <bool CHECKED>
|
||||
static string_t PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
auto &scr = reader.Cast<StringColumnReader>();
|
||||
uint32_t str_len =
|
||||
scr.fixed_width_string_length == 0 ? plain_data.read<uint32_t>() : scr.fixed_width_string_length;
|
||||
plain_data.available(str_len);
|
||||
auto plain_str = char_ptr_cast(plain_data.ptr);
|
||||
scr.VerifyString(plain_str, str_len);
|
||||
auto ret_str = string_t(plain_str, str_len);
|
||||
plain_data.inc(str_len);
|
||||
return ret_str;
|
||||
}
|
||||
template <bool CHECKED>
|
||||
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
auto &scr = reader.Cast<StringColumnReader>();
|
||||
uint32_t str_len =
|
||||
scr.fixed_width_string_length == 0 ? plain_data.read<uint32_t>() : scr.fixed_width_string_length;
|
||||
plain_data.inc(str_len);
|
||||
}
|
||||
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static idx_t PlainConstantSize() {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
39
external/duckdb/extension/parquet/include/reader/struct_column_reader.hpp
vendored
Normal file
39
external/duckdb/extension/parquet/include/reader/struct_column_reader.hpp
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// reader/struct_column_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "column_reader.hpp"
|
||||
#include "reader/templated_column_reader.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
class StructColumnReader : public ColumnReader {
|
||||
public:
|
||||
static constexpr const PhysicalType TYPE = PhysicalType::STRUCT;
|
||||
|
||||
public:
|
||||
StructColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
|
||||
vector<unique_ptr<ColumnReader>> child_readers_p);
|
||||
|
||||
vector<unique_ptr<ColumnReader>> child_readers;
|
||||
|
||||
public:
|
||||
ColumnReader &GetChildReader(idx_t child_idx);
|
||||
|
||||
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override;
|
||||
|
||||
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) override;
|
||||
|
||||
void Skip(idx_t num_values) override;
|
||||
idx_t GroupRowsAvailable() override;
|
||||
uint64_t TotalCompressedSize() override;
|
||||
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override;
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
110
external/duckdb/extension/parquet/include/reader/templated_column_reader.hpp
vendored
Normal file
110
external/duckdb/extension/parquet/include/reader/templated_column_reader.hpp
vendored
Normal file
@@ -0,0 +1,110 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// reader/templated_column_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "column_reader.hpp"
|
||||
#include "duckdb/common/helper.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
template <class VALUE_TYPE>
|
||||
struct TemplatedParquetValueConversion {
|
||||
template <bool CHECKED>
|
||||
static VALUE_TYPE PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
if (CHECKED) {
|
||||
return plain_data.read<VALUE_TYPE>();
|
||||
} else {
|
||||
return plain_data.unsafe_read<VALUE_TYPE>();
|
||||
}
|
||||
}
|
||||
|
||||
template <bool CHECKED>
|
||||
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
if (CHECKED) {
|
||||
plain_data.inc(sizeof(VALUE_TYPE));
|
||||
} else {
|
||||
plain_data.unsafe_inc(sizeof(VALUE_TYPE));
|
||||
}
|
||||
}
|
||||
|
||||
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
|
||||
return plain_data.check_available(count * sizeof(VALUE_TYPE));
|
||||
}
|
||||
|
||||
static idx_t PlainConstantSize() {
|
||||
return sizeof(VALUE_TYPE);
|
||||
}
|
||||
};
|
||||
|
||||
template <class VALUE_TYPE, class VALUE_CONVERSION>
|
||||
class TemplatedColumnReader : public ColumnReader {
|
||||
public:
|
||||
static constexpr const PhysicalType TYPE = PhysicalType::INVALID;
|
||||
|
||||
public:
|
||||
TemplatedColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema) : ColumnReader(reader, schema) {
|
||||
}
|
||||
|
||||
shared_ptr<ResizeableBuffer> dict;
|
||||
|
||||
public:
|
||||
void AllocateDict(idx_t size) {
|
||||
if (!dict) {
|
||||
dict = make_shared_ptr<ResizeableBuffer>(GetAllocator(), size);
|
||||
} else {
|
||||
dict->resize(GetAllocator(), size);
|
||||
}
|
||||
}
|
||||
|
||||
void Plain(ByteBuffer &plain_data, uint8_t *defines, uint64_t num_values, idx_t result_offset,
|
||||
Vector &result) override {
|
||||
PlainTemplated<VALUE_TYPE, VALUE_CONVERSION>(plain_data, defines, num_values, result_offset, result);
|
||||
}
|
||||
|
||||
void PlainSkip(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values) override {
|
||||
PlainSkipTemplated<VALUE_CONVERSION>(plain_data, defines, num_values);
|
||||
}
|
||||
|
||||
bool SupportsDirectFilter() const override {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <class PARQUET_PHYSICAL_TYPE, class DUCKDB_PHYSICAL_TYPE,
|
||||
DUCKDB_PHYSICAL_TYPE (*FUNC)(const PARQUET_PHYSICAL_TYPE &input)>
|
||||
struct CallbackParquetValueConversion {
|
||||
|
||||
template <bool CHECKED>
|
||||
static DUCKDB_PHYSICAL_TYPE PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
if (CHECKED) {
|
||||
return FUNC(plain_data.read<PARQUET_PHYSICAL_TYPE>());
|
||||
} else {
|
||||
return FUNC(plain_data.unsafe_read<PARQUET_PHYSICAL_TYPE>());
|
||||
}
|
||||
}
|
||||
|
||||
template <bool CHECKED>
|
||||
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
if (CHECKED) {
|
||||
plain_data.inc(sizeof(PARQUET_PHYSICAL_TYPE));
|
||||
} else {
|
||||
plain_data.unsafe_inc(sizeof(PARQUET_PHYSICAL_TYPE));
|
||||
}
|
||||
}
|
||||
|
||||
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
|
||||
return plain_data.check_available(count * sizeof(PARQUET_PHYSICAL_TYPE));
|
||||
}
|
||||
|
||||
static idx_t PlainConstantSize() {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
60
external/duckdb/extension/parquet/include/reader/uuid_column_reader.hpp
vendored
Normal file
60
external/duckdb/extension/parquet/include/reader/uuid_column_reader.hpp
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// reader/uuid_column_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "column_reader.hpp"
|
||||
#include "templated_column_reader.hpp"
|
||||
#include "parquet_reader.hpp"
|
||||
#include "duckdb/common/types/uuid.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
struct UUIDValueConversion {
|
||||
static hugeint_t ReadParquetUUID(const_data_ptr_t input) {
|
||||
// Use the utility function from BaseUUID
|
||||
return BaseUUID::FromBlob(input);
|
||||
}
|
||||
|
||||
template <bool CHECKED>
|
||||
static hugeint_t PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
if (CHECKED) {
|
||||
plain_data.available(sizeof(hugeint_t));
|
||||
}
|
||||
auto res = ReadParquetUUID(const_data_ptr_cast(plain_data.ptr));
|
||||
plain_data.unsafe_inc(sizeof(hugeint_t));
|
||||
return res;
|
||||
}
|
||||
|
||||
template <bool CHECKED>
|
||||
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
|
||||
if (CHECKED) {
|
||||
plain_data.inc(sizeof(hugeint_t));
|
||||
} else {
|
||||
plain_data.unsafe_inc(sizeof(hugeint_t));
|
||||
}
|
||||
}
|
||||
|
||||
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
|
||||
return plain_data.check_available(count * sizeof(hugeint_t));
|
||||
}
|
||||
|
||||
static idx_t PlainConstantSize() {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
//! Column reader for UUIDs; all decoding is done by UUIDValueConversion
class UUIDColumnReader : public TemplatedColumnReader<hugeint_t, UUIDValueConversion> {
public:
	UUIDColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
	    : TemplatedColumnReader(reader, schema) {
	}
};
|
||||
|
||||
} // namespace duckdb
|
||||
148
external/duckdb/extension/parquet/include/reader/variant/variant_binary_decoder.hpp
vendored
Normal file
148
external/duckdb/extension/parquet/include/reader/variant/variant_binary_decoder.hpp
vendored
Normal file
@@ -0,0 +1,148 @@
|
||||
#pragma once
|
||||
|
||||
#include "duckdb/common/types/string_type.hpp"
|
||||
#include "duckdb/common/types/value.hpp"
|
||||
#include "reader/variant/variant_value.hpp"
|
||||
|
||||
using namespace duckdb_yyjson;
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
//! ------------ Metadata ------------
|
||||
|
||||
//! The decoded header byte of the Variant metadata.
//! All fields carry a default so that a default-constructed header never holds indeterminate values.
struct VariantMetadataHeader {
public:
	//! Decodes the header from its single-byte encoding
	static VariantMetadataHeader FromHeaderByte(uint8_t byte);

public:
	//! The version of the protocol used (only '1' supported for now)
	uint8_t version = 0;
	//! Number of bytes per dictionary size and offset field
	uint8_t offset_size = 0;
	//! Whether dictionary strings are sorted and unique
	bool sorted_strings = false;
};
|
||||
|
||||
struct VariantMetadata {
|
||||
public:
|
||||
explicit VariantMetadata(const string_t &metadata);
|
||||
|
||||
public:
|
||||
const string_t &metadata;
|
||||
|
||||
public:
|
||||
VariantMetadataHeader header;
|
||||
const_data_ptr_t offsets;
|
||||
const_data_ptr_t bytes;
|
||||
|
||||
//! The json object keys have to be null-terminated
|
||||
//! But we don't receive them null-terminated
|
||||
vector<string> strings;
|
||||
};
|
||||
|
||||
//! ------------ Value ------------
|
||||
|
||||
//! The basic type of a Variant value; INVALID doubles as the exclusive upper bound of the valid values
enum class VariantBasicType : uint8_t {
	PRIMITIVE = 0,
	SHORT_STRING = 1,
	OBJECT = 2,
	ARRAY = 3,
	INVALID
};
|
||||
|
||||
//! The primitive type of a Variant value; INVALID doubles as the exclusive upper bound of the valid values
enum class VariantPrimitiveType : uint8_t {
	// null and booleans
	NULL_TYPE = 0,
	BOOLEAN_TRUE = 1,
	BOOLEAN_FALSE = 2,
	// integers
	INT8 = 3,
	INT16 = 4,
	INT32 = 5,
	INT64 = 6,
	DOUBLE = 7,
	// decimals
	DECIMAL4 = 8,
	DECIMAL8 = 9,
	DECIMAL16 = 10,
	// date / timestamps (microsecond precision)
	DATE = 11,
	TIMESTAMP_MICROS = 12,
	TIMESTAMP_NTZ_MICROS = 13,
	FLOAT = 14,
	// binary and string data
	BINARY = 15,
	STRING = 16,
	// time / timestamps (nanosecond precision)
	TIME_NTZ_MICROS = 17,
	TIMESTAMP_NANOS = 18,
	TIMESTAMP_NTZ_NANOS = 19,
	UUID = 20,
	INVALID
};
|
||||
|
||||
struct VariantValueMetadata {
|
||||
public:
|
||||
VariantValueMetadata() {
|
||||
}
|
||||
|
||||
public:
|
||||
static VariantValueMetadata FromHeaderByte(uint8_t byte);
|
||||
static VariantBasicType VariantBasicTypeFromByte(uint8_t byte) {
|
||||
if (byte >= static_cast<uint8_t>(VariantBasicType::INVALID)) {
|
||||
throw NotImplementedException("Variant BasicType (%d) is not supported", byte);
|
||||
}
|
||||
return static_cast<VariantBasicType>(byte);
|
||||
}
|
||||
|
||||
static VariantPrimitiveType VariantPrimitiveTypeFromByte(uint8_t byte) {
|
||||
if (byte >= static_cast<uint8_t>(VariantPrimitiveType::INVALID)) {
|
||||
throw NotImplementedException("Variant PrimitiveType (%d) is not supported", byte);
|
||||
}
|
||||
return static_cast<VariantPrimitiveType>(byte);
|
||||
}
|
||||
|
||||
public:
|
||||
VariantBasicType basic_type;
|
||||
|
||||
public:
|
||||
//! Primitive Type header
|
||||
VariantPrimitiveType primitive_type;
|
||||
|
||||
public:
|
||||
//! Short String header
|
||||
uint8_t string_size;
|
||||
|
||||
public:
|
||||
//! Object header | Array header
|
||||
|
||||
//! Size in bytes for each 'field_offset' entry
|
||||
uint32_t field_offset_size;
|
||||
//! Size in bytes for each 'field_id' entry
|
||||
uint32_t field_id_size;
|
||||
//! Whether the number of elements is encoded in 1 byte (false) or 4 bytes (true)
|
||||
bool is_large;
|
||||
};
|
||||
|
||||
struct VariantDecodeResult {
|
||||
public:
|
||||
VariantDecodeResult() = default;
|
||||
~VariantDecodeResult() {
|
||||
if (doc) {
|
||||
yyjson_mut_doc_free(doc);
|
||||
}
|
||||
if (data) {
|
||||
free(data);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
yyjson_mut_doc *doc = nullptr;
|
||||
char *data = nullptr;
|
||||
};
|
||||
|
||||
class VariantBinaryDecoder {
|
||||
public:
|
||||
VariantBinaryDecoder() = delete;
|
||||
|
||||
public:
|
||||
static VariantValue Decode(const VariantMetadata &metadata, const_data_ptr_t data);
|
||||
|
||||
public:
|
||||
static VariantValue PrimitiveTypeDecode(const VariantValueMetadata &value_metadata, const_data_ptr_t data);
|
||||
static VariantValue ShortStringDecode(const VariantValueMetadata &value_metadata, const_data_ptr_t data);
|
||||
static VariantValue ObjectDecode(const VariantMetadata &metadata, const VariantValueMetadata &value_metadata,
|
||||
const_data_ptr_t data);
|
||||
static VariantValue ArrayDecode(const VariantMetadata &metadata, const VariantValueMetadata &value_metadata,
|
||||
const_data_ptr_t data);
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
24
external/duckdb/extension/parquet/include/reader/variant/variant_shredded_conversion.hpp
vendored
Normal file
24
external/duckdb/extension/parquet/include/reader/variant/variant_shredded_conversion.hpp
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
#pragma once
|
||||
|
||||
#include "reader/variant/variant_value.hpp"
|
||||
#include "reader/variant/variant_binary_decoder.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
class VariantShreddedConversion {
|
||||
public:
|
||||
VariantShreddedConversion() = delete;
|
||||
|
||||
public:
|
||||
static vector<VariantValue> Convert(Vector &metadata, Vector &group, idx_t offset, idx_t length, idx_t total_size,
|
||||
bool is_field);
|
||||
static vector<VariantValue> ConvertShreddedLeaf(Vector &metadata, Vector &value, Vector &typed_value, idx_t offset,
|
||||
idx_t length, idx_t total_size, const bool is_field);
|
||||
static vector<VariantValue> ConvertShreddedArray(Vector &metadata, Vector &value, Vector &typed_value, idx_t offset,
|
||||
idx_t length, idx_t total_size, const bool is_field);
|
||||
static vector<VariantValue> ConvertShreddedObject(Vector &metadata, Vector &value, Vector &typed_value,
|
||||
idx_t offset, idx_t length, idx_t total_size,
|
||||
const bool is_field);
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
54
external/duckdb/extension/parquet/include/reader/variant/variant_value.hpp
vendored
Normal file
54
external/duckdb/extension/parquet/include/reader/variant/variant_value.hpp
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
#pragma once
|
||||
|
||||
#include "duckdb/common/map.hpp"
|
||||
#include "duckdb/common/vector.hpp"
|
||||
#include "duckdb/common/types/value.hpp"
|
||||
|
||||
#include "yyjson.hpp"
|
||||
|
||||
using namespace duckdb_yyjson;
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
//! The kind of content held by a VariantValue
enum class VariantValueType : uint8_t {
	PRIMITIVE,
	OBJECT,
	ARRAY,
	MISSING
};
|
||||
|
||||
struct VariantValue {
|
||||
public:
|
||||
VariantValue() : value_type(VariantValueType::MISSING) {
|
||||
}
|
||||
explicit VariantValue(VariantValueType type) : value_type(type) {
|
||||
}
|
||||
explicit VariantValue(Value &&val) : value_type(VariantValueType::PRIMITIVE), primitive_value(std::move(val)) {
|
||||
}
|
||||
// Delete copy constructor and copy assignment operator
|
||||
VariantValue(const VariantValue &) = delete;
|
||||
VariantValue &operator=(const VariantValue &) = delete;
|
||||
|
||||
// Default move constructor and move assignment operator
|
||||
VariantValue(VariantValue &&) noexcept = default;
|
||||
VariantValue &operator=(VariantValue &&) noexcept = default;
|
||||
|
||||
public:
|
||||
bool IsNull() const {
|
||||
return value_type == VariantValueType::PRIMITIVE && primitive_value.IsNull();
|
||||
}
|
||||
bool IsMissing() const {
|
||||
return value_type == VariantValueType::MISSING;
|
||||
}
|
||||
|
||||
public:
|
||||
void AddChild(const string &key, VariantValue &&val);
|
||||
void AddItem(VariantValue &&val);
|
||||
|
||||
public:
|
||||
yyjson_mut_val *ToJSON(ClientContext &context, yyjson_mut_doc *doc) const;
|
||||
|
||||
public:
|
||||
VariantValueType value_type;
|
||||
//! FIXME: how can we get a deterministic child order for a partially shredded object?
|
||||
map<string, VariantValue> object_children;
|
||||
vector<VariantValue> array_items;
|
||||
Value primitive_value;
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
44
external/duckdb/extension/parquet/include/reader/variant_column_reader.hpp
vendored
Normal file
44
external/duckdb/extension/parquet/include/reader/variant_column_reader.hpp
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// reader/variant_column_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "column_reader.hpp"
|
||||
#include "reader/templated_column_reader.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
class VariantColumnReader : public ColumnReader {
|
||||
public:
|
||||
static constexpr const PhysicalType TYPE = PhysicalType::VARCHAR;
|
||||
|
||||
public:
|
||||
VariantColumnReader(ClientContext &context, ParquetReader &reader, const ParquetColumnSchema &schema,
|
||||
vector<unique_ptr<ColumnReader>> child_readers_p);
|
||||
|
||||
ClientContext &context;
|
||||
vector<unique_ptr<ColumnReader>> child_readers;
|
||||
|
||||
public:
|
||||
ColumnReader &GetChildReader(idx_t child_idx);
|
||||
|
||||
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override;
|
||||
|
||||
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) override;
|
||||
|
||||
void Skip(idx_t num_values) override;
|
||||
idx_t GroupRowsAvailable() override;
|
||||
uint64_t TotalCompressedSize() override;
|
||||
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override;
|
||||
|
||||
protected:
|
||||
idx_t metadata_reader_idx;
|
||||
idx_t value_reader_idx;
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
Reference in New Issue
Block a user