should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion


@@ -0,0 +1,91 @@
cmake_minimum_required(VERSION 3.5...3.29)
project(ParquetExtension)
include_directories(
include ../../third_party/lz4 ../../third_party/parquet
../../third_party/thrift ../../third_party/snappy
../../third_party/brotli/include)
add_subdirectory(decoder)
add_subdirectory(reader)
add_subdirectory(writer)
set(PARQUET_EXTENSION_FILES
${PARQUET_EXTENSION_FILES}
column_reader.cpp
column_writer.cpp
parquet_crypto.cpp
parquet_extension.cpp
parquet_file_metadata_cache.cpp
parquet_float16.cpp
parquet_multi_file_info.cpp
parquet_metadata.cpp
parquet_reader.cpp
parquet_field_id.cpp
parquet_statistics.cpp
parquet_timestamp.cpp
parquet_writer.cpp
parquet_shredding.cpp
serialize_parquet.cpp
zstd_file_system.cpp
geo_parquet.cpp)
if(NOT CLANG_TIDY)
# parquet/thrift/snappy
set(PARQUET_EXTENSION_FILES
${PARQUET_EXTENSION_FILES}
../../third_party/parquet/parquet_types.cpp
../../third_party/thrift/thrift/protocol/TProtocol.cpp
../../third_party/thrift/thrift/transport/TTransportException.cpp
../../third_party/thrift/thrift/transport/TBufferTransports.cpp
../../third_party/snappy/snappy.cc
../../third_party/snappy/snappy-sinksource.cc)
# lz4
set(PARQUET_EXTENSION_FILES ${PARQUET_EXTENSION_FILES}
../../third_party/lz4/lz4.cpp)
# brotli
set(PARQUET_EXTENSION_FILES
${PARQUET_EXTENSION_FILES}
../../third_party/brotli/enc/dictionary_hash.cpp
../../third_party/brotli/enc/backward_references_hq.cpp
../../third_party/brotli/enc/histogram.cpp
../../third_party/brotli/enc/memory.cpp
../../third_party/brotli/enc/entropy_encode.cpp
../../third_party/brotli/enc/compound_dictionary.cpp
../../third_party/brotli/enc/compress_fragment_two_pass.cpp
../../third_party/brotli/enc/block_splitter.cpp
../../third_party/brotli/enc/command.cpp
../../third_party/brotli/enc/encode.cpp
../../third_party/brotli/enc/encoder_dict.cpp
../../third_party/brotli/enc/cluster.cpp
../../third_party/brotli/enc/backward_references.cpp
../../third_party/brotli/enc/utf8_util.cpp
../../third_party/brotli/enc/compress_fragment.cpp
../../third_party/brotli/enc/fast_log.cpp
../../third_party/brotli/enc/brotli_bit_stream.cpp
../../third_party/brotli/enc/bit_cost.cpp
../../third_party/brotli/enc/static_dict.cpp
../../third_party/brotli/enc/literal_cost.cpp
../../third_party/brotli/enc/metablock.cpp
../../third_party/brotli/common/dictionary.cpp
../../third_party/brotli/common/constants.cpp
../../third_party/brotli/common/transform.cpp
../../third_party/brotli/common/platform.cpp
../../third_party/brotli/common/shared_dictionary.cpp
../../third_party/brotli/common/context.cpp
../../third_party/brotli/dec/state.cpp
../../third_party/brotli/dec/decode.cpp
../../third_party/brotli/dec/huffman.cpp
../../third_party/brotli/dec/bit_reader.cpp)
endif()
build_static_extension(parquet ${PARQUET_EXTENSION_FILES})
set(PARAMETERS "-warnings")
build_loadable_extension(parquet ${PARAMETERS} ${PARQUET_EXTENSION_FILES})
target_link_libraries(parquet_loadable_extension duckdb_mbedtls duckdb_zstd)
install(
TARGETS parquet_extension
EXPORT "${DUCKDB_EXPORT_SET}"
LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
ARCHIVE DESTINATION "${INSTALL_LIB_DIR}")
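
The targets above yield both a statically linked extension and a loadable parquet extension. A minimal, illustrative sketch of exercising the loadable build from the DuckDB C++ API follows; it assumes the extension is discoverable by DuckDB (or already linked in statically, in which case the LOAD succeeds trivially) and uses example.parquet purely as a placeholder path.

#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);    // in-memory database
	duckdb::Connection con(db);
	con.Query("LOAD parquet;");    // needed only when the extension is not built in
	auto result = con.Query("SELECT count(*) FROM read_parquet('example.parquet');");
	result->Print();
	return 0;
}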


@@ -0,0 +1,911 @@
#include "column_reader.hpp"
#include "reader/boolean_column_reader.hpp"
#include "brotli/decode.h"
#include "reader/callback_column_reader.hpp"
#include "reader/decimal_column_reader.hpp"
#include "duckdb.hpp"
#include "reader/expression_column_reader.hpp"
#include "reader/interval_column_reader.hpp"
#include "reader/list_column_reader.hpp"
#include "lz4.hpp"
#include "miniz_wrapper.hpp"
#include "reader/null_column_reader.hpp"
#include "parquet_reader.hpp"
#include "parquet_timestamp.hpp"
#include "parquet_float16.hpp"
#include "reader/row_number_column_reader.hpp"
#include "snappy.h"
#include "reader/string_column_reader.hpp"
#include "reader/struct_column_reader.hpp"
#include "reader/templated_column_reader.hpp"
#include "reader/uuid_column_reader.hpp"
#include "zstd.h"
#include "duckdb/storage/table/column_segment.hpp"
#include "duckdb/common/helper.hpp"
#include "duckdb/common/types/bit.hpp"
namespace duckdb {
using duckdb_parquet::CompressionCodec;
using duckdb_parquet::ConvertedType;
using duckdb_parquet::Encoding;
using duckdb_parquet::PageType;
using duckdb_parquet::Type;
const uint64_t ParquetDecodeUtils::BITPACK_MASKS[] = {0,
1,
3,
7,
15,
31,
63,
127,
255,
511,
1023,
2047,
4095,
8191,
16383,
32767,
65535,
131071,
262143,
524287,
1048575,
2097151,
4194303,
8388607,
16777215,
33554431,
67108863,
134217727,
268435455,
536870911,
1073741823,
2147483647,
4294967295,
8589934591,
17179869183,
34359738367,
68719476735,
137438953471,
274877906943,
549755813887,
1099511627775,
2199023255551,
4398046511103,
8796093022207,
17592186044415,
35184372088831,
70368744177663,
140737488355327,
281474976710655,
562949953421311,
1125899906842623,
2251799813685247,
4503599627370495,
9007199254740991,
18014398509481983,
36028797018963967,
72057594037927935,
144115188075855871,
288230376151711743,
576460752303423487,
1152921504606846975,
2305843009213693951,
4611686018427387903,
9223372036854775807,
18446744073709551615ULL};
const uint64_t ParquetDecodeUtils::BITPACK_MASKS_SIZE = sizeof(ParquetDecodeUtils::BITPACK_MASKS) / sizeof(uint64_t);
const uint8_t ParquetDecodeUtils::BITPACK_DLEN = 8;
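// The mask table above is simply BITPACK_MASKS[w] == (1ULL << w) - 1 for each width w, with the final
// entry being the all-ones 64-bit mask, and BITPACK_DLEN is the number of bits per byte. As a hedged
// illustration (the helper below is hypothetical, not part of this file), such a mask keeps only the
// low `width` bits of a partially unpacked word:
//
// static uint64_t MaskLowBits(uint64_t word, uint8_t width) {
// 	return word & ParquetDecodeUtils::BITPACK_MASKS[width];
// }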
ColumnReader::ColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema_p)
: column_schema(schema_p), reader(reader), page_rows_available(0), dictionary_decoder(*this),
delta_binary_packed_decoder(*this), rle_decoder(*this), delta_length_byte_array_decoder(*this),
delta_byte_array_decoder(*this), byte_stream_split_decoder(*this) {
}
ColumnReader::~ColumnReader() {
}
Allocator &ColumnReader::GetAllocator() {
return reader.allocator;
}
ParquetReader &ColumnReader::Reader() {
return reader;
}
void ColumnReader::RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) {
if (chunk) {
uint64_t size = chunk->meta_data.total_compressed_size;
transport.RegisterPrefetch(FileOffset(), size, allow_merge);
}
}
unique_ptr<BaseStatistics> ColumnReader::Stats(idx_t row_group_idx_p, const vector<ColumnChunk> &columns) {
return Schema().Stats(*reader.GetFileMetadata(), reader.parquet_options, row_group_idx_p, columns);
}
uint64_t ColumnReader::TotalCompressedSize() {
if (!chunk) {
return 0;
}
return chunk->meta_data.total_compressed_size;
}
// Note: It's not trivial to determine where all Column data is stored. Chunk->file_offset
// apparently is not the first page of the data. Therefore we determine the address of the first page by taking the
// minimum of all page offsets.
idx_t ColumnReader::FileOffset() const {
if (!chunk) {
throw std::runtime_error("FileOffset called on ColumnReader with no chunk");
}
auto min_offset = NumericLimits<idx_t>::Maximum();
if (chunk->meta_data.__isset.dictionary_page_offset) {
min_offset = MinValue<idx_t>(min_offset, chunk->meta_data.dictionary_page_offset);
}
if (chunk->meta_data.__isset.index_page_offset) {
min_offset = MinValue<idx_t>(min_offset, chunk->meta_data.index_page_offset);
}
min_offset = MinValue<idx_t>(min_offset, chunk->meta_data.data_page_offset);
return min_offset;
}
idx_t ColumnReader::GroupRowsAvailable() {
return group_rows_available;
}
void ColumnReader::PlainSkip(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values) {
throw NotImplementedException("PlainSkip not implemented");
}
void ColumnReader::Plain(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values, // NOLINT
idx_t result_offset, Vector &result) {
throw NotImplementedException("Plain not implemented");
}
void ColumnReader::Plain(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values,
idx_t result_offset, Vector &result) {
Plain(*plain_data, defines, num_values, result_offset, result);
}
void ColumnReader::PlainSelect(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values,
Vector &result, const SelectionVector &sel, idx_t count) {
throw NotImplementedException("PlainSelect not implemented");
}
void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) {
D_ASSERT(ColumnIndex() < columns.size());
chunk = &columns[ColumnIndex()];
protocol = &protocol_p;
D_ASSERT(chunk);
D_ASSERT(chunk->__isset.meta_data);
if (chunk->__isset.file_path) {
throw InvalidInputException("Failed to read file \"%s\": Only inlined data files are supported (no references)",
Reader().GetFileName());
}
// ugh. sometimes there is an extra offset for the dict. sometimes it's wrong.
chunk_read_offset = chunk->meta_data.data_page_offset;
if (chunk->meta_data.__isset.dictionary_page_offset && chunk->meta_data.dictionary_page_offset >= 4) {
// this assumes the data pages follow the dict pages directly.
chunk_read_offset = chunk->meta_data.dictionary_page_offset;
}
group_rows_available = chunk->meta_data.num_values;
}
bool ColumnReader::PageIsFilteredOut(PageHeader &page_hdr) {
if (!dictionary_decoder.HasFilteredOutAllValues()) {
return false;
}
if (page_hdr.type != PageType::DATA_PAGE && page_hdr.type != PageType::DATA_PAGE_V2) {
// we can only filter out data pages
return false;
}
bool is_v1 = page_hdr.type == PageType::DATA_PAGE;
auto &v1_header = page_hdr.data_page_header;
auto &v2_header = page_hdr.data_page_header_v2;
auto page_encoding = is_v1 ? v1_header.encoding : v2_header.encoding;
if (page_encoding != Encoding::PLAIN_DICTIONARY && page_encoding != Encoding::RLE_DICTIONARY) {
// not a dictionary page
return false;
}
// the page has been filtered out!
// skip forward
auto &trans = reinterpret_cast<ThriftFileTransport &>(*protocol->getTransport());
trans.Skip(page_hdr.compressed_page_size);
page_rows_available = is_v1 ? v1_header.num_values : v2_header.num_values;
encoding = ColumnEncoding::DICTIONARY;
page_is_filtered_out = true;
return true;
}
void ColumnReader::PrepareRead(optional_ptr<const TableFilter> filter, optional_ptr<TableFilterState> filter_state) {
encoding = ColumnEncoding::INVALID;
defined_decoder.reset();
page_is_filtered_out = false;
block.reset();
PageHeader page_hdr;
auto &trans = reinterpret_cast<ThriftFileTransport &>(*protocol->getTransport());
if (trans.HasPrefetch()) {
// Already has some data prefetched, let's not mess with it
reader.Read(page_hdr, *protocol);
} else {
// No prefetch yet, prefetch the full header in one go (so thrift won't read byte-by-byte from storage)
// 256 bytes should cover almost all headers (unless it's a V2 header with really LONG string statistics)
static constexpr idx_t ASSUMED_HEADER_SIZE = 256;
const auto prefetch_size = MinValue(trans.GetSize() - trans.GetLocation(), ASSUMED_HEADER_SIZE);
trans.Prefetch(trans.GetLocation(), prefetch_size);
reader.Read(page_hdr, *protocol);
trans.ClearPrefetch();
}
// some basic sanity check
if (page_hdr.compressed_page_size < 0 || page_hdr.uncompressed_page_size < 0) {
throw InvalidInputException("Failed to read file \"%s\": Page sizes can't be < 0", Reader().GetFileName());
}
if (PageIsFilteredOut(page_hdr)) {
// this page has been filtered out so we don't need to read it
return;
}
switch (page_hdr.type) {
case PageType::DATA_PAGE_V2:
PreparePageV2(page_hdr);
PrepareDataPage(page_hdr);
break;
case PageType::DATA_PAGE:
PreparePage(page_hdr);
PrepareDataPage(page_hdr);
break;
case PageType::DICTIONARY_PAGE: {
PreparePage(page_hdr);
auto dictionary_size = page_hdr.dictionary_page_header.num_values;
if (dictionary_size < 0) {
throw InvalidInputException("Failed to read file \"%s\": Invalid dictionary page header (num_values < 0)",
Reader().GetFileName());
}
dictionary_decoder.InitializeDictionary(dictionary_size, filter, filter_state, HasDefines());
break;
}
default:
break; // ignore INDEX page type and any other custom extensions
}
ResetPage();
}
void ColumnReader::ResetPage() {
}
void ColumnReader::PreparePageV2(PageHeader &page_hdr) {
D_ASSERT(page_hdr.type == PageType::DATA_PAGE_V2);
AllocateBlock(page_hdr.uncompressed_page_size + 1);
bool uncompressed = false;
if (page_hdr.data_page_header_v2.__isset.is_compressed && !page_hdr.data_page_header_v2.is_compressed) {
uncompressed = true;
}
if (chunk->meta_data.codec == CompressionCodec::UNCOMPRESSED) {
if (page_hdr.compressed_page_size != page_hdr.uncompressed_page_size) {
throw InvalidInputException("Failed to read file \"%s\": Page size mismatch", Reader().GetFileName());
}
uncompressed = true;
}
if (uncompressed) {
reader.ReadData(*protocol, block->ptr, page_hdr.compressed_page_size);
return;
}
// copy repeats & defines as-is because FOR SOME REASON they are uncompressed
auto uncompressed_bytes = page_hdr.data_page_header_v2.repetition_levels_byte_length +
page_hdr.data_page_header_v2.definition_levels_byte_length;
if (uncompressed_bytes > page_hdr.uncompressed_page_size) {
throw InvalidInputException(
"Failed to read file \"%s\": header inconsistency, uncompressed_page_size needs to be larger than "
"repetition_levels_byte_length + definition_levels_byte_length",
Reader().GetFileName());
}
reader.ReadData(*protocol, block->ptr, uncompressed_bytes);
auto compressed_bytes = page_hdr.compressed_page_size - uncompressed_bytes;
if (compressed_bytes > 0) {
ResizeableBuffer compressed_buffer;
compressed_buffer.resize(GetAllocator(), compressed_bytes);
reader.ReadData(*protocol, compressed_buffer.ptr, compressed_bytes);
DecompressInternal(chunk->meta_data.codec, compressed_buffer.ptr, compressed_bytes,
block->ptr + uncompressed_bytes, page_hdr.uncompressed_page_size - uncompressed_bytes);
}
}
void ColumnReader::AllocateBlock(idx_t size) {
if (!block) {
block = make_shared_ptr<ResizeableBuffer>(GetAllocator(), size);
} else {
block->resize(GetAllocator(), size);
}
}
void ColumnReader::PreparePage(PageHeader &page_hdr) {
AllocateBlock(page_hdr.uncompressed_page_size + 1);
if (chunk->meta_data.codec == CompressionCodec::UNCOMPRESSED) {
if (page_hdr.compressed_page_size != page_hdr.uncompressed_page_size) {
throw std::runtime_error("Page size mismatch");
}
reader.ReadData(*protocol, block->ptr, page_hdr.compressed_page_size);
return;
}
ResizeableBuffer compressed_buffer;
compressed_buffer.resize(GetAllocator(), page_hdr.compressed_page_size + 1);
reader.ReadData(*protocol, compressed_buffer.ptr, page_hdr.compressed_page_size);
DecompressInternal(chunk->meta_data.codec, compressed_buffer.ptr, page_hdr.compressed_page_size, block->ptr,
page_hdr.uncompressed_page_size);
}
void ColumnReader::DecompressInternal(CompressionCodec::type codec, const_data_ptr_t src, idx_t src_size,
data_ptr_t dst, idx_t dst_size) {
switch (codec) {
case CompressionCodec::UNCOMPRESSED:
throw InternalException("Parquet data unexpectedly uncompressed");
case CompressionCodec::GZIP: {
MiniZStream s;
s.Decompress(const_char_ptr_cast(src), src_size, char_ptr_cast(dst), dst_size);
break;
}
case CompressionCodec::LZ4_RAW: {
auto res =
duckdb_lz4::LZ4_decompress_safe(const_char_ptr_cast(src), char_ptr_cast(dst),
UnsafeNumericCast<int32_t>(src_size), UnsafeNumericCast<int32_t>(dst_size));
if (res != NumericCast<int>(dst_size)) {
throw InvalidInputException("Failed to read file \"%s\": LZ4 decompression failure",
Reader().GetFileName());
}
break;
}
case CompressionCodec::SNAPPY: {
{
size_t uncompressed_size = 0;
auto res = duckdb_snappy::GetUncompressedLength(const_char_ptr_cast(src), src_size, &uncompressed_size);
if (!res) {
throw InvalidInputException("Failed to read file \"%s\": Snappy decompression failure",
Reader().GetFileName());
}
if (uncompressed_size != dst_size) {
throw InvalidInputException(
"Failed to read file \"%s\": Snappy decompression failure: Uncompressed data size mismatch",
Reader().GetFileName());
}
}
auto res = duckdb_snappy::RawUncompress(const_char_ptr_cast(src), src_size, char_ptr_cast(dst));
if (!res) {
throw InvalidInputException("Failed to read file \"%s\": Snappy decompression failure",
Reader().GetFileName());
}
break;
}
case CompressionCodec::ZSTD: {
auto res = duckdb_zstd::ZSTD_decompress(dst, dst_size, src, src_size);
if (duckdb_zstd::ZSTD_isError(res) || res != dst_size) {
throw InvalidInputException("Failed to read file \"%s\": ZSTD Decompression failure",
Reader().GetFileName());
}
break;
}
case CompressionCodec::BROTLI: {
auto state = duckdb_brotli::BrotliDecoderCreateInstance(nullptr, nullptr, nullptr);
size_t total_out = 0;
auto src_size_size_t = NumericCast<size_t>(src_size);
auto dst_size_size_t = NumericCast<size_t>(dst_size);
auto res = duckdb_brotli::BrotliDecoderDecompressStream(state, &src_size_size_t, &src, &dst_size_size_t, &dst,
&total_out);
if (res != duckdb_brotli::BROTLI_DECODER_RESULT_SUCCESS) {
throw InvalidInputException("Failed to read file \"%s\": Brotli Decompression failure",
Reader().GetFileName());
}
duckdb_brotli::BrotliDecoderDestroyInstance(state);
break;
}
default: {
duckdb::stringstream codec_name;
codec_name << codec;
throw InvalidInputException("Failed to read file \"%s\": Unsupported compression codec \"%s\". Supported "
"options are uncompressed, brotli, gzip, lz4_raw, snappy or zstd",
Reader().GetFileName(), codec_name.str());
}
}
}
void ColumnReader::PrepareDataPage(PageHeader &page_hdr) {
if (page_hdr.type == PageType::DATA_PAGE && !page_hdr.__isset.data_page_header) {
throw InvalidInputException("Failed to read file \"%s\": Missing data page header from data page",
Reader().GetFileName());
}
if (page_hdr.type == PageType::DATA_PAGE_V2 && !page_hdr.__isset.data_page_header_v2) {
throw InvalidInputException("Failed to read file \"%s\": Missing data page header from data page v2",
Reader().GetFileName());
}
bool is_v1 = page_hdr.type == PageType::DATA_PAGE;
bool is_v2 = page_hdr.type == PageType::DATA_PAGE_V2;
auto &v1_header = page_hdr.data_page_header;
auto &v2_header = page_hdr.data_page_header_v2;
page_rows_available = is_v1 ? v1_header.num_values : v2_header.num_values;
auto page_encoding = is_v1 ? v1_header.encoding : v2_header.encoding;
if (HasRepeats()) {
uint32_t rep_length = is_v1 ? block->read<uint32_t>() : v2_header.repetition_levels_byte_length;
block->available(rep_length);
repeated_decoder = make_uniq<RleBpDecoder>(block->ptr, rep_length, RleBpDecoder::ComputeBitWidth(MaxRepeat()));
block->inc(rep_length);
} else if (is_v2 && v2_header.repetition_levels_byte_length > 0) {
block->inc(v2_header.repetition_levels_byte_length);
}
if (HasDefines()) {
uint32_t def_length = is_v1 ? block->read<uint32_t>() : v2_header.definition_levels_byte_length;
block->available(def_length);
defined_decoder = make_uniq<RleBpDecoder>(block->ptr, def_length, RleBpDecoder::ComputeBitWidth(MaxDefine()));
block->inc(def_length);
} else if (is_v2 && v2_header.definition_levels_byte_length > 0) {
block->inc(v2_header.definition_levels_byte_length);
}
switch (page_encoding) {
case Encoding::RLE_DICTIONARY:
case Encoding::PLAIN_DICTIONARY: {
encoding = ColumnEncoding::DICTIONARY;
dictionary_decoder.InitializePage();
break;
}
case Encoding::RLE: {
encoding = ColumnEncoding::RLE;
rle_decoder.InitializePage();
break;
}
case Encoding::DELTA_BINARY_PACKED: {
encoding = ColumnEncoding::DELTA_BINARY_PACKED;
delta_binary_packed_decoder.InitializePage();
break;
}
case Encoding::DELTA_LENGTH_BYTE_ARRAY: {
encoding = ColumnEncoding::DELTA_LENGTH_BYTE_ARRAY;
delta_length_byte_array_decoder.InitializePage();
break;
}
case Encoding::DELTA_BYTE_ARRAY: {
encoding = ColumnEncoding::DELTA_BYTE_ARRAY;
delta_byte_array_decoder.InitializePage();
break;
}
case Encoding::BYTE_STREAM_SPLIT: {
encoding = ColumnEncoding::BYTE_STREAM_SPLIT;
byte_stream_split_decoder.InitializePage();
break;
}
case Encoding::PLAIN:
// nothing to do here, will be read directly below
encoding = ColumnEncoding::PLAIN;
break;
default:
throw InvalidInputException("Failed to read file \"%s\": Unsupported page encoding", Reader().GetFileName());
}
}
void ColumnReader::BeginRead(data_ptr_t define_out, data_ptr_t repeat_out) {
// we need to reset the location because multiple column readers share the same protocol
auto &trans = reinterpret_cast<ThriftFileTransport &>(*protocol->getTransport());
trans.SetLocation(chunk_read_offset);
// Perform any skips that were not applied yet.
if (define_out && repeat_out) {
ApplyPendingSkips(define_out, repeat_out);
}
}
idx_t ColumnReader::ReadPageHeaders(idx_t max_read, optional_ptr<const TableFilter> filter,
optional_ptr<TableFilterState> filter_state) {
while (page_rows_available == 0) {
PrepareRead(filter, filter_state);
}
return MinValue<idx_t>(MinValue<idx_t>(max_read, page_rows_available), STANDARD_VECTOR_SIZE);
}
bool ColumnReader::PrepareRead(idx_t read_now, data_ptr_t define_out, data_ptr_t repeat_out, idx_t result_offset) {
D_ASSERT(block);
D_ASSERT(read_now + result_offset <= STANDARD_VECTOR_SIZE);
D_ASSERT(!page_is_filtered_out);
if (HasRepeats()) {
D_ASSERT(repeated_decoder);
repeated_decoder->GetBatch<uint8_t>(repeat_out + result_offset, read_now);
}
if (HasDefines()) {
D_ASSERT(defined_decoder);
const auto max_define = NumericCast<uint8_t>(MaxDefine());
if (!HasRepeats() && defined_decoder->HasRepeatedBatch<uint8_t>(read_now, max_define)) {
// Fast path: no repeats and all valid
defined_decoder->GetRepeatedBatch<uint8_t>(read_now, max_define);
return true;
}
defined_decoder->GetBatch<uint8_t>(define_out + result_offset, read_now);
return false;
}
return true; // No defines, so everything is valid
}
void ColumnReader::ReadData(idx_t read_now, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result,
idx_t result_offset) {
// flatten the result vector if required
if (result_offset != 0 && result.GetVectorType() != VectorType::FLAT_VECTOR) {
result.Flatten(result_offset);
result.Resize(result_offset, STANDARD_VECTOR_SIZE);
}
if (page_is_filtered_out) {
// page is filtered out - emit NULL for any rows
auto &validity = FlatVector::Validity(result);
for (idx_t i = 0; i < read_now; i++) {
validity.SetInvalid(result_offset + i);
}
page_rows_available -= read_now;
return;
}
// read the defines/repeats
const auto all_valid = PrepareRead(read_now, define_out, repeat_out, result_offset);
// read the data according to the encoder
const auto define_ptr = all_valid ? nullptr : static_cast<uint8_t *>(define_out);
switch (encoding) {
case ColumnEncoding::DICTIONARY:
dictionary_decoder.Read(define_ptr, read_now, result, result_offset);
break;
case ColumnEncoding::DELTA_BINARY_PACKED:
delta_binary_packed_decoder.Read(define_ptr, read_now, result, result_offset);
break;
case ColumnEncoding::RLE:
rle_decoder.Read(define_ptr, read_now, result, result_offset);
break;
case ColumnEncoding::DELTA_LENGTH_BYTE_ARRAY:
delta_length_byte_array_decoder.Read(block, define_ptr, read_now, result, result_offset);
break;
case ColumnEncoding::DELTA_BYTE_ARRAY:
delta_byte_array_decoder.Read(define_ptr, read_now, result, result_offset);
break;
case ColumnEncoding::BYTE_STREAM_SPLIT:
byte_stream_split_decoder.Read(define_ptr, read_now, result, result_offset);
break;
default:
Plain(block, define_ptr, read_now, result_offset, result);
break;
}
page_rows_available -= read_now;
}
void ColumnReader::FinishRead(idx_t read_count) {
auto &trans = reinterpret_cast<ThriftFileTransport &>(*protocol->getTransport());
chunk_read_offset = trans.GetLocation();
group_rows_available -= read_count;
}
idx_t ColumnReader::ReadInternal(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) {
idx_t result_offset = 0;
auto to_read = num_values;
D_ASSERT(to_read <= STANDARD_VECTOR_SIZE);
while (to_read > 0) {
auto read_now = ReadPageHeaders(to_read);
ReadData(read_now, define_out, repeat_out, result, result_offset);
result_offset += read_now;
to_read -= read_now;
}
FinishRead(num_values);
return num_values;
}
idx_t ColumnReader::Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) {
BeginRead(define_out, repeat_out);
return ReadInternal(num_values, define_out, repeat_out, result);
}
void ColumnReader::Select(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out,
const SelectionVector &sel, idx_t approved_tuple_count) {
if (SupportsDirectSelect() && approved_tuple_count < num_values) {
DirectSelect(num_values, define_out, repeat_out, result_out, sel, approved_tuple_count);
return;
}
Read(num_values, define_out, repeat_out, result_out);
}
void ColumnReader::DirectSelect(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result,
const SelectionVector &sel, idx_t approved_tuple_count) {
auto to_read = num_values;
// prepare the first read if we haven't yet
BeginRead(define_out, repeat_out);
auto read_now = ReadPageHeaders(num_values);
// we can only push the filter into the decoder if we are reading the ENTIRE vector in one go
if (read_now == to_read && encoding == ColumnEncoding::PLAIN) {
const auto all_valid = PrepareRead(read_now, define_out, repeat_out, 0);
const auto define_ptr = all_valid ? nullptr : static_cast<uint8_t *>(define_out);
PlainSelect(block, define_ptr, read_now, result, sel, approved_tuple_count);
page_rows_available -= read_now;
FinishRead(num_values);
return;
}
// fallback to regular read + filter
ReadInternal(num_values, define_out, repeat_out, result);
}
void ColumnReader::Filter(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result,
const TableFilter &filter, TableFilterState &filter_state, SelectionVector &sel,
idx_t &approved_tuple_count, bool is_first_filter) {
if (SupportsDirectFilter() && is_first_filter) {
DirectFilter(num_values, define_out, repeat_out, result, filter, filter_state, sel, approved_tuple_count);
return;
}
Select(num_values, define_out, repeat_out, result, sel, approved_tuple_count);
ApplyFilter(result, filter, filter_state, num_values, sel, approved_tuple_count);
}
void ColumnReader::DirectFilter(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result,
const TableFilter &filter, TableFilterState &filter_state, SelectionVector &sel,
idx_t &approved_tuple_count) {
auto to_read = num_values;
// prepare the first read if we haven't yet
BeginRead(define_out, repeat_out);
auto read_now = ReadPageHeaders(num_values, &filter, &filter_state);
// we can only push the filter into the decoder if we are reading the ENTIRE vector in one go
if (encoding == ColumnEncoding::DICTIONARY && read_now == to_read && dictionary_decoder.HasFilter()) {
if (page_is_filtered_out) {
// the page has been filtered out entirely - skip
approved_tuple_count = 0;
} else {
// Push filter into dictionary directly
// read the defines/repeats
const auto all_valid = PrepareRead(read_now, define_out, repeat_out, 0);
const auto define_ptr = all_valid ? nullptr : static_cast<uint8_t *>(define_out);
dictionary_decoder.Filter(define_ptr, read_now, result, sel, approved_tuple_count);
}
page_rows_available -= read_now;
FinishRead(num_values);
return;
}
// fallback to regular read + filter
ReadInternal(num_values, define_out, repeat_out, result);
ApplyFilter(result, filter, filter_state, num_values, sel, approved_tuple_count);
}
void ColumnReader::ApplyFilter(Vector &v, const TableFilter &filter, TableFilterState &filter_state, idx_t scan_count,
SelectionVector &sel, idx_t &approved_tuple_count) {
UnifiedVectorFormat vdata;
v.ToUnifiedFormat(scan_count, vdata);
ColumnSegment::FilterSelection(sel, v, vdata, filter, filter_state, scan_count, approved_tuple_count);
}
void ColumnReader::Skip(idx_t num_values) {
pending_skips += num_values;
}
void ColumnReader::ApplyPendingSkips(data_ptr_t define_out, data_ptr_t repeat_out) {
if (pending_skips == 0) {
return;
}
idx_t num_values = pending_skips;
pending_skips = 0;
auto to_skip = num_values;
// start reading but do not apply skips (we are skipping now)
BeginRead(nullptr, nullptr);
while (to_skip > 0) {
auto skip_now = ReadPageHeaders(to_skip);
if (page_is_filtered_out) {
// the page has been filtered out entirely - skip
page_rows_available -= skip_now;
to_skip -= skip_now;
continue;
}
const auto all_valid = PrepareRead(skip_now, define_out, repeat_out, 0);
const auto define_ptr = all_valid ? nullptr : static_cast<uint8_t *>(define_out);
switch (encoding) {
case ColumnEncoding::DICTIONARY:
dictionary_decoder.Skip(define_ptr, skip_now);
break;
case ColumnEncoding::DELTA_BINARY_PACKED:
delta_binary_packed_decoder.Skip(define_ptr, skip_now);
break;
case ColumnEncoding::RLE:
rle_decoder.Skip(define_ptr, skip_now);
break;
case ColumnEncoding::DELTA_LENGTH_BYTE_ARRAY:
delta_length_byte_array_decoder.Skip(define_ptr, skip_now);
break;
case ColumnEncoding::DELTA_BYTE_ARRAY:
delta_byte_array_decoder.Skip(define_ptr, skip_now);
break;
case ColumnEncoding::BYTE_STREAM_SPLIT:
byte_stream_split_decoder.Skip(define_ptr, skip_now);
break;
default:
PlainSkip(*block, define_ptr, skip_now);
break;
}
page_rows_available -= skip_now;
to_skip -= skip_now;
}
FinishRead(num_values);
}
//===--------------------------------------------------------------------===//
// Create Column Reader
//===--------------------------------------------------------------------===//
template <class T>
static unique_ptr<ColumnReader> CreateDecimalReader(ParquetReader &reader, const ParquetColumnSchema &schema) {
switch (schema.type.InternalType()) {
case PhysicalType::INT16:
return make_uniq<TemplatedColumnReader<int16_t, TemplatedParquetValueConversion<T>>>(reader, schema);
case PhysicalType::INT32:
return make_uniq<TemplatedColumnReader<int32_t, TemplatedParquetValueConversion<T>>>(reader, schema);
case PhysicalType::INT64:
return make_uniq<TemplatedColumnReader<int64_t, TemplatedParquetValueConversion<T>>>(reader, schema);
case PhysicalType::INT128:
return make_uniq<TemplatedColumnReader<hugeint_t, TemplatedParquetValueConversion<T>>>(reader, schema);
default:
throw NotImplementedException("Unimplemented internal type for CreateDecimalReader");
}
}
unique_ptr<ColumnReader> ColumnReader::CreateReader(ParquetReader &reader, const ParquetColumnSchema &schema) {
switch (schema.type.id()) {
case LogicalTypeId::BOOLEAN:
return make_uniq<BooleanColumnReader>(reader, schema);
case LogicalTypeId::UTINYINT:
return make_uniq<TemplatedColumnReader<uint8_t, TemplatedParquetValueConversion<uint32_t>>>(reader, schema);
case LogicalTypeId::USMALLINT:
return make_uniq<TemplatedColumnReader<uint16_t, TemplatedParquetValueConversion<uint32_t>>>(reader, schema);
case LogicalTypeId::UINTEGER:
return make_uniq<TemplatedColumnReader<uint32_t, TemplatedParquetValueConversion<uint32_t>>>(reader, schema);
case LogicalTypeId::UBIGINT:
return make_uniq<TemplatedColumnReader<uint64_t, TemplatedParquetValueConversion<uint64_t>>>(reader, schema);
case LogicalTypeId::TINYINT:
return make_uniq<TemplatedColumnReader<int8_t, TemplatedParquetValueConversion<int32_t>>>(reader, schema);
case LogicalTypeId::SMALLINT:
return make_uniq<TemplatedColumnReader<int16_t, TemplatedParquetValueConversion<int32_t>>>(reader, schema);
case LogicalTypeId::INTEGER:
return make_uniq<TemplatedColumnReader<int32_t, TemplatedParquetValueConversion<int32_t>>>(reader, schema);
case LogicalTypeId::BIGINT:
return make_uniq<TemplatedColumnReader<int64_t, TemplatedParquetValueConversion<int64_t>>>(reader, schema);
case LogicalTypeId::FLOAT:
if (schema.type_info == ParquetExtraTypeInfo::FLOAT16) {
return make_uniq<CallbackColumnReader<uint16_t, float, Float16ToFloat32>>(reader, schema);
}
return make_uniq<TemplatedColumnReader<float, TemplatedParquetValueConversion<float>>>(reader, schema);
case LogicalTypeId::DOUBLE:
if (schema.type_info == ParquetExtraTypeInfo::DECIMAL_BYTE_ARRAY) {
return ParquetDecimalUtils::CreateReader(reader, schema);
}
return make_uniq<TemplatedColumnReader<double, TemplatedParquetValueConversion<double>>>(reader, schema);
case LogicalTypeId::TIMESTAMP:
case LogicalTypeId::TIMESTAMP_TZ:
switch (schema.type_info) {
case ParquetExtraTypeInfo::IMPALA_TIMESTAMP:
return make_uniq<CallbackColumnReader<Int96, timestamp_t, ImpalaTimestampToTimestamp>>(reader, schema);
case ParquetExtraTypeInfo::UNIT_MS:
return make_uniq<CallbackColumnReader<int64_t, timestamp_t, ParquetTimestampMsToTimestamp>>(reader, schema);
case ParquetExtraTypeInfo::UNIT_MICROS:
return make_uniq<CallbackColumnReader<int64_t, timestamp_t, ParquetTimestampMicrosToTimestamp>>(reader,
schema);
case ParquetExtraTypeInfo::UNIT_NS:
return make_uniq<CallbackColumnReader<int64_t, timestamp_t, ParquetTimestampNsToTimestamp>>(reader, schema);
default:
throw InternalException("TIMESTAMP requires type info");
}
case LogicalTypeId::TIMESTAMP_NS:
switch (schema.type_info) {
case ParquetExtraTypeInfo::IMPALA_TIMESTAMP:
return make_uniq<CallbackColumnReader<Int96, timestamp_ns_t, ImpalaTimestampToTimestampNS>>(reader, schema);
case ParquetExtraTypeInfo::UNIT_MS:
return make_uniq<CallbackColumnReader<int64_t, timestamp_ns_t, ParquetTimestampMsToTimestampNs>>(reader,
schema);
case ParquetExtraTypeInfo::UNIT_MICROS:
return make_uniq<CallbackColumnReader<int64_t, timestamp_ns_t, ParquetTimestampUsToTimestampNs>>(reader,
schema);
case ParquetExtraTypeInfo::UNIT_NS:
return make_uniq<CallbackColumnReader<int64_t, timestamp_ns_t, ParquetTimestampNsToTimestampNs>>(reader,
schema);
default:
throw InternalException("TIMESTAMP_NS requires type info");
}
case LogicalTypeId::DATE:
return make_uniq<CallbackColumnReader<int32_t, date_t, ParquetIntToDate>>(reader, schema);
case LogicalTypeId::TIME:
switch (schema.type_info) {
case ParquetExtraTypeInfo::UNIT_MS:
return make_uniq<CallbackColumnReader<int32_t, dtime_t, ParquetMsIntToTime>>(reader, schema);
case ParquetExtraTypeInfo::UNIT_MICROS:
return make_uniq<CallbackColumnReader<int64_t, dtime_t, ParquetIntToTime>>(reader, schema);
case ParquetExtraTypeInfo::UNIT_NS:
return make_uniq<CallbackColumnReader<int64_t, dtime_t, ParquetNsIntToTime>>(reader, schema);
default:
throw InternalException("TIME requires type info");
}
case LogicalTypeId::TIME_NS:
switch (schema.type_info) {
case ParquetExtraTypeInfo::UNIT_MS:
return make_uniq<CallbackColumnReader<int32_t, dtime_ns_t, ParquetMsIntToTimeNs>>(reader, schema);
case ParquetExtraTypeInfo::UNIT_MICROS:
return make_uniq<CallbackColumnReader<int64_t, dtime_ns_t, ParquetUsIntToTimeNs>>(reader, schema);
case ParquetExtraTypeInfo::UNIT_NS:
return make_uniq<CallbackColumnReader<int64_t, dtime_ns_t, ParquetIntToTimeNs>>(reader, schema);
default:
throw InternalException("TIME requires type info");
}
case LogicalTypeId::TIME_TZ:
switch (schema.type_info) {
case ParquetExtraTypeInfo::UNIT_MS:
return make_uniq<CallbackColumnReader<int32_t, dtime_tz_t, ParquetIntToTimeMsTZ>>(reader, schema);
case ParquetExtraTypeInfo::UNIT_MICROS:
return make_uniq<CallbackColumnReader<int64_t, dtime_tz_t, ParquetIntToTimeTZ>>(reader, schema);
case ParquetExtraTypeInfo::UNIT_NS:
return make_uniq<CallbackColumnReader<int64_t, dtime_tz_t, ParquetIntToTimeNsTZ>>(reader, schema);
default:
throw InternalException("TIME_TZ requires type info");
}
case LogicalTypeId::BLOB:
case LogicalTypeId::VARCHAR:
return make_uniq<StringColumnReader>(reader, schema);
case LogicalTypeId::DECIMAL:
// we have to figure out what kind of int we need
switch (schema.type_info) {
case ParquetExtraTypeInfo::DECIMAL_INT32:
return CreateDecimalReader<int32_t>(reader, schema);
case ParquetExtraTypeInfo::DECIMAL_INT64:
return CreateDecimalReader<int64_t>(reader, schema);
case ParquetExtraTypeInfo::DECIMAL_BYTE_ARRAY:
return ParquetDecimalUtils::CreateReader(reader, schema);
default:
throw NotImplementedException("Unrecognized Parquet type for Decimal");
}
break;
case LogicalTypeId::UUID:
return make_uniq<UUIDColumnReader>(reader, schema);
case LogicalTypeId::INTERVAL:
return make_uniq<IntervalColumnReader>(reader, schema);
case LogicalTypeId::SQLNULL:
return make_uniq<NullColumnReader>(reader, schema);
default:
break;
}
throw NotImplementedException(schema.type.ToString());
}
} // namespace duckdb


@@ -0,0 +1,669 @@
#include "column_writer.hpp"
#include "duckdb.hpp"
#include "geo_parquet.hpp"
#include "parquet_rle_bp_decoder.hpp"
#include "parquet_bss_encoder.hpp"
#include "parquet_statistics.hpp"
#include "parquet_writer.hpp"
#include "writer/array_column_writer.hpp"
#include "writer/boolean_column_writer.hpp"
#include "writer/decimal_column_writer.hpp"
#include "writer/enum_column_writer.hpp"
#include "writer/list_column_writer.hpp"
#include "writer/primitive_column_writer.hpp"
#include "writer/struct_column_writer.hpp"
#include "writer/variant_column_writer.hpp"
#include "writer/templated_column_writer.hpp"
#include "duckdb/common/exception.hpp"
#include "duckdb/common/operator/comparison_operators.hpp"
#include "duckdb/common/serializer/buffered_file_writer.hpp"
#include "duckdb/common/serializer/memory_stream.hpp"
#include "duckdb/common/serializer/write_stream.hpp"
#include "duckdb/common/string_map_set.hpp"
#include "duckdb/common/types/hugeint.hpp"
#include "duckdb/common/types/time.hpp"
#include "duckdb/common/types/timestamp.hpp"
#include "duckdb/execution/expression_executor.hpp"
#include "brotli/encode.h"
#include "lz4.hpp"
#include "miniz_wrapper.hpp"
#include "snappy.h"
#include "zstd.h"
#include <cmath>
namespace duckdb {
using namespace duckdb_parquet; // NOLINT
using namespace duckdb_miniz; // NOLINT
using duckdb_parquet::CompressionCodec;
using duckdb_parquet::ConvertedType;
using duckdb_parquet::Encoding;
using duckdb_parquet::FieldRepetitionType;
using duckdb_parquet::FileMetaData;
using duckdb_parquet::PageHeader;
using duckdb_parquet::PageType;
using ParquetRowGroup = duckdb_parquet::RowGroup;
using duckdb_parquet::Type;
constexpr uint16_t ColumnWriter::PARQUET_DEFINE_VALID;
//===--------------------------------------------------------------------===//
// ColumnWriterStatistics
//===--------------------------------------------------------------------===//
ColumnWriterStatistics::~ColumnWriterStatistics() {
}
bool ColumnWriterStatistics::HasStats() {
return false;
}
string ColumnWriterStatistics::GetMin() {
return string();
}
string ColumnWriterStatistics::GetMax() {
return string();
}
string ColumnWriterStatistics::GetMinValue() {
return string();
}
string ColumnWriterStatistics::GetMaxValue() {
return string();
}
bool ColumnWriterStatistics::CanHaveNaN() {
return false;
}
bool ColumnWriterStatistics::HasNaN() {
return false;
}
bool ColumnWriterStatistics::MinIsExact() {
return true;
}
bool ColumnWriterStatistics::MaxIsExact() {
return true;
}
bool ColumnWriterStatistics::HasGeoStats() {
return false;
}
optional_ptr<GeometryStatsData> ColumnWriterStatistics::GetGeoStats() {
return nullptr;
}
void ColumnWriterStatistics::WriteGeoStats(duckdb_parquet::GeospatialStatistics &stats) {
D_ASSERT(false); // this should never be called
}
//===--------------------------------------------------------------------===//
// ColumnWriter
//===--------------------------------------------------------------------===//
ColumnWriter::ColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
vector<string> schema_path_p, bool can_have_nulls)
: writer(writer), column_schema(column_schema), schema_path(std::move(schema_path_p)),
can_have_nulls(can_have_nulls) {
}
ColumnWriter::~ColumnWriter() {
}
ColumnWriterState::~ColumnWriterState() {
}
void ColumnWriter::CompressPage(MemoryStream &temp_writer, size_t &compressed_size, data_ptr_t &compressed_data,
AllocatedData &compressed_buf) {
switch (writer.GetCodec()) {
case CompressionCodec::UNCOMPRESSED:
compressed_size = temp_writer.GetPosition();
compressed_data = temp_writer.GetData();
break;
case CompressionCodec::SNAPPY: {
compressed_size = duckdb_snappy::MaxCompressedLength(temp_writer.GetPosition());
compressed_buf = BufferAllocator::Get(writer.GetContext()).Allocate(compressed_size);
duckdb_snappy::RawCompress(const_char_ptr_cast(temp_writer.GetData()), temp_writer.GetPosition(),
char_ptr_cast(compressed_buf.get()), &compressed_size);
compressed_data = compressed_buf.get();
D_ASSERT(compressed_size <= duckdb_snappy::MaxCompressedLength(temp_writer.GetPosition()));
break;
}
case CompressionCodec::LZ4_RAW: {
compressed_size = duckdb_lz4::LZ4_compressBound(UnsafeNumericCast<int32_t>(temp_writer.GetPosition()));
compressed_buf = BufferAllocator::Get(writer.GetContext()).Allocate(compressed_size);
compressed_size = duckdb_lz4::LZ4_compress_default(
const_char_ptr_cast(temp_writer.GetData()), char_ptr_cast(compressed_buf.get()),
UnsafeNumericCast<int32_t>(temp_writer.GetPosition()), UnsafeNumericCast<int32_t>(compressed_size));
compressed_data = compressed_buf.get();
break;
}
case CompressionCodec::GZIP: {
MiniZStream s;
compressed_size = s.MaxCompressedLength(temp_writer.GetPosition());
compressed_buf = BufferAllocator::Get(writer.GetContext()).Allocate(compressed_size);
s.Compress(const_char_ptr_cast(temp_writer.GetData()), temp_writer.GetPosition(),
char_ptr_cast(compressed_buf.get()), &compressed_size);
compressed_data = compressed_buf.get();
break;
}
case CompressionCodec::ZSTD: {
compressed_size = duckdb_zstd::ZSTD_compressBound(temp_writer.GetPosition());
compressed_buf = BufferAllocator::Get(writer.GetContext()).Allocate(compressed_size);
compressed_size = duckdb_zstd::ZSTD_compress((void *)compressed_buf.get(), compressed_size,
(const void *)temp_writer.GetData(), temp_writer.GetPosition(),
UnsafeNumericCast<int32_t>(writer.CompressionLevel()));
compressed_data = compressed_buf.get();
break;
}
case CompressionCodec::BROTLI: {
compressed_size = duckdb_brotli::BrotliEncoderMaxCompressedSize(temp_writer.GetPosition());
compressed_buf = BufferAllocator::Get(writer.GetContext()).Allocate(compressed_size);
duckdb_brotli::BrotliEncoderCompress(BROTLI_DEFAULT_QUALITY, BROTLI_DEFAULT_WINDOW, BROTLI_DEFAULT_MODE,
temp_writer.GetPosition(), temp_writer.GetData(), &compressed_size,
compressed_buf.get());
compressed_data = compressed_buf.get();
break;
}
default:
throw InternalException("Unsupported codec for Parquet Writer");
}
if (compressed_size > idx_t(NumericLimits<int32_t>::Maximum())) {
throw InternalException("Parquet writer: %d compressed page size out of range for type integer",
temp_writer.GetPosition());
}
}
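// Every codec above follows the same pattern: compute a worst-case output bound, allocate a buffer of
// that size, compress into it, and leave compressed_data/compressed_size pointing at the result (the
// UNCOMPRESSED case simply aliases the temporary stream). The final range check exists because the
// Parquet page header stores compressed_page_size as a signed 32-bit integer.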
void ColumnWriter::HandleRepeatLevels(ColumnWriterState &state, ColumnWriterState *parent, idx_t count) const {
if (!parent) {
// no repeat levels without a parent node
return;
}
if (state.repetition_levels.size() >= parent->repetition_levels.size()) {
return;
}
state.repetition_levels.insert(state.repetition_levels.end(),
parent->repetition_levels.begin() + state.repetition_levels.size(),
parent->repetition_levels.end());
}
void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterState *parent, const ValidityMask &validity,
const idx_t count, const uint16_t define_value, const uint16_t null_value) const {
if (parent) {
// parent node: inherit definition level from the parent
idx_t vector_index = 0;
while (state.definition_levels.size() < parent->definition_levels.size()) {
idx_t current_index = state.definition_levels.size();
if (parent->definition_levels[current_index] != PARQUET_DEFINE_VALID) {
//! Inherit nulls from parent
state.definition_levels.push_back(parent->definition_levels[current_index]);
state.parent_null_count++;
} else if (validity.RowIsValid(vector_index)) {
//! Produce a non-null define
state.definition_levels.push_back(define_value);
} else {
//! Produce a null define
if (!can_have_nulls) {
throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
}
state.null_count++;
state.definition_levels.push_back(null_value);
}
D_ASSERT(parent->is_empty.empty() || current_index < parent->is_empty.size());
if (parent->is_empty.empty() || !parent->is_empty[current_index]) {
vector_index++;
}
}
return;
}
// no parent: set definition levels only from this validity mask
if (validity.AllValid()) {
state.definition_levels.insert(state.definition_levels.end(), count, define_value);
} else {
for (idx_t i = 0; i < count; i++) {
const auto is_null = !validity.RowIsValid(i);
state.definition_levels.emplace_back(is_null ? null_value : define_value);
state.null_count += is_null;
}
}
if (!can_have_nulls && state.null_count != 0) {
throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
}
}
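// A worked example of the definition levels handled above, using standard Parquet semantics (the rows
// are illustrative only): for an OPTIONAL struct s containing an OPTIONAL INTEGER v, the leaf has two
// levels of optionality. A row with v = 7 produces definition level 2 (everything defined), a row with
// s = {v: NULL} produces level 1 (the null_value path for the leaf), and a row where s itself is NULL
// produces level 0, which is the level this function inherits from the parent writer state.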
//===--------------------------------------------------------------------===//
// Create Column Writer
//===--------------------------------------------------------------------===//
ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::SchemaElement> &schemas,
const LogicalType &type, const string &name, bool allow_geometry,
optional_ptr<const ChildFieldIDs> field_ids,
optional_ptr<const ShreddingType> shredding_types, idx_t max_repeat,
idx_t max_define, bool can_have_nulls) {
auto null_type = can_have_nulls ? FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED;
if (!can_have_nulls) {
max_define--;
}
idx_t schema_idx = schemas.size();
optional_ptr<const FieldID> field_id;
optional_ptr<const ChildFieldIDs> child_field_ids;
if (field_ids) {
auto field_id_it = field_ids->ids->find(name);
if (field_id_it != field_ids->ids->end()) {
field_id = &field_id_it->second;
child_field_ids = &field_id->child_field_ids;
}
}
optional_ptr<const ShreddingType> shredding_type;
if (shredding_types) {
shredding_type = shredding_types->GetChild(name);
}
if (type.id() == LogicalTypeId::STRUCT && type.GetAlias() == "PARQUET_VARIANT") {
// variant type
// variants are stored as follows:
// group <name> VARIANT {
// metadata BYTE_ARRAY,
// value BYTE_ARRAY,
// [<typed_value>]
// }
const bool is_shredded = shredding_type != nullptr;
child_list_t<LogicalType> child_types;
child_types.emplace_back("metadata", LogicalType::BLOB);
child_types.emplace_back("value", LogicalType::BLOB);
if (is_shredded) {
auto &typed_value_type = shredding_type->type;
if (typed_value_type.id() != LogicalTypeId::ANY) {
child_types.emplace_back("typed_value",
VariantColumnWriter::TransformTypedValueRecursive(typed_value_type));
}
}
// variant group
duckdb_parquet::SchemaElement top_element;
top_element.repetition_type = null_type;
top_element.num_children = child_types.size();
top_element.logicalType.__isset.VARIANT = true;
top_element.logicalType.VARIANT.__isset.specification_version = true;
top_element.logicalType.VARIANT.specification_version = 1;
top_element.__isset.logicalType = true;
top_element.__isset.num_children = true;
top_element.__isset.repetition_type = true;
top_element.name = name;
schemas.push_back(std::move(top_element));
ParquetColumnSchema variant_column(name, type, max_define, max_repeat, schema_idx, 0);
variant_column.children.reserve(child_types.size());
for (auto &child_type : child_types) {
auto &child_name = child_type.first;
bool is_optional;
if (child_name == "metadata") {
is_optional = false;
} else if (child_name == "value") {
if (is_shredded) {
//! When shredding the variant, the 'value' becomes optional
is_optional = true;
} else {
is_optional = false;
}
} else {
D_ASSERT(child_name == "typed_value");
is_optional = true;
}
variant_column.children.emplace_back(FillParquetSchema(schemas, child_type.second, child_type.first,
allow_geometry, child_field_ids, shredding_type,
max_repeat, max_define + 1, is_optional));
}
return variant_column;
}
if (type.id() == LogicalTypeId::STRUCT || type.id() == LogicalTypeId::UNION) {
auto &child_types = StructType::GetChildTypes(type);
// set up the schema element for this struct
duckdb_parquet::SchemaElement schema_element;
schema_element.repetition_type = null_type;
schema_element.num_children = UnsafeNumericCast<int32_t>(child_types.size());
schema_element.__isset.num_children = true;
schema_element.__isset.type = false;
schema_element.__isset.repetition_type = true;
schema_element.name = name;
if (field_id && field_id->set) {
schema_element.__isset.field_id = true;
schema_element.field_id = field_id->field_id;
}
schemas.push_back(std::move(schema_element));
ParquetColumnSchema struct_column(name, type, max_define, max_repeat, schema_idx, 0);
// construct the child schemas recursively
struct_column.children.reserve(child_types.size());
for (auto &child_type : child_types) {
struct_column.children.emplace_back(FillParquetSchema(schemas, child_type.second, child_type.first,
allow_geometry, child_field_ids, shredding_type,
max_repeat, max_define + 1, true));
}
return struct_column;
}
if (type.id() == LogicalTypeId::LIST || type.id() == LogicalTypeId::ARRAY) {
auto is_list = type.id() == LogicalTypeId::LIST;
auto &child_type = is_list ? ListType::GetChildType(type) : ArrayType::GetChildType(type);
// set up the two schema elements for the list
// for some reason we only set the converted type in the OPTIONAL element
// first an OPTIONAL element
duckdb_parquet::SchemaElement optional_element;
optional_element.repetition_type = null_type;
optional_element.num_children = 1;
optional_element.converted_type = ConvertedType::LIST;
optional_element.__isset.num_children = true;
optional_element.__isset.type = false;
optional_element.__isset.repetition_type = true;
optional_element.__isset.converted_type = true;
optional_element.name = name;
if (field_id && field_id->set) {
optional_element.__isset.field_id = true;
optional_element.field_id = field_id->field_id;
}
schemas.push_back(std::move(optional_element));
// then a REPEATED element
duckdb_parquet::SchemaElement repeated_element;
repeated_element.repetition_type = FieldRepetitionType::REPEATED;
repeated_element.num_children = 1;
repeated_element.__isset.num_children = true;
repeated_element.__isset.type = false;
repeated_element.__isset.repetition_type = true;
repeated_element.name = "list";
schemas.push_back(std::move(repeated_element));
ParquetColumnSchema list_column(name, type, max_define, max_repeat, schema_idx, 0);
list_column.children.push_back(FillParquetSchema(schemas, child_type, "element", allow_geometry,
child_field_ids, shredding_type, max_repeat + 1,
max_define + 2, true));
return list_column;
}
if (type.id() == LogicalTypeId::MAP) {
// map type
// maps are stored as follows:
// <map-repetition> group <name> (MAP) {
// repeated group key_value {
// required <key-type> key;
// <value-repetition> <value-type> value;
// }
// }
// top map element
duckdb_parquet::SchemaElement top_element;
top_element.repetition_type = null_type;
top_element.num_children = 1;
top_element.converted_type = ConvertedType::MAP;
top_element.__isset.repetition_type = true;
top_element.__isset.num_children = true;
top_element.__isset.converted_type = true;
top_element.__isset.type = false;
top_element.name = name;
if (field_id && field_id->set) {
top_element.__isset.field_id = true;
top_element.field_id = field_id->field_id;
}
schemas.push_back(std::move(top_element));
// key_value element
duckdb_parquet::SchemaElement kv_element;
kv_element.repetition_type = FieldRepetitionType::REPEATED;
kv_element.num_children = 2;
kv_element.__isset.repetition_type = true;
kv_element.__isset.num_children = true;
kv_element.__isset.type = false;
kv_element.name = "key_value";
schemas.push_back(std::move(kv_element));
// construct the child types recursively
vector<LogicalType> kv_types {MapType::KeyType(type), MapType::ValueType(type)};
vector<string> kv_names {"key", "value"};
ParquetColumnSchema map_column(name, type, max_define, max_repeat, schema_idx, 0);
map_column.children.reserve(2);
for (idx_t i = 0; i < 2; i++) {
// key needs to be marked as REQUIRED
bool is_key = i == 0;
auto child_schema = FillParquetSchema(schemas, kv_types[i], kv_names[i], allow_geometry, child_field_ids,
shredding_type, max_repeat + 1, max_define + 2, !is_key);
map_column.children.push_back(std::move(child_schema));
}
return map_column;
}
duckdb_parquet::SchemaElement schema_element;
schema_element.type = ParquetWriter::DuckDBTypeToParquetType(type);
schema_element.repetition_type = null_type;
schema_element.__isset.num_children = false;
schema_element.__isset.type = true;
schema_element.__isset.repetition_type = true;
schema_element.name = name;
if (field_id && field_id->set) {
schema_element.__isset.field_id = true;
schema_element.field_id = field_id->field_id;
}
ParquetWriter::SetSchemaProperties(type, schema_element, allow_geometry);
schemas.push_back(std::move(schema_element));
return ParquetColumnSchema(name, type, max_define, max_repeat, schema_idx, 0);
}
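// As a hedged illustration of the fallthrough case above: a nullable INTEGER column named "i" becomes a
// single leaf SchemaElement corresponding to `optional int32 i;` in Parquet schema notation, whereas the
// nested branches earlier in this function emit the multi-element group layouts (for example the MAP and
// VARIANT layouts sketched in their comments).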
unique_ptr<ColumnWriter>
ColumnWriter::CreateWriterRecursive(ClientContext &context, ParquetWriter &writer,
const vector<duckdb_parquet::SchemaElement> &parquet_schemas,
const ParquetColumnSchema &schema, vector<string> path_in_schema) {
auto &type = schema.type;
auto can_have_nulls = parquet_schemas[schema.schema_index].repetition_type == FieldRepetitionType::OPTIONAL;
path_in_schema.push_back(schema.name);
if (type.id() == LogicalTypeId::STRUCT && type.GetAlias() == "PARQUET_VARIANT") {
vector<unique_ptr<ColumnWriter>> child_writers;
child_writers.reserve(schema.children.size());
for (idx_t i = 0; i < schema.children.size(); i++) {
child_writers.push_back(
CreateWriterRecursive(context, writer, parquet_schemas, schema.children[i], path_in_schema));
}
return make_uniq<VariantColumnWriter>(writer, schema, path_in_schema, std::move(child_writers), can_have_nulls);
}
if (type.id() == LogicalTypeId::STRUCT || type.id() == LogicalTypeId::UNION) {
// construct the child writers recursively
vector<unique_ptr<ColumnWriter>> child_writers;
child_writers.reserve(schema.children.size());
for (auto &child_column : schema.children) {
child_writers.push_back(
CreateWriterRecursive(context, writer, parquet_schemas, child_column, path_in_schema));
}
return make_uniq<StructColumnWriter>(writer, schema, std::move(path_in_schema), std::move(child_writers),
can_have_nulls);
}
if (type.id() == LogicalTypeId::LIST || type.id() == LogicalTypeId::ARRAY) {
auto is_list = type.id() == LogicalTypeId::LIST;
path_in_schema.push_back("list");
auto child_writer = CreateWriterRecursive(context, writer, parquet_schemas, schema.children[0], path_in_schema);
if (is_list) {
return make_uniq<ListColumnWriter>(writer, schema, std::move(path_in_schema), std::move(child_writer),
can_have_nulls);
} else {
return make_uniq<ArrayColumnWriter>(writer, schema, std::move(path_in_schema), std::move(child_writer),
can_have_nulls);
}
}
if (type.id() == LogicalTypeId::MAP) {
path_in_schema.push_back("key_value");
// construct the child types recursively
vector<unique_ptr<ColumnWriter>> child_writers;
child_writers.reserve(2);
for (idx_t i = 0; i < 2; i++) {
// key needs to be marked as REQUIRED
auto child_writer =
CreateWriterRecursive(context, writer, parquet_schemas, schema.children[i], path_in_schema);
child_writers.push_back(std::move(child_writer));
}
auto struct_writer =
make_uniq<StructColumnWriter>(writer, schema, path_in_schema, std::move(child_writers), can_have_nulls);
return make_uniq<ListColumnWriter>(writer, schema, path_in_schema, std::move(struct_writer), can_have_nulls);
}
if (type.id() == LogicalTypeId::BLOB && type.GetAlias() == "WKB_BLOB") {
return make_uniq<StandardColumnWriter<string_t, string_t, ParquetGeometryOperator>>(
writer, schema, std::move(path_in_schema), can_have_nulls);
}
switch (type.id()) {
case LogicalTypeId::BOOLEAN:
return make_uniq<BooleanColumnWriter>(writer, schema, std::move(path_in_schema), can_have_nulls);
case LogicalTypeId::TINYINT:
return make_uniq<StandardColumnWriter<int8_t, int32_t>>(writer, schema, std::move(path_in_schema),
can_have_nulls);
case LogicalTypeId::SMALLINT:
return make_uniq<StandardColumnWriter<int16_t, int32_t>>(writer, schema, std::move(path_in_schema),
can_have_nulls);
case LogicalTypeId::INTEGER:
case LogicalTypeId::DATE:
return make_uniq<StandardColumnWriter<int32_t, int32_t>>(writer, schema, std::move(path_in_schema),
can_have_nulls);
case LogicalTypeId::BIGINT:
case LogicalTypeId::TIME:
case LogicalTypeId::TIMESTAMP:
case LogicalTypeId::TIMESTAMP_TZ:
case LogicalTypeId::TIMESTAMP_MS:
return make_uniq<StandardColumnWriter<int64_t, int64_t>>(writer, schema, std::move(path_in_schema),
can_have_nulls);
case LogicalTypeId::TIME_TZ:
return make_uniq<StandardColumnWriter<dtime_tz_t, int64_t, ParquetTimeTZOperator>>(
writer, schema, std::move(path_in_schema), can_have_nulls);
case LogicalTypeId::HUGEINT:
return make_uniq<StandardColumnWriter<hugeint_t, double, ParquetHugeintOperator>>(
writer, schema, std::move(path_in_schema), can_have_nulls);
case LogicalTypeId::UHUGEINT:
return make_uniq<StandardColumnWriter<uhugeint_t, double, ParquetUhugeintOperator>>(
writer, schema, std::move(path_in_schema), can_have_nulls);
case LogicalTypeId::TIMESTAMP_NS:
return make_uniq<StandardColumnWriter<int64_t, int64_t, ParquetTimestampNSOperator>>(
writer, schema, std::move(path_in_schema), can_have_nulls);
case LogicalTypeId::TIMESTAMP_SEC:
return make_uniq<StandardColumnWriter<int64_t, int64_t, ParquetTimestampSOperator>>(
writer, schema, std::move(path_in_schema), can_have_nulls);
case LogicalTypeId::UTINYINT:
return make_uniq<StandardColumnWriter<uint8_t, int32_t>>(writer, schema, std::move(path_in_schema),
can_have_nulls);
case LogicalTypeId::USMALLINT:
return make_uniq<StandardColumnWriter<uint16_t, int32_t>>(writer, schema, std::move(path_in_schema),
can_have_nulls);
case LogicalTypeId::UINTEGER:
return make_uniq<StandardColumnWriter<uint32_t, uint32_t>>(writer, schema, std::move(path_in_schema),
can_have_nulls);
case LogicalTypeId::UBIGINT:
return make_uniq<StandardColumnWriter<uint64_t, uint64_t>>(writer, schema, std::move(path_in_schema),
can_have_nulls);
case LogicalTypeId::FLOAT:
return make_uniq<StandardColumnWriter<float_na_equal, float, FloatingPointOperator>>(
writer, schema, std::move(path_in_schema), can_have_nulls);
case LogicalTypeId::DOUBLE:
return make_uniq<StandardColumnWriter<double_na_equal, double, FloatingPointOperator>>(
writer, schema, std::move(path_in_schema), can_have_nulls);
case LogicalTypeId::DECIMAL:
switch (type.InternalType()) {
case PhysicalType::INT16:
return make_uniq<StandardColumnWriter<int16_t, int32_t>>(writer, schema, std::move(path_in_schema),
can_have_nulls);
case PhysicalType::INT32:
return make_uniq<StandardColumnWriter<int32_t, int32_t>>(writer, schema, std::move(path_in_schema),
can_have_nulls);
case PhysicalType::INT64:
return make_uniq<StandardColumnWriter<int64_t, int64_t>>(writer, schema, std::move(path_in_schema),
can_have_nulls);
default:
return make_uniq<FixedDecimalColumnWriter>(writer, schema, std::move(path_in_schema), can_have_nulls);
}
case LogicalTypeId::BLOB:
return make_uniq<StandardColumnWriter<string_t, string_t, ParquetBlobOperator>>(
writer, schema, std::move(path_in_schema), can_have_nulls);
case LogicalTypeId::VARCHAR:
return make_uniq<StandardColumnWriter<string_t, string_t, ParquetStringOperator>>(
writer, schema, std::move(path_in_schema), can_have_nulls);
case LogicalTypeId::UUID:
return make_uniq<StandardColumnWriter<hugeint_t, ParquetUUIDTargetType, ParquetUUIDOperator>>(
writer, schema, std::move(path_in_schema), can_have_nulls);
case LogicalTypeId::INTERVAL:
return make_uniq<StandardColumnWriter<interval_t, ParquetIntervalTargetType, ParquetIntervalOperator>>(
writer, schema, std::move(path_in_schema), can_have_nulls);
case LogicalTypeId::ENUM:
return make_uniq<EnumColumnWriter>(writer, schema, std::move(path_in_schema), can_have_nulls);
default:
throw InternalException("Unsupported type \"%s\" in Parquet writer", type.ToString());
}
}
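// float_na_equal and double_na_equal wrap float/double with NaN-aware equality; the specializations
// below let the templated writer code treat them like the plain floating-point types
// (e.g. for min/max statistics and hashing).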
template <>
struct NumericLimits<float_na_equal> {
static constexpr float Minimum() {
return std::numeric_limits<float>::lowest();
};
static constexpr float Maximum() {
return std::numeric_limits<float>::max();
};
static constexpr bool IsSigned() {
return std::is_signed<float>::value;
}
static constexpr bool IsIntegral() {
return std::is_integral<float>::value;
}
};
template <>
struct NumericLimits<double_na_equal> {
static constexpr double Minimum() {
return std::numeric_limits<double>::lowest();
};
static constexpr double Maximum() {
return std::numeric_limits<double>::max();
};
static constexpr bool IsSigned() {
return std::is_signed<double>::value;
}
static constexpr bool IsIntegral() {
return std::is_integral<double>::value;
}
};
template <>
hash_t Hash(ParquetIntervalTargetType val) {
return Hash(const_char_ptr_cast(val.bytes), ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE);
}
template <>
hash_t Hash(ParquetUUIDTargetType val) {
return Hash(const_char_ptr_cast(val.bytes), ParquetUUIDTargetType::PARQUET_UUID_SIZE);
}
template <>
hash_t Hash(float_na_equal val) {
if (std::isnan(val.val)) {
return Hash<float>(std::numeric_limits<float>::quiet_NaN());
}
return Hash<float>(val.val);
}
template <>
hash_t Hash(double_na_equal val) {
if (std::isnan(val.val)) {
return Hash<double>(std::numeric_limits<double>::quiet_NaN());
}
return Hash<double>(val.val);
}
} // namespace duckdb

View File

@@ -0,0 +1,12 @@
add_library_unity(
duckdb_parquet_decoders
OBJECT
byte_stream_split_decoder.cpp
delta_binary_packed_decoder.cpp
delta_byte_array_decoder.cpp
delta_length_byte_array_decoder.cpp
dictionary_decoder.cpp
rle_decoder.cpp)
set(PARQUET_EXTENSION_FILES
${PARQUET_EXTENSION_FILES} $<TARGET_OBJECTS:duckdb_parquet_decoders>
PARENT_SCOPE)

View File

@@ -0,0 +1,54 @@
#include "decoder/byte_stream_split_decoder.hpp"
#include "column_reader.hpp"
#include "parquet_reader.hpp"
namespace duckdb {
ByteStreamSplitDecoder::ByteStreamSplitDecoder(ColumnReader &reader)
: reader(reader), decoded_data_buffer(reader.encoding_buffers[0]) {
}
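// BYTE_STREAM_SPLIT stores the k-th byte of every value in its own contiguous stream
// (k streams for k-byte FLOAT/DOUBLE values); the BssDecoder interleaves the streams
// back into complete values when reading.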
void ByteStreamSplitDecoder::InitializePage() {
auto &block = reader.block;
// Subtract 1 from the length: the block is allocated with 1 extra byte,
// but the byte stream split decoder needs to know the exact data size.
bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
block->inc(block->len);
}
void ByteStreamSplitDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
idx_t valid_count = reader.GetValidCount(defines, read_count, result_offset);
auto &allocator = reader.reader.allocator;
decoded_data_buffer.reset();
switch (reader.Schema().parquet_type) {
case duckdb_parquet::Type::FLOAT:
decoded_data_buffer.resize(allocator, sizeof(float) * valid_count);
bss_decoder->GetBatch<float>(decoded_data_buffer.ptr, valid_count);
break;
case duckdb_parquet::Type::DOUBLE:
decoded_data_buffer.resize(allocator, sizeof(double) * valid_count);
bss_decoder->GetBatch<double>(decoded_data_buffer.ptr, valid_count);
break;
default:
throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
}
reader.Plain(decoded_data_buffer, defines, read_count, result_offset, result);
}
void ByteStreamSplitDecoder::Skip(uint8_t *defines, idx_t skip_count) {
idx_t valid_count = reader.GetValidCount(defines, skip_count);
switch (reader.Schema().parquet_type) {
case duckdb_parquet::Type::FLOAT:
bss_decoder->Skip<float>(valid_count);
break;
case duckdb_parquet::Type::DOUBLE:
bss_decoder->Skip<double>(valid_count);
break;
default:
throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
}
}
} // namespace duckdb

View File

@@ -0,0 +1,54 @@
#include "decoder/delta_binary_packed_decoder.hpp"
#include "column_reader.hpp"
#include "parquet_reader.hpp"
namespace duckdb {
DeltaBinaryPackedDecoder::DeltaBinaryPackedDecoder(ColumnReader &reader)
: reader(reader), decoded_data_buffer(reader.encoding_buffers[0]) {
}
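// DELTA_BINARY_PACKED stores a first value followed by blocks of zigzag-encoded deltas that are
// bit-packed per miniblock; the encoding is only defined for INT32 and INT64 columns.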
void DeltaBinaryPackedDecoder::InitializePage() {
auto &block = reader.block;
dbp_decoder = make_uniq<DbpDecoder>(block->ptr, block->len);
block->inc(block->len);
}
void DeltaBinaryPackedDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
idx_t valid_count = reader.GetValidCount(defines, read_count, result_offset);
auto &allocator = reader.reader.allocator;
decoded_data_buffer.reset();
switch (reader.Schema().parquet_type) {
case duckdb_parquet::Type::INT32:
decoded_data_buffer.resize(allocator, sizeof(int32_t) * (valid_count));
dbp_decoder->GetBatch<int32_t>(decoded_data_buffer.ptr, valid_count);
break;
case duckdb_parquet::Type::INT64:
decoded_data_buffer.resize(allocator, sizeof(int64_t) * (valid_count));
dbp_decoder->GetBatch<int64_t>(decoded_data_buffer.ptr, valid_count);
break;
default:
throw std::runtime_error("DELTA_BINARY_PACKED should only be INT32 or INT64");
}
// Plain() will put NULLs in the right place
reader.Plain(decoded_data_buffer, defines, read_count, result_offset, result);
}
void DeltaBinaryPackedDecoder::Skip(uint8_t *defines, idx_t skip_count) {
idx_t valid_count = reader.GetValidCount(defines, skip_count);
switch (reader.Schema().parquet_type) {
case duckdb_parquet::Type::INT32:
dbp_decoder->Skip<int32_t>(valid_count);
break;
case duckdb_parquet::Type::INT64:
dbp_decoder->Skip<int64_t>(valid_count);
break;
default:
throw std::runtime_error("DELTA_BINARY_PACKED should only be INT32 or INT64");
}
}
} // namespace duckdb

View File

@@ -0,0 +1,103 @@
#include "decoder/delta_byte_array_decoder.hpp"
#include "column_reader.hpp"
#include "parquet_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
DeltaByteArrayDecoder::DeltaByteArrayDecoder(ColumnReader &reader) : reader(reader) {
}
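// DELTA_BYTE_ARRAY encodes each string as a prefix length (bytes shared with the previous string)
// plus a suffix: the prefix lengths and suffix lengths are both DELTA_BINARY_PACKED, followed by the
// concatenated suffix bytes. InitializePage materializes the full strings up front.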
void DeltaByteArrayDecoder::ReadDbpData(Allocator &allocator, ResizeableBuffer &buffer, ResizeableBuffer &result_buffer,
idx_t &value_count) {
auto decoder = make_uniq<DbpDecoder>(buffer.ptr, buffer.len);
value_count = decoder->TotalValues();
result_buffer.reset();
result_buffer.resize(allocator, sizeof(uint32_t) * value_count);
decoder->GetBatch<uint32_t>(result_buffer.ptr, value_count);
decoder->Finalize();
buffer.inc(buffer.len - decoder->BufferPtr().len);
}
void DeltaByteArrayDecoder::InitializePage() {
if (reader.Type().InternalType() != PhysicalType::VARCHAR) {
throw std::runtime_error("Delta Byte Array encoding is only supported for string/blob data");
}
auto &block = *reader.block;
auto &allocator = reader.reader.allocator;
idx_t prefix_count, suffix_count;
auto &prefix_buffer = reader.encoding_buffers[0];
auto &suffix_buffer = reader.encoding_buffers[1];
ReadDbpData(allocator, block, prefix_buffer, prefix_count);
ReadDbpData(allocator, block, suffix_buffer, suffix_count);
if (prefix_count != suffix_count) {
throw std::runtime_error("DELTA_BYTE_ARRAY - prefix and suffix counts are different - corrupt file?");
}
if (prefix_count == 0) {
// no values
byte_array_data = make_uniq<Vector>(LogicalType::VARCHAR, nullptr);
return;
}
auto prefix_data = reinterpret_cast<uint32_t *>(prefix_buffer.ptr);
auto suffix_data = reinterpret_cast<uint32_t *>(suffix_buffer.ptr);
byte_array_data = make_uniq<Vector>(LogicalType::VARCHAR, prefix_count);
byte_array_count = prefix_count;
delta_offset = 0;
auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
for (idx_t i = 0; i < prefix_count; i++) {
auto str_len = prefix_data[i] + suffix_data[i];
block.available(suffix_data[i]);
string_data[i] = StringVector::EmptyString(*byte_array_data, str_len);
auto result_data = string_data[i].GetDataWriteable();
if (prefix_data[i] > 0) {
if (i == 0 || prefix_data[i] > string_data[i - 1].GetSize()) {
throw std::runtime_error("DELTA_BYTE_ARRAY - prefix is out of range - corrupt file?");
}
memcpy(result_data, string_data[i - 1].GetData(), prefix_data[i]);
}
memcpy(result_data + prefix_data[i], block.ptr, suffix_data[i]);
block.inc(suffix_data[i]);
string_data[i].Finalize();
}
}
void DeltaByteArrayDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
if (!byte_array_data) {
throw std::runtime_error("Internal error - DeltaByteArray called but there was no byte_array_data set");
}
auto result_ptr = FlatVector::GetData<string_t>(result);
auto &result_mask = FlatVector::Validity(result);
auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
for (idx_t row_idx = 0; row_idx < read_count; row_idx++) {
if (defines && defines[row_idx + result_offset] != reader.MaxDefine()) {
result_mask.SetInvalid(row_idx + result_offset);
continue;
}
if (delta_offset >= byte_array_count) {
throw IOException("DELTA_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
"read of %d from %d entries) - corrupt file?",
delta_offset + 1, byte_array_count);
}
result_ptr[row_idx + result_offset] = string_data[delta_offset++];
}
StringVector::AddHeapReference(result, *byte_array_data);
}
void DeltaByteArrayDecoder::Skip(uint8_t *defines, idx_t skip_count) {
if (!byte_array_data) {
throw std::runtime_error("Internal error - DeltaByteArray called but there was no byte_array_data set");
}
for (idx_t row_idx = 0; row_idx < skip_count; row_idx++) {
if (defines && defines[row_idx] != reader.MaxDefine()) {
continue;
}
if (delta_offset >= byte_array_count) {
throw IOException("DELTA_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
"read of %d from %d entries) - corrupt file?",
delta_offset + 1, byte_array_count);
}
delta_offset++;
}
}
} // namespace duckdb

View File

@@ -0,0 +1,128 @@
#include "decoder/delta_length_byte_array_decoder.hpp"
#include "decoder/delta_byte_array_decoder.hpp"
#include "column_reader.hpp"
#include "parquet_reader.hpp"
#include "reader/string_column_reader.hpp"
#include "utf8proc_wrapper.hpp"
namespace duckdb {
DeltaLengthByteArrayDecoder::DeltaLengthByteArrayDecoder(ColumnReader &reader)
: reader(reader), length_buffer(reader.encoding_buffers[0]), length_idx(0) {
}
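// DELTA_LENGTH_BYTE_ARRAY stores all string lengths (DELTA_BINARY_PACKED) up front, followed by the
// concatenated string bytes; because the bytes are contiguous, the reader can reference the page
// buffer directly instead of copying every string.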
void DeltaLengthByteArrayDecoder::InitializePage() {
if (reader.Type().InternalType() != PhysicalType::VARCHAR) {
throw std::runtime_error("Delta Length Byte Array encoding is only supported for string/blob data");
}
// read the binary packed lengths
auto &block = *reader.block;
auto &allocator = reader.reader.allocator;
DeltaByteArrayDecoder::ReadDbpData(allocator, block, length_buffer, byte_array_count);
// Verify that the sum of DBP string lengths match up with the available string data
idx_t total_string_length = 0;
const auto length_data = reinterpret_cast<uint32_t *>(length_buffer.ptr);
for (idx_t i = 0; i < byte_array_count; i++) {
total_string_length += length_data[i];
}
block.available(total_string_length);
length_idx = 0;
}
void DeltaLengthByteArrayDecoder::Read(shared_ptr<ResizeableBuffer> &block_ref, uint8_t *defines, idx_t read_count,
Vector &result, idx_t result_offset) {
if (defines) {
ReadInternal<true>(block_ref, defines, read_count, result, result_offset);
} else {
ReadInternal<false>(block_ref, defines, read_count, result, result_offset);
}
}
template <bool HAS_DEFINES>
void DeltaLengthByteArrayDecoder::ReadInternal(shared_ptr<ResizeableBuffer> &block_ref, uint8_t *const defines,
const idx_t read_count, Vector &result, const idx_t result_offset) {
auto &block = *block_ref;
const auto length_data = reinterpret_cast<uint32_t *>(length_buffer.ptr);
auto result_data = FlatVector::GetData<string_t>(result);
auto &result_mask = FlatVector::Validity(result);
if (!HAS_DEFINES) {
// Fast path: take this out of the loop below
if (length_idx + read_count > byte_array_count) {
throw IOException(
"DELTA_LENGTH_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
"read of %d from %d entries) - corrupt file?",
length_idx + read_count, byte_array_count);
}
}
const auto start_ptr = block.ptr;
for (idx_t row_idx = 0; row_idx < read_count; row_idx++) {
const auto result_idx = result_offset + row_idx;
if (HAS_DEFINES) {
if (defines[result_idx] != reader.MaxDefine()) {
result_mask.SetInvalid(result_idx);
continue;
}
if (length_idx >= byte_array_count) {
throw IOException(
"DELTA_LENGTH_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
"read of %d from %d entries) - corrupt file?",
length_idx, byte_array_count);
}
}
const auto &str_len = length_data[length_idx++];
result_data[result_idx] = string_t(char_ptr_cast(block.ptr), str_len);
block.unsafe_inc(str_len);
}
// Verify that the strings we read are valid UTF-8
reader.Cast<StringColumnReader>().VerifyString(char_ptr_cast(start_ptr), block.ptr - start_ptr);
StringColumnReader::ReferenceBlock(result, block_ref);
}
void DeltaLengthByteArrayDecoder::Skip(uint8_t *defines, idx_t skip_count) {
if (defines) {
SkipInternal<true>(defines, skip_count);
} else {
SkipInternal<false>(defines, skip_count);
}
}
template <bool HAS_DEFINES>
void DeltaLengthByteArrayDecoder::SkipInternal(uint8_t *defines, idx_t skip_count) {
auto &block = *reader.block;
const auto length_data = reinterpret_cast<uint32_t *>(length_buffer.ptr);
if (!HAS_DEFINES) {
// Fast path: take this out of the loop below
if (length_idx + skip_count > byte_array_count) {
throw IOException(
"DELTA_LENGTH_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
"read of %d from %d entries) - corrupt file?",
length_idx + skip_count, byte_array_count);
}
}
idx_t skip_bytes = 0;
for (idx_t row_idx = 0; row_idx < skip_count; row_idx++) {
if (HAS_DEFINES) {
if (defines[row_idx] != reader.MaxDefine()) {
continue;
}
if (length_idx >= byte_array_count) {
throw IOException(
"DELTA_LENGTH_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
"read of %d from %d entries) - corrupt file?",
length_idx, byte_array_count);
}
}
skip_bytes += length_data[length_idx++];
}
block.inc(skip_bytes);
}
} // namespace duckdb

View File

@@ -0,0 +1,229 @@
#include "decoder/dictionary_decoder.hpp"
#include "column_reader.hpp"
#include "parquet_reader.hpp"
#include "duckdb/planner/filter/conjunction_filter.hpp"
#include "duckdb/planner/filter/expression_filter.hpp"
#include "duckdb/planner/table_filter_state.hpp"
namespace duckdb {
DictionaryDecoder::DictionaryDecoder(ColumnReader &reader)
: reader(reader), offset_buffer(reader.encoding_buffers[0]), valid_sel(STANDARD_VECTOR_SIZE),
dictionary_selection_vector(STANDARD_VECTOR_SIZE), dictionary_size(0) {
}
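// The dictionary page is decoded into a reusable DuckDB dictionary vector; data pages then only
// contain RLE/bit-packed indices into it, which also allows table filters to be evaluated once per
// dictionary entry instead of once per row (see InitializeDictionary and Filter below).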
void DictionaryDecoder::InitializeDictionary(idx_t new_dictionary_size, optional_ptr<const TableFilter> filter,
optional_ptr<TableFilterState> filter_state, bool has_defines) {
dictionary_size = new_dictionary_size;
filter_result.reset();
filter_count = 0;
can_have_nulls = has_defines;
// we append one extra entry at the end to represent NULL, since rows read through a dictionary
// vector do not carry a separate validity mask of their own
const auto duckdb_dictionary_size = dictionary_size + can_have_nulls;
dictionary = DictionaryVector::CreateReusableDictionary(reader.Type(), duckdb_dictionary_size);
auto &dict_validity = FlatVector::Validity(dictionary->data);
dict_validity.Reset(duckdb_dictionary_size);
if (can_have_nulls) {
dict_validity.SetInvalid(dictionary_size);
}
// now read the non-NULL values from Parquet
reader.Plain(reader.block, nullptr, dictionary_size, 0, dictionary->data);
// immediately filter the dictionary, if applicable
if (filter && CanFilter(*filter, *filter_state)) {
// no filter result yet - apply filter to the dictionary
// initialize the filter result - setting everything to false
filter_result = make_unsafe_uniq_array<bool>(duckdb_dictionary_size);
// apply the filter
UnifiedVectorFormat vdata;
dictionary->data.ToUnifiedFormat(duckdb_dictionary_size, vdata);
SelectionVector dict_sel;
filter_count = duckdb_dictionary_size;
ColumnSegment::FilterSelection(dict_sel, dictionary->data, vdata, *filter, *filter_state,
duckdb_dictionary_size, filter_count);
// now set all matching tuples to true
for (idx_t i = 0; i < filter_count; i++) {
auto idx = dict_sel.get_index(i);
filter_result[idx] = true;
}
}
}
void DictionaryDecoder::InitializePage() {
// the bit width of the dictionary indices is stored as the first byte of the data page
auto &block = reader.block;
auto dict_width = block->read<uint8_t>();
dict_decoder = make_uniq<RleBpDecoder>(block->ptr, block->len, dict_width);
block->inc(block->len);
}
void DictionaryDecoder::ConvertDictToSelVec(uint32_t *offsets, const SelectionVector &rows, idx_t count) {
D_ASSERT(count <= STANDARD_VECTOR_SIZE);
for (idx_t idx = 0; idx < count; idx++) {
auto row_idx = rows.get_index(idx);
auto offset = offsets[idx];
if (offset >= dictionary_size) {
throw std::runtime_error("Parquet file is likely corrupted, dictionary offset out of range");
}
dictionary_selection_vector.set_index(row_idx, offset);
}
}
idx_t DictionaryDecoder::GetValidValues(uint8_t *defines, idx_t read_count, idx_t result_offset) {
idx_t valid_count = read_count;
if (defines) {
D_ASSERT(can_have_nulls);
valid_count = 0;
for (idx_t i = 0; i < read_count; i++) {
valid_sel.set_index(valid_count, i);
dictionary_selection_vector.set_index(i, dictionary_size);
valid_count += defines[result_offset + i] == reader.MaxDefine();
}
}
return valid_count;
}
idx_t DictionaryDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
if (!dictionary || dictionary_size < 0) {
throw std::runtime_error("Parquet file is likely corrupted, missing dictionary");
}
idx_t valid_count = GetValidValues(defines, read_count, result_offset);
if (valid_count == read_count) {
// all values are valid - we can directly decompress the offsets into the selection vector
dict_decoder->GetBatch<uint32_t>(data_ptr_cast(dictionary_selection_vector.data()),
NumericCast<uint32_t>(valid_count));
// we do still need to verify the offsets though
uint32_t max_index = 0;
for (idx_t idx = 0; idx < valid_count; idx++) {
max_index = MaxValue(max_index, dictionary_selection_vector[idx]);
}
if (max_index >= dictionary_size) {
throw std::runtime_error("Parquet file is likely corrupted, dictionary offset out of range");
}
} else if (valid_count > 0) {
// for the valid entries - decode the offsets
offset_buffer.resize(reader.reader.allocator, sizeof(uint32_t) * valid_count);
dict_decoder->GetBatch<uint32_t>(offset_buffer.ptr, NumericCast<uint32_t>(valid_count));
ConvertDictToSelVec(reinterpret_cast<uint32_t *>(offset_buffer.ptr), valid_sel, valid_count);
}
#ifdef DEBUG
dictionary_selection_vector.Verify(read_count, dictionary_size + can_have_nulls);
#endif
if (result_offset == 0) {
result.Dictionary(dictionary, dictionary_selection_vector);
D_ASSERT(result.GetVectorType() == VectorType::DICTIONARY_VECTOR);
} else {
D_ASSERT(result.GetVectorType() == VectorType::FLAT_VECTOR);
VectorOperations::Copy(dictionary->data, result, dictionary_selection_vector, read_count, 0, result_offset);
}
return valid_count;
}
void DictionaryDecoder::Skip(uint8_t *defines, idx_t skip_count) {
if (!dictionary || dictionary_size < 0) {
throw std::runtime_error("Parquet file is likely corrupted, missing dictionary");
}
idx_t valid_count = reader.GetValidCount(defines, skip_count);
// skip past the valid offsets
dict_decoder->Skip(NumericCast<uint32_t>(valid_count));
}
bool DictionaryDecoder::DictionarySupportsFilter(const TableFilter &filter, TableFilterState &filter_state) {
switch (filter.filter_type) {
case TableFilterType::CONJUNCTION_OR: {
auto &conjunction = filter.Cast<ConjunctionOrFilter>();
auto &state = filter_state.Cast<ConjunctionOrFilterState>();
for (idx_t child_idx = 0; child_idx < conjunction.child_filters.size(); child_idx++) {
auto &child_filter = *conjunction.child_filters[child_idx];
auto &child_state = *state.child_states[child_idx];
if (!DictionarySupportsFilter(child_filter, child_state)) {
return false;
}
}
return true;
}
case TableFilterType::CONJUNCTION_AND: {
auto &conjunction = filter.Cast<ConjunctionAndFilter>();
auto &state = filter_state.Cast<ConjunctionAndFilterState>();
for (idx_t child_idx = 0; child_idx < conjunction.child_filters.size(); child_idx++) {
auto &child_filter = *conjunction.child_filters[child_idx];
auto &child_state = *state.child_states[child_idx];
if (!DictionarySupportsFilter(child_filter, child_state)) {
return false;
}
}
return true;
}
case TableFilterType::CONSTANT_COMPARISON:
case TableFilterType::IS_NOT_NULL:
return true;
case TableFilterType::EXPRESSION_FILTER: {
// expression filters can only be pushed into the dictionary if they filter out NULL values
auto &expr_filter = filter.Cast<ExpressionFilter>();
auto &state = filter_state.Cast<ExpressionFilterState>();
auto emits_nulls = expr_filter.EvaluateWithConstant(state.executor, Value(reader.Type()));
return !emits_nulls;
}
case TableFilterType::IS_NULL:
case TableFilterType::DYNAMIC_FILTER:
case TableFilterType::OPTIONAL_FILTER:
case TableFilterType::STRUCT_EXTRACT:
default:
return false;
}
}
bool DictionaryDecoder::CanFilter(const TableFilter &filter, TableFilterState &filter_state) {
if (dictionary_size == 0) {
return false;
}
// We can only push the filter if the filter removes NULL values
if (!DictionarySupportsFilter(filter, filter_state)) {
return false;
}
return true;
}
void DictionaryDecoder::Filter(uint8_t *defines, const idx_t read_count, Vector &result, SelectionVector &sel,
idx_t &approved_tuple_count) {
if (!dictionary || dictionary_size < 0) {
throw std::runtime_error("Parquet file is likely corrupted, missing dictionary");
}
D_ASSERT(filter_count > 0);
// read the dictionary values
const auto valid_count = Read(defines, read_count, result, 0);
if (valid_count == 0) {
// all values are NULL
approved_tuple_count = 0;
return;
}
// apply the filter by checking the dictionary offsets directly
uint32_t *offsets;
if (valid_count == read_count) {
offsets = dictionary_selection_vector.data();
} else {
offsets = reinterpret_cast<uint32_t *>(offset_buffer.ptr);
}
D_ASSERT(offsets);
SelectionVector new_sel(valid_count);
approved_tuple_count = 0;
for (idx_t idx = 0; idx < valid_count; idx++) {
auto row_idx = valid_count == read_count ? idx : valid_sel.get_index(idx);
auto offset = offsets[idx];
if (!filter_result[offset]) {
// does not pass the filter
continue;
}
new_sel.set_index(approved_tuple_count++, row_idx);
}
if (approved_tuple_count < read_count) {
sel.Initialize(new_sel);
}
}
} // namespace duckdb

View File

@@ -0,0 +1,36 @@
#include "decoder/rle_decoder.hpp"
#include "column_reader.hpp"
#include "parquet_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
RLEDecoder::RLEDecoder(ColumnReader &reader) : reader(reader), decoded_data_buffer(reader.encoding_buffers[0]) {
}
void RLEDecoder::InitializePage() {
if (reader.Type().id() != LogicalTypeId::BOOLEAN) {
throw std::runtime_error("RLE encoding is only supported for boolean data");
}
auto &block = reader.block;
block->inc(sizeof(uint32_t));
rle_decoder = make_uniq<RleBpDecoder>(block->ptr, block->len, 1);
}
void RLEDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
// RLE encoding for boolean
D_ASSERT(reader.Type().id() == LogicalTypeId::BOOLEAN);
idx_t valid_count = reader.GetValidCount(defines, read_count, result_offset);
decoded_data_buffer.reset();
decoded_data_buffer.resize(reader.reader.allocator, sizeof(bool) * valid_count);
rle_decoder->GetBatch<uint8_t>(decoded_data_buffer.ptr, valid_count);
reader.PlainTemplated<bool, TemplatedParquetValueConversion<bool>>(decoded_data_buffer, defines, read_count,
result_offset, result);
}
void RLEDecoder::Skip(uint8_t *defines, idx_t skip_count) {
idx_t valid_count = reader.GetValidCount(defines, skip_count);
rle_decoder->Skip(valid_count);
}
} // namespace duckdb

View File

@@ -0,0 +1,332 @@
#include "geo_parquet.hpp"
#include "column_reader.hpp"
#include "duckdb/catalog/catalog_entry/scalar_function_catalog_entry.hpp"
#include "duckdb/execution/expression_executor.hpp"
#include "duckdb/function/scalar_function.hpp"
#include "duckdb/planner/expression/bound_function_expression.hpp"
#include "duckdb/planner/expression/bound_reference_expression.hpp"
#include "duckdb/main/extension_helper.hpp"
#include "reader/expression_column_reader.hpp"
#include "parquet_reader.hpp"
#include "yyjson.hpp"
namespace duckdb {
using namespace duckdb_yyjson; // NOLINT
//------------------------------------------------------------------------------
// GeoParquetFileMetadata
//------------------------------------------------------------------------------
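// GeoParquet stores its metadata as a JSON document under the "geo" key of the file-level key/value
// metadata; TryRead parses that document (version, per-column encoding and geometry types) and
// returns nullptr if it is absent or GeoParquet conversion is disabled.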
unique_ptr<GeoParquetFileMetadata> GeoParquetFileMetadata::TryRead(const duckdb_parquet::FileMetaData &file_meta_data,
const ClientContext &context) {
// Conversion not enabled, or the spatial extension is not loaded
if (!IsGeoParquetConversionEnabled(context)) {
return nullptr;
}
for (auto &kv : file_meta_data.key_value_metadata) {
if (kv.key == "geo") {
const auto geo_metadata = yyjson_read(kv.value.c_str(), kv.value.size(), 0);
if (!geo_metadata) {
// Could not parse the JSON
return nullptr;
}
try {
// Check the root object
const auto root = yyjson_doc_get_root(geo_metadata);
if (!yyjson_is_obj(root)) {
throw InvalidInputException("Geoparquet metadata is not an object");
}
// We don't actually care about the version for now, as we only support V1+native
auto result = make_uniq<GeoParquetFileMetadata>(GeoParquetVersion::BOTH);
// Check and parse the version
const auto version_val = yyjson_obj_get(root, "version");
if (!yyjson_is_str(version_val)) {
throw InvalidInputException("Geoparquet metadata does not have a version");
}
auto version = yyjson_get_str(version_val);
if (StringUtil::StartsWith(version, "3")) {
// Guard against a breaking future 3.0 version
throw InvalidInputException("Geoparquet version %s is not supported", version);
}
// Check and parse the geometry columns
const auto columns_val = yyjson_obj_get(root, "columns");
if (!yyjson_is_obj(columns_val)) {
throw InvalidInputException("Geoparquet metadata does not have a columns object");
}
// Iterate over all geometry columns
yyjson_obj_iter iter = yyjson_obj_iter_with(columns_val);
yyjson_val *column_key;
while ((column_key = yyjson_obj_iter_next(&iter))) {
const auto column_val = yyjson_obj_iter_get_val(column_key);
const auto column_name = yyjson_get_str(column_key);
auto &column = result->geometry_columns[column_name];
if (!yyjson_is_obj(column_val)) {
throw InvalidInputException("Geoparquet column '%s' is not an object", column_name);
}
// Parse the encoding
const auto encoding_val = yyjson_obj_get(column_val, "encoding");
if (!yyjson_is_str(encoding_val)) {
throw InvalidInputException("Geoparquet column '%s' does not have an encoding", column_name);
}
const auto encoding_str = yyjson_get_str(encoding_val);
if (strcmp(encoding_str, "WKB") == 0) {
column.geometry_encoding = GeoParquetColumnEncoding::WKB;
} else if (strcmp(encoding_str, "point") == 0) {
column.geometry_encoding = GeoParquetColumnEncoding::POINT;
} else if (strcmp(encoding_str, "linestring") == 0) {
column.geometry_encoding = GeoParquetColumnEncoding::LINESTRING;
} else if (strcmp(encoding_str, "polygon") == 0) {
column.geometry_encoding = GeoParquetColumnEncoding::POLYGON;
} else if (strcmp(encoding_str, "multipoint") == 0) {
column.geometry_encoding = GeoParquetColumnEncoding::MULTIPOINT;
} else if (strcmp(encoding_str, "multilinestring") == 0) {
column.geometry_encoding = GeoParquetColumnEncoding::MULTILINESTRING;
} else if (strcmp(encoding_str, "multipolygon") == 0) {
column.geometry_encoding = GeoParquetColumnEncoding::MULTIPOLYGON;
} else {
throw InvalidInputException("Geoparquet column '%s' has an unsupported encoding", column_name);
}
// Parse the geometry types
const auto geometry_types_val = yyjson_obj_get(column_val, "geometry_types");
if (!yyjson_is_arr(geometry_types_val)) {
throw InvalidInputException("Geoparquet column '%s' does not have geometry types", column_name);
}
// We don't care about the geometry types for now.
// TODO: Parse the bounding box and other metadata that might be useful.
// (Only encoding and geometry types are required to be present)
}
// Return the result
// Make sure to free the JSON document
yyjson_doc_free(geo_metadata);
return result;
} catch (...) {
// Make sure to free the JSON document in case of an exception
yyjson_doc_free(geo_metadata);
throw;
}
}
}
return nullptr;
}
void GeoParquetFileMetadata::AddGeoParquetStats(const string &column_name, const LogicalType &type,
const GeometryStatsData &stats) {
// Lock the metadata
lock_guard<mutex> glock(write_lock);
auto it = geometry_columns.find(column_name);
if (it == geometry_columns.end()) {
auto &column = geometry_columns[column_name];
column.stats.Merge(stats);
column.insertion_index = geometry_columns.size() - 1;
} else {
it->second.stats.Merge(stats);
}
}
void GeoParquetFileMetadata::Write(duckdb_parquet::FileMetaData &file_meta_data) {
// GeoParquet does not support M or ZM coordinates, so remove any columns that have them.
unordered_set<string> invalid_columns;
for (auto &column : geometry_columns) {
if (column.second.stats.extent.HasM()) {
invalid_columns.insert(column.first);
}
}
for (auto &col_name : invalid_columns) {
geometry_columns.erase(col_name);
}
// No columns remaining, nothing to write
if (geometry_columns.empty()) {
return;
}
// Find the primary geometry column
const auto &random_first_column = *geometry_columns.begin();
auto primary_geometry_column = random_first_column.first;
auto primary_insertion_index = random_first_column.second.insertion_index;
for (auto &column : geometry_columns) {
if (column.second.insertion_index < primary_insertion_index) {
primary_insertion_index = column.second.insertion_index;
primary_geometry_column = column.first;
}
}
yyjson_mut_doc *doc = yyjson_mut_doc_new(nullptr);
yyjson_mut_val *root = yyjson_mut_obj(doc);
yyjson_mut_doc_set_root(doc, root);
// Add the version
switch (version) {
case GeoParquetVersion::V1:
case GeoParquetVersion::BOTH:
yyjson_mut_obj_add_strcpy(doc, root, "version", "1.0.0");
break;
case GeoParquetVersion::V2:
yyjson_mut_obj_add_strcpy(doc, root, "version", "2.0.0");
break;
case GeoParquetVersion::NONE:
default:
// Should never happen, we should not be writing anything
yyjson_mut_doc_free(doc);
throw InternalException("GeoParquetVersion::NONE should not write metadata");
}
// Add the primary column
yyjson_mut_obj_add_strncpy(doc, root, "primary_column", primary_geometry_column.c_str(),
primary_geometry_column.size());
// Add the columns
const auto json_columns = yyjson_mut_obj_add_obj(doc, root, "columns");
for (auto &column : geometry_columns) {
const auto column_json = yyjson_mut_obj_add_obj(doc, json_columns, column.first.c_str());
yyjson_mut_obj_add_str(doc, column_json, "encoding", "WKB");
const auto geometry_types = yyjson_mut_obj_add_arr(doc, column_json, "geometry_types");
for (auto &type_name : column.second.stats.types.ToString(false)) {
yyjson_mut_arr_add_strcpy(doc, geometry_types, type_name.c_str());
}
const auto &bbox = column.second.stats.extent;
if (bbox.HasXY()) {
const auto bbox_arr = yyjson_mut_obj_add_arr(doc, column_json, "bbox");
if (!column.second.stats.extent.HasZ()) {
yyjson_mut_arr_add_real(doc, bbox_arr, bbox.x_min);
yyjson_mut_arr_add_real(doc, bbox_arr, bbox.y_min);
yyjson_mut_arr_add_real(doc, bbox_arr, bbox.x_max);
yyjson_mut_arr_add_real(doc, bbox_arr, bbox.y_max);
} else {
yyjson_mut_arr_add_real(doc, bbox_arr, bbox.x_min);
yyjson_mut_arr_add_real(doc, bbox_arr, bbox.y_min);
yyjson_mut_arr_add_real(doc, bbox_arr, bbox.z_min);
yyjson_mut_arr_add_real(doc, bbox_arr, bbox.x_max);
yyjson_mut_arr_add_real(doc, bbox_arr, bbox.y_max);
yyjson_mut_arr_add_real(doc, bbox_arr, bbox.z_max);
}
}
// If the CRS is present, add it
if (!column.second.projjson.empty()) {
const auto crs_doc = yyjson_read(column.second.projjson.c_str(), column.second.projjson.size(), 0);
if (!crs_doc) {
yyjson_mut_doc_free(doc);
throw InvalidInputException("Failed to parse CRS JSON");
}
const auto crs_root = yyjson_doc_get_root(crs_doc);
const auto crs_val = yyjson_val_mut_copy(doc, crs_root);
const auto crs_key = yyjson_mut_strcpy(doc, "projjson");
yyjson_mut_obj_add(column_json, crs_key, crs_val);
yyjson_doc_free(crs_doc);
}
}
yyjson_write_err err;
size_t len;
char *json = yyjson_mut_write_opts(doc, 0, nullptr, &len, &err);
if (!json) {
yyjson_mut_doc_free(doc);
throw SerializationException("Failed to write JSON string: %s", err.msg);
}
// Create a string from the JSON
duckdb_parquet::KeyValue kv;
kv.__set_key("geo");
kv.__set_value(string(json, len));
// Free the JSON and the document
free(json);
yyjson_mut_doc_free(doc);
file_meta_data.key_value_metadata.push_back(kv);
file_meta_data.__isset.key_value_metadata = true;
}
bool GeoParquetFileMetadata::IsGeometryColumn(const string &column_name) const {
return geometry_columns.find(column_name) != geometry_columns.end();
}
bool GeoParquetFileMetadata::IsGeoParquetConversionEnabled(const ClientContext &context) {
Value geoparquet_enabled;
if (!context.TryGetCurrentSetting("enable_geoparquet_conversion", geoparquet_enabled)) {
return false;
}
if (!geoparquet_enabled.GetValue<bool>()) {
// Disabled by setting
return false;
}
if (!context.db->ExtensionIsLoaded("spatial")) {
// The spatial extension is not loaded, so we can't convert anyway
return false;
}
return true;
}
LogicalType GeoParquetFileMetadata::GeometryType() {
auto blob_type = LogicalType(LogicalTypeId::BLOB);
blob_type.SetAlias("GEOMETRY");
return blob_type;
}
const unordered_map<string, GeoParquetColumnMetadata> &GeoParquetFileMetadata::GetColumnMeta() const {
return geometry_columns;
}
unique_ptr<ColumnReader> GeoParquetFileMetadata::CreateColumnReader(ParquetReader &reader,
const ParquetColumnSchema &schema,
ClientContext &context) {
// Get the catalog
auto &catalog = Catalog::GetSystemCatalog(context);
// WKB encoding
if (schema.children[0].type.id() == LogicalTypeId::BLOB) {
// Look for a conversion function in the catalog
auto &conversion_func_set =
catalog.GetEntry<ScalarFunctionCatalogEntry>(context, DEFAULT_SCHEMA, "st_geomfromwkb");
auto conversion_func = conversion_func_set.functions.GetFunctionByArguments(context, {LogicalType::BLOB});
// Create a bound function call expression
auto args = vector<unique_ptr<Expression>>();
args.push_back(make_uniq<BoundReferenceExpression>(LogicalType::BLOB, 0));
auto expr =
make_uniq<BoundFunctionExpression>(conversion_func.return_type, conversion_func, std::move(args), nullptr);
// Create a child reader
auto child_reader = ColumnReader::CreateReader(reader, schema.children[0]);
// Create an expression reader that applies the conversion function to the child reader
return make_uniq<ExpressionColumnReader>(context, std::move(child_reader), std::move(expr), schema);
}
// Otherwise, unrecognized encoding
throw NotImplementedException("Unsupported geometry encoding");
}
} // namespace duckdb

View File

@@ -0,0 +1,340 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "parquet_bss_decoder.hpp"
#include "parquet_statistics.hpp"
#include "parquet_types.h"
#include "resizable_buffer.hpp"
#include "thrift_tools.hpp"
#include "decoder/byte_stream_split_decoder.hpp"
#include "decoder/delta_binary_packed_decoder.hpp"
#include "decoder/dictionary_decoder.hpp"
#include "decoder/rle_decoder.hpp"
#include "decoder/delta_length_byte_array_decoder.hpp"
#include "decoder/delta_byte_array_decoder.hpp"
#include "parquet_column_schema.hpp"
#include "duckdb/common/operator/cast_operators.hpp"
#include "duckdb/common/types/string_type.hpp"
#include "duckdb/common/types/vector.hpp"
#include "duckdb/common/types/vector_cache.hpp"
namespace duckdb {
class ParquetReader;
struct TableFilterState;
using duckdb_apache::thrift::protocol::TProtocol;
using duckdb_parquet::ColumnChunk;
using duckdb_parquet::CompressionCodec;
using duckdb_parquet::FieldRepetitionType;
using duckdb_parquet::PageHeader;
using duckdb_parquet::SchemaElement;
using duckdb_parquet::Type;
enum class ColumnEncoding {
INVALID,
DICTIONARY,
DELTA_BINARY_PACKED,
RLE,
DELTA_LENGTH_BYTE_ARRAY,
DELTA_BYTE_ARRAY,
BYTE_STREAM_SPLIT,
PLAIN
};
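//! ColumnReader scans a single column chunk of a row group: it reads and decompresses the data
//! pages, decodes the repetition/definition levels, and dispatches the values to the
//! encoding-specific decoders declared above.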
class ColumnReader {
friend class ByteStreamSplitDecoder;
friend class DeltaBinaryPackedDecoder;
friend class DeltaByteArrayDecoder;
friend class DeltaLengthByteArrayDecoder;
friend class DictionaryDecoder;
friend class RLEDecoder;
public:
ColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema_p);
virtual ~ColumnReader();
public:
static unique_ptr<ColumnReader> CreateReader(ParquetReader &reader, const ParquetColumnSchema &schema);
virtual void InitializeRead(idx_t row_group_index, const vector<ColumnChunk> &columns, TProtocol &protocol_p);
virtual idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out);
virtual void Select(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out,
const SelectionVector &sel, idx_t approved_tuple_count);
virtual void Filter(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out,
const TableFilter &filter, TableFilterState &filter_state, SelectionVector &sel,
idx_t &approved_tuple_count, bool is_first_filter);
static void ApplyFilter(Vector &v, const TableFilter &filter, TableFilterState &filter_state, idx_t scan_count,
SelectionVector &sel, idx_t &approved_tuple_count);
virtual void Skip(idx_t num_values);
ParquetReader &Reader();
const LogicalType &Type() const {
return column_schema.type;
}
const ParquetColumnSchema &Schema() const {
return column_schema;
}
inline idx_t ColumnIndex() const {
return column_schema.column_index;
}
inline idx_t MaxDefine() const {
return column_schema.max_define;
}
idx_t MaxRepeat() const {
return column_schema.max_repeat;
}
virtual idx_t FileOffset() const;
virtual uint64_t TotalCompressedSize();
virtual idx_t GroupRowsAvailable();
// register the range this reader will touch for prefetching
virtual void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge);
unique_ptr<BaseStatistics> Stats(idx_t row_group_idx_p, const vector<ColumnChunk> &columns);
template <class VALUE_TYPE, class CONVERSION, bool HAS_DEFINES>
void PlainTemplatedDefines(ByteBuffer &plain_data, const uint8_t *defines, uint64_t num_values, idx_t result_offset,
Vector &result) {
if (CONVERSION::PlainAvailable(plain_data, num_values)) {
PlainTemplatedInternal<VALUE_TYPE, CONVERSION, HAS_DEFINES, false>(plain_data, defines, num_values,
result_offset, result);
} else {
PlainTemplatedInternal<VALUE_TYPE, CONVERSION, HAS_DEFINES, true>(plain_data, defines, num_values,
result_offset, result);
}
}
template <class VALUE_TYPE, class CONVERSION>
void PlainTemplated(ByteBuffer &plain_data, const uint8_t *defines, uint64_t num_values, idx_t result_offset,
Vector &result) {
if (HasDefines() && defines) {
PlainTemplatedDefines<VALUE_TYPE, CONVERSION, true>(plain_data, defines, num_values, result_offset, result);
} else {
PlainTemplatedDefines<VALUE_TYPE, CONVERSION, false>(plain_data, defines, num_values, result_offset,
result);
}
}
template <class CONVERSION, bool HAS_DEFINES>
void PlainSkipTemplatedDefines(ByteBuffer &plain_data, const uint8_t *defines, uint64_t num_values) {
if (CONVERSION::PlainAvailable(plain_data, num_values)) {
PlainSkipTemplatedInternal<CONVERSION, HAS_DEFINES, false>(plain_data, defines, num_values);
} else {
PlainSkipTemplatedInternal<CONVERSION, HAS_DEFINES, true>(plain_data, defines, num_values);
}
}
template <class CONVERSION>
void PlainSkipTemplated(ByteBuffer &plain_data, const uint8_t *defines, uint64_t num_values) {
if (HasDefines() && defines) {
PlainSkipTemplatedDefines<CONVERSION, true>(plain_data, defines, num_values);
} else {
PlainSkipTemplatedDefines<CONVERSION, false>(plain_data, defines, num_values);
}
}
template <class VALUE_TYPE, class CONVERSION>
void PlainSelectTemplated(ByteBuffer &plain_data, const uint8_t *defines, uint64_t num_values, Vector &result,
const SelectionVector &sel, idx_t approved_tuple_count) {
if (HasDefines() && defines) {
PlainSelectTemplatedInternal<VALUE_TYPE, CONVERSION, true, true>(plain_data, defines, num_values, result,
sel, approved_tuple_count);
} else {
PlainSelectTemplatedInternal<VALUE_TYPE, CONVERSION, false, true>(plain_data, defines, num_values, result,
sel, approved_tuple_count);
}
}
idx_t GetValidCount(uint8_t *defines, idx_t count, idx_t offset = 0) const {
if (!defines) {
return count;
}
idx_t valid_count = 0;
for (idx_t i = offset; i < offset + count; i++) {
valid_count += defines[i] == MaxDefine();
}
return valid_count;
}
protected:
virtual bool SupportsDirectFilter() const {
return false;
}
virtual bool SupportsDirectSelect() const {
return false;
}
void DirectFilter(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out,
const TableFilter &filter, TableFilterState &filter_state, SelectionVector &sel,
idx_t &approved_tuple_count);
void DirectSelect(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result,
const SelectionVector &sel, idx_t approved_tuple_count);
private:
//! Check if a previous table filter has filtered out this page
bool PageIsFilteredOut(PageHeader &page_hdr);
void BeginRead(data_ptr_t define_out, data_ptr_t repeat_out);
void FinishRead(idx_t read_count);
idx_t ReadPageHeaders(idx_t max_read, optional_ptr<const TableFilter> filter = nullptr,
optional_ptr<TableFilterState> filter_state = nullptr);
idx_t ReadInternal(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result);
//! Prepare a read of up to "read_count" rows and read the defines/repeats.
//! Returns whether all values are valid (i.e., not NULL)
bool PrepareRead(idx_t read_count, data_ptr_t define_out, data_ptr_t repeat_out, idx_t result_offset);
void ReadData(idx_t read_now, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result, idx_t result_offset);
template <class VALUE_TYPE, class CONVERSION, bool HAS_DEFINES, bool CHECKED>
void PlainTemplatedInternal(ByteBuffer &plain_data, const uint8_t *__restrict defines, const uint64_t num_values,
const idx_t result_offset, Vector &result) {
const auto result_ptr = FlatVector::GetData<VALUE_TYPE>(result);
if (!HAS_DEFINES && !CHECKED && CONVERSION::PlainConstantSize() == sizeof(VALUE_TYPE)) {
// we can memcpy
idx_t copy_count = num_values * CONVERSION::PlainConstantSize();
memcpy(result_ptr + result_offset, plain_data.ptr, copy_count);
plain_data.unsafe_inc(copy_count);
return;
}
auto &result_mask = FlatVector::Validity(result);
for (idx_t row_idx = result_offset; row_idx < result_offset + num_values; row_idx++) {
if (HAS_DEFINES && defines[row_idx] != MaxDefine()) {
result_mask.SetInvalid(row_idx);
continue;
}
result_ptr[row_idx] = CONVERSION::template PlainRead<CHECKED>(plain_data, *this);
}
}
template <class CONVERSION, bool HAS_DEFINES, bool CHECKED>
void PlainSkipTemplatedInternal(ByteBuffer &plain_data, const uint8_t *__restrict defines,
const uint64_t num_values, idx_t row_offset = 0) {
if (!HAS_DEFINES && CONVERSION::PlainConstantSize() > 0) {
if (CHECKED) {
plain_data.inc(num_values * CONVERSION::PlainConstantSize());
} else {
plain_data.unsafe_inc(num_values * CONVERSION::PlainConstantSize());
}
return;
}
for (idx_t row_idx = row_offset; row_idx < row_offset + num_values; row_idx++) {
if (HAS_DEFINES && defines[row_idx] != MaxDefine()) {
continue;
}
CONVERSION::template PlainSkip<CHECKED>(plain_data, *this);
}
}
template <class VALUE_TYPE, class CONVERSION, bool HAS_DEFINES, bool CHECKED>
void PlainSelectTemplatedInternal(ByteBuffer &plain_data, const uint8_t *__restrict defines,
const uint64_t num_values, Vector &result, const SelectionVector &sel,
idx_t approved_tuple_count) {
const auto result_ptr = FlatVector::GetData<VALUE_TYPE>(result);
auto &result_mask = FlatVector::Validity(result);
idx_t current_entry = 0;
for (idx_t i = 0; i < approved_tuple_count; i++) {
auto next_entry = sel.get_index(i);
D_ASSERT(current_entry <= next_entry);
// perform any skips forward if required
PlainSkipTemplatedInternal<CONVERSION, HAS_DEFINES, CHECKED>(plain_data, defines,
next_entry - current_entry, current_entry);
// read this row
if (HAS_DEFINES && defines[next_entry] != MaxDefine()) {
result_mask.SetInvalid(next_entry);
} else {
result_ptr[next_entry] = CONVERSION::template PlainRead<CHECKED>(plain_data, *this);
}
current_entry = next_entry + 1;
}
if (current_entry < num_values) {
// skip forward to the end of where we are selecting
PlainSkipTemplatedInternal<CONVERSION, HAS_DEFINES, CHECKED>(plain_data, defines,
num_values - current_entry, current_entry);
}
}
protected:
Allocator &GetAllocator();
// readers that use the default Read() need to implement these
virtual void PlainSkip(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values);
virtual void Plain(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values, idx_t result_offset, Vector &result);
virtual void Plain(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values,
idx_t result_offset, Vector &result);
virtual void PlainSelect(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values,
Vector &result, const SelectionVector &sel, idx_t count);
// applies any skips that were registered using Skip()
virtual void ApplyPendingSkips(data_ptr_t define_out, data_ptr_t repeat_out);
inline bool HasDefines() const {
return MaxDefine() > 0;
}
inline bool HasRepeats() const {
return MaxRepeat() > 0;
}
protected:
const ParquetColumnSchema &column_schema;
ParquetReader &reader;
idx_t pending_skips = 0;
bool page_is_filtered_out = false;
virtual void ResetPage();
private:
void AllocateBlock(idx_t size);
void PrepareRead(optional_ptr<const TableFilter> filter, optional_ptr<TableFilterState> filter_state);
void PreparePage(PageHeader &page_hdr);
void PrepareDataPage(PageHeader &page_hdr);
void PreparePageV2(PageHeader &page_hdr);
void DecompressInternal(CompressionCodec::type codec, const_data_ptr_t src, idx_t src_size, data_ptr_t dst,
idx_t dst_size);
const ColumnChunk *chunk = nullptr;
TProtocol *protocol;
idx_t page_rows_available;
idx_t group_rows_available;
idx_t chunk_read_offset;
shared_ptr<ResizeableBuffer> block;
ColumnEncoding encoding = ColumnEncoding::INVALID;
unique_ptr<RleBpDecoder> defined_decoder;
unique_ptr<RleBpDecoder> repeated_decoder;
DictionaryDecoder dictionary_decoder;
DeltaBinaryPackedDecoder delta_binary_packed_decoder;
RLEDecoder rle_decoder;
DeltaLengthByteArrayDecoder delta_length_byte_array_decoder;
DeltaByteArrayDecoder delta_byte_array_decoder;
ByteStreamSplitDecoder byte_stream_split_decoder;
//! Resizeable buffers used for the various encodings above
ResizeableBuffer encoding_buffers[2];
public:
template <class TARGET>
TARGET &Cast() {
if (TARGET::TYPE != PhysicalType::INVALID && Type().InternalType() != TARGET::TYPE) {
throw InternalException("Failed to cast column reader to type - type mismatch");
}
return reinterpret_cast<TARGET &>(*this);
}
template <class TARGET>
const TARGET &Cast() const {
if (TARGET::TYPE != PhysicalType::INVALID && Type().InternalType() != TARGET::TYPE) {
throw InternalException("Failed to cast column reader to type - type mismatch");
}
return reinterpret_cast<const TARGET &>(*this);
}
};
} // namespace duckdb

View File

@@ -0,0 +1,145 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "parquet_types.h"
#include "parquet_column_schema.hpp"
namespace duckdb {
class MemoryStream;
class ParquetWriter;
class ColumnWriterPageState;
class PrimitiveColumnWriterState;
struct ChildFieldIDs;
struct ShreddingType;
class ResizeableBuffer;
class ParquetBloomFilter;
class ColumnWriterState {
public:
virtual ~ColumnWriterState();
unsafe_vector<uint16_t> definition_levels;
unsafe_vector<uint16_t> repetition_levels;
unsafe_vector<uint8_t> is_empty;
idx_t parent_null_count = 0;
idx_t null_count = 0;
public:
template <class TARGET>
TARGET &Cast() {
DynamicCastCheck<TARGET>(this);
return reinterpret_cast<TARGET &>(*this);
}
template <class TARGET>
const TARGET &Cast() const {
D_ASSERT(dynamic_cast<const TARGET *>(this));
return reinterpret_cast<const TARGET &>(*this);
}
};
class ColumnWriterPageState {
public:
virtual ~ColumnWriterPageState() {
}
public:
template <class TARGET>
TARGET &Cast() {
DynamicCastCheck<TARGET>(this);
return reinterpret_cast<TARGET &>(*this);
}
template <class TARGET>
const TARGET &Cast() const {
D_ASSERT(dynamic_cast<const TARGET *>(this));
return reinterpret_cast<const TARGET &>(*this);
}
};
class ColumnWriter {
protected:
static constexpr uint16_t PARQUET_DEFINE_VALID = UINT16_C(65535);
public:
ColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path,
bool can_have_nulls);
virtual ~ColumnWriter();
public:
const LogicalType &Type() const {
return column_schema.type;
}
const ParquetColumnSchema &Schema() const {
return column_schema;
}
inline idx_t SchemaIndex() const {
return column_schema.schema_index;
}
inline idx_t MaxDefine() const {
return column_schema.max_define;
}
idx_t MaxRepeat() const {
return column_schema.max_repeat;
}
static ParquetColumnSchema FillParquetSchema(vector<duckdb_parquet::SchemaElement> &schemas,
const LogicalType &type, const string &name, bool allow_geometry,
optional_ptr<const ChildFieldIDs> field_ids,
optional_ptr<const ShreddingType> shredding_types,
idx_t max_repeat = 0, idx_t max_define = 1,
bool can_have_nulls = true);
//! Create the column writer for a specific type recursively
static unique_ptr<ColumnWriter> CreateWriterRecursive(ClientContext &context, ParquetWriter &writer,
const vector<duckdb_parquet::SchemaElement> &parquet_schemas,
const ParquetColumnSchema &schema,
vector<string> path_in_schema);
virtual unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) = 0;
//! Indicates whether the writer needs to analyze the data before preparing it
virtual bool HasAnalyze() {
return false;
}
virtual void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) {
throw NotImplementedException("Writer does not need analysis");
}
//! Called after all data has been passed to Analyze
virtual void FinalizeAnalyze(ColumnWriterState &state) {
throw NotImplementedException("Writer does not need analysis");
}
virtual void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) = 0;
virtual void BeginWrite(ColumnWriterState &state) = 0;
virtual void Write(ColumnWriterState &state, Vector &vector, idx_t count) = 0;
virtual void FinalizeWrite(ColumnWriterState &state) = 0;
protected:
void HandleDefineLevels(ColumnWriterState &state, ColumnWriterState *parent, const ValidityMask &validity,
const idx_t count, const uint16_t define_value, const uint16_t null_value) const;
void HandleRepeatLevels(ColumnWriterState &state_p, ColumnWriterState *parent, idx_t count) const;
void CompressPage(MemoryStream &temp_writer, size_t &compressed_size, data_ptr_t &compressed_data,
AllocatedData &compressed_buf);
public:
ParquetWriter &writer;
const ParquetColumnSchema &column_schema;
vector<string> schema_path;
bool can_have_nulls;
protected:
vector<unique_ptr<ColumnWriter>> child_writers;
};
} // namespace duckdb

View File

@@ -0,0 +1,221 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// decode_utils.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb/common/fast_mem.hpp"
#include "duckdb/common/bitpacking.hpp"
#include "resizable_buffer.hpp"
namespace duckdb {
class ParquetDecodeUtils {
//===--------------------------------------------------------------------===//
// Bitpacking
//===--------------------------------------------------------------------===//
private:
static const uint64_t BITPACK_MASKS[];
static const uint64_t BITPACK_MASKS_SIZE;
static const uint8_t BITPACK_DLEN;
static void CheckWidth(const uint8_t width) {
if (width >= BITPACK_MASKS_SIZE) {
throw InvalidInputException("The width (%d) of the bitpacked data exceeds the supported max width (%d), "
"the file might be corrupted.",
width, BITPACK_MASKS_SIZE);
}
}
public:
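// BitUnpack reads `count` values of `width` bits each from the buffer, carrying the current bit
// offset in `bitpack_pos` across calls; when the read starts on a byte boundary and spans at least
// one bitpacking group, the bulk is handled by the vectorized aligned path.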
template <class T>
static void BitUnpack(ByteBuffer &src, bitpacking_width_t &bitpack_pos, T *dst, idx_t count,
const bitpacking_width_t width) {
CheckWidth(width);
const auto mask = BITPACK_MASKS[width];
src.available(count * width / BITPACK_DLEN); // check if buffer has enough space available once
if (bitpack_pos == 0 && count >= BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) {
idx_t remainder = count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE;
idx_t aligned_count = count - remainder;
BitUnpackAlignedInternal(src, dst, aligned_count, width);
dst += aligned_count;
count = remainder;
}
for (idx_t i = 0; i < count; i++) {
auto val = (src.unsafe_get<uint8_t>() >> bitpack_pos) & mask;
bitpack_pos += width;
while (bitpack_pos > BITPACK_DLEN) {
src.unsafe_inc(1);
val |= (static_cast<T>(src.unsafe_get<uint8_t>())
<< static_cast<T>(BITPACK_DLEN - (bitpack_pos - width))) &
mask;
bitpack_pos -= BITPACK_DLEN;
}
dst[i] = val;
}
}
static void Skip(ByteBuffer &src, bitpacking_width_t &bitpack_pos, idx_t count, const bitpacking_width_t width) {
CheckWidth(width);
src.available(count * width / BITPACK_DLEN); // check if buffer has enough space available once
if (bitpack_pos == 0 && count >= BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) {
idx_t remainder = count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE;
idx_t aligned_count = count - remainder;
SkipAligned(src, aligned_count, width);
count = remainder;
}
// FIXME: we should be able to just do this in one go instead of having this loop
for (idx_t i = 0; i < count; i++) {
bitpack_pos += width;
while (bitpack_pos > BITPACK_DLEN) {
src.unsafe_inc(1);
bitpack_pos -= BITPACK_DLEN;
}
}
}
template <class T>
static void BitPackAligned(T *src, data_ptr_t dst, const idx_t count, const bitpacking_width_t width) {
D_ASSERT(width < BITPACK_MASKS_SIZE);
D_ASSERT(count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0);
BitpackingPrimitives::PackBuffer<T, true>(dst, src, count, width);
}
template <class T>
static void BitUnpackAlignedInternal(ByteBuffer &src, T *dst, const idx_t count, const bitpacking_width_t width) {
D_ASSERT(count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0);
if (cast_pointer_to_uint64(src.ptr) % sizeof(T) == 0) {
// Fast path: aligned
BitpackingPrimitives::UnPackBuffer<T>(data_ptr_cast(dst), src.ptr, count, width);
src.unsafe_inc(count * width / BITPACK_DLEN);
return;
}
for (idx_t i = 0; i < count; i += BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) {
const auto next_read = BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE * width / BITPACK_DLEN;
// Buffer for alignment
T aligned_data[BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE];
// Copy over to aligned buffer
FastMemcpy(aligned_data, src.ptr, next_read);
// Unpack
BitpackingPrimitives::UnPackBlock<T>(data_ptr_cast(dst), data_ptr_cast(aligned_data), width, true);
src.unsafe_inc(next_read);
dst += BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE;
}
}
template <class T>
static void BitUnpackAligned(ByteBuffer &src, T *dst, const idx_t count, const bitpacking_width_t width) {
CheckWidth(width);
if (count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE != 0) {
throw InvalidInputException("Aligned bitpacking count must be a multiple of %llu",
BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE);
}
const auto read_size = count * width / BITPACK_DLEN;
src.available(read_size); // check if buffer has enough space available once
BitUnpackAlignedInternal(src, dst, count, width);
}
static void SkipAligned(ByteBuffer &src, const idx_t count, const bitpacking_width_t width) {
CheckWidth(width);
if (count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE != 0) {
throw InvalidInputException("Aligned bitpacking count must be a multiple of %llu",
BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE);
}
const auto read_size = count * width / BITPACK_DLEN;
src.inc(read_size);
}
//===--------------------------------------------------------------------===//
// Zigzag
//===--------------------------------------------------------------------===//
private:
//! https://lemire.me/blog/2022/11/25/making-all-your-integers-positive-with-zigzag-encoding/
template <class UNSIGNED>
static typename std::enable_if<std::is_unsigned<UNSIGNED>::value, typename std::make_signed<UNSIGNED>::type>::type
ZigzagToIntInternal(UNSIGNED x) {
return (x >> 1) ^ (-(x & 1));
}
template <typename SIGNED>
static typename std::enable_if<std::is_signed<SIGNED>::value, typename std::make_unsigned<SIGNED>::type>::type
IntToZigzagInternal(SIGNED x) {
using UNSIGNED = typename std::make_unsigned<SIGNED>::type;
return (static_cast<UNSIGNED>(x) << 1) ^ static_cast<UNSIGNED>(x >> (sizeof(SIGNED) * 8 - 1));
}
public:
template <class UNSIGNED>
static typename std::enable_if<std::is_unsigned<UNSIGNED>::value, typename std::make_signed<UNSIGNED>::type>::type
ZigzagToInt(UNSIGNED x) {
auto integer = ZigzagToIntInternal(x);
D_ASSERT(x == IntToZigzagInternal(integer)); // test roundtrip
return integer;
}
template <typename SIGNED>
static typename std::enable_if<std::is_signed<SIGNED>::value, typename std::make_unsigned<SIGNED>::type>::type
IntToZigzag(SIGNED x) {
auto zigzag = IntToZigzagInternal(x);
D_ASSERT(x == ZigzagToIntInternal(zigzag)); // test roundtrip
return zigzag;
}
//===--------------------------------------------------------------------===//
// Varint
//===--------------------------------------------------------------------===//
public:
template <class T>
static uint8_t GetVarintSize(T val) {
uint8_t res = 0;
do {
val >>= 7;
res++;
} while (val != 0);
return res;
}
template <class T>
static void VarintEncode(T val, WriteStream &ser) {
do {
uint8_t byte = val & 127;
val >>= 7;
if (val != 0) {
byte |= 128;
}
ser.Write<uint8_t>(byte);
} while (val != 0);
}
template <class T, bool CHECKED = true>
static T VarintDecode(ByteBuffer &buf) {
T result = 0;
uint8_t shift = 0;
while (true) {
uint8_t byte;
if (CHECKED) {
byte = buf.read<uint8_t>();
} else {
byte = buf.unsafe_read<uint8_t>();
}
result |= T(byte & 127) << shift;
if ((byte & 128) == 0) {
break;
}
shift += 7;
if (shift > sizeof(T) * 8) {
throw std::runtime_error("Varint-decoding found too large number");
}
}
return result;
}
};
} // namespace duckdb
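
For reference, a minimal standalone sketch of the two encodings above, zigzag followed by ULEB128 varints, written outside of the ParquetDecodeUtils class. The helper names and the test value are illustrative only, and the decoder omits the overflow check that the class performs.

#include <cassert>
#include <cstdint>
#include <vector>

// Zigzag: map signed integers to unsigned so small magnitudes stay small.
static uint64_t IntToZigzag(int64_t x) {
    return (static_cast<uint64_t>(x) << 1) ^ static_cast<uint64_t>(x >> 63);
}
static int64_t ZigzagToInt(uint64_t z) {
    // (~(z & 1) + 1) is 0 for even z and all-ones for odd z (i.e. -(z & 1))
    return static_cast<int64_t>((z >> 1) ^ (~(z & 1) + 1));
}

// ULEB128 varint: 7 payload bits per byte, high bit marks continuation.
static void VarintEncode(uint64_t v, std::vector<uint8_t> &out) {
    do {
        uint8_t byte = v & 0x7F;
        v >>= 7;
        if (v != 0) {
            byte |= 0x80;
        }
        out.push_back(byte);
    } while (v != 0);
}
static uint64_t VarintDecode(const std::vector<uint8_t> &in, size_t &pos) {
    uint64_t result = 0;
    uint8_t shift = 0;
    while (true) {
        uint8_t byte = in[pos++];
        result |= static_cast<uint64_t>(byte & 0x7F) << shift;
        if ((byte & 0x80) == 0) {
            break;
        }
        shift += 7; // no overflow check in this sketch
    }
    return result;
}

int main() {
    int64_t value = -1234;
    std::vector<uint8_t> buf;
    VarintEncode(IntToZigzag(value), buf); // -1234 -> zigzag 2467 -> 2 bytes
    size_t pos = 0;
    assert(ZigzagToInt(VarintDecode(buf, pos)) == value);
    return 0;
}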

View File

@@ -0,0 +1,32 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// decoder/byte_stream_split_decoder.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "parquet_bss_decoder.hpp"
namespace duckdb {
class ColumnReader;
class ByteStreamSplitDecoder {
public:
explicit ByteStreamSplitDecoder(ColumnReader &reader);
public:
void InitializePage();
void Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset);
void Skip(uint8_t *defines, idx_t skip_count);
private:
ColumnReader &reader;
ResizeableBuffer &decoded_data_buffer;
unique_ptr<BssDecoder> bss_decoder;
};
} // namespace duckdb

View File

@@ -0,0 +1,33 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// decoder/delta_binary_packed_decoder.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "parquet_dbp_decoder.hpp"
#include "resizable_buffer.hpp"
namespace duckdb {
class ColumnReader;
class DeltaBinaryPackedDecoder {
public:
explicit DeltaBinaryPackedDecoder(ColumnReader &reader);
public:
void InitializePage();
void Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset);
void Skip(uint8_t *defines, idx_t skip_count);
private:
ColumnReader &reader;
ResizeableBuffer &decoded_data_buffer;
unique_ptr<DbpDecoder> dbp_decoder;
};
} // namespace duckdb

View File

@@ -0,0 +1,38 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// decoder/delta_byte_array_decoder.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "parquet_dbp_decoder.hpp"
#include "resizable_buffer.hpp"
namespace duckdb {
class ColumnReader;
class DeltaByteArrayDecoder {
public:
explicit DeltaByteArrayDecoder(ColumnReader &reader);
public:
void InitializePage();
void Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset);
void Skip(uint8_t *defines, idx_t skip_count);
static void ReadDbpData(Allocator &allocator, ResizeableBuffer &buffer, ResizeableBuffer &result_buffer,
idx_t &value_count);
private:
ColumnReader &reader;
unique_ptr<Vector> byte_array_data;
idx_t byte_array_count = 0;
idx_t delta_offset = 0;
};
} // namespace duckdb

View File

@@ -0,0 +1,43 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// decoder/delta_length_byte_array_decoder.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "parquet_dbp_decoder.hpp"
#include "resizable_buffer.hpp"
namespace duckdb {
class ColumnReader;
class DeltaLengthByteArrayDecoder {
public:
explicit DeltaLengthByteArrayDecoder(ColumnReader &reader);
public:
void InitializePage();
void Read(shared_ptr<ResizeableBuffer> &block, uint8_t *defines, idx_t read_count, Vector &result,
idx_t result_offset);
void Skip(uint8_t *defines, idx_t skip_count);
private:
template <bool HAS_DEFINES>
void ReadInternal(shared_ptr<ResizeableBuffer> &block, uint8_t *defines, idx_t read_count, Vector &result,
idx_t result_offset);
template <bool HAS_DEFINES>
void SkipInternal(uint8_t *defines, idx_t skip_count);
private:
ColumnReader &reader;
ResizeableBuffer &length_buffer;
idx_t byte_array_count = 0;
idx_t length_idx;
};
} // namespace duckdb

View File

@@ -0,0 +1,56 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// decoder/dictionary_decoder.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "parquet_rle_bp_decoder.hpp"
#include "resizable_buffer.hpp"
namespace duckdb {
class ColumnReader;
struct TableFilterState;
class DictionaryDecoder {
public:
explicit DictionaryDecoder(ColumnReader &reader);
public:
void InitializeDictionary(idx_t dictionary_size, optional_ptr<const TableFilter> filter,
optional_ptr<TableFilterState> filter_state, bool has_defines);
void InitializePage();
idx_t Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset);
void Skip(uint8_t *defines, idx_t skip_count);
bool CanFilter(const TableFilter &filter, TableFilterState &filter_state);
bool DictionarySupportsFilter(const TableFilter &filter, TableFilterState &filter_state);
void Filter(uint8_t *defines, idx_t read_count, Vector &result, SelectionVector &sel, idx_t &approved_tuple_count);
bool HasFilter() const {
return filter_result.get();
}
bool HasFilteredOutAllValues() const {
return HasFilter() && filter_count == 0;
}
private:
idx_t GetValidValues(uint8_t *defines, idx_t read_count, idx_t result_offset);
void ConvertDictToSelVec(uint32_t *offsets, const SelectionVector &rows, idx_t count);
private:
ColumnReader &reader;
ResizeableBuffer &offset_buffer;
unique_ptr<RleBpDecoder> dict_decoder;
SelectionVector valid_sel;
SelectionVector dictionary_selection_vector;
idx_t dictionary_size;
buffer_ptr<VectorChildBuffer> dictionary;
unsafe_unique_array<bool> filter_result;
idx_t filter_count;
bool can_have_nulls;
};
} // namespace duckdb
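
The filter members above (filter_result, filter_count) exist so that a table filter can be evaluated once per dictionary entry rather than once per row. A simplified sketch of that idea, using plain STL containers and a hard-coded predicate instead of DuckDB vectors, selection vectors and TableFilter objects:

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

int main() {
    // Dictionary page: distinct values; data page: one dictionary index per row.
    std::vector<std::string> dictionary = {"NL", "DE", "US", "FR"};
    std::vector<uint32_t> row_indices = {2, 0, 0, 3, 2, 1};

    // Evaluate the predicate once per dictionary entry instead of once per row.
    std::vector<bool> dict_passes(dictionary.size());
    for (size_t i = 0; i < dictionary.size(); i++) {
        dict_passes[i] = (dictionary[i] == "US"); // e.g. WHERE country = 'US'
    }

    // Row-level filtering is then just an index lookup.
    std::vector<size_t> selection;
    for (size_t row = 0; row < row_indices.size(); row++) {
        if (dict_passes[row_indices[row]]) {
            selection.push_back(row);
        }
    }
    assert(selection.size() == 2); // rows 0 and 4 reference "US"
    return 0;
}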

View File

@@ -0,0 +1,32 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// decoder/rle_decoder.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "parquet_rle_bp_decoder.hpp"
namespace duckdb {
class ColumnReader;
class RLEDecoder {
public:
explicit RLEDecoder(ColumnReader &reader);
public:
void InitializePage();
void Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset);
void Skip(uint8_t *defines, idx_t skip_count);
private:
ColumnReader &reader;
ResizeableBuffer &decoded_data_buffer;
unique_ptr<RleBpDecoder> rle_decoder;
};
} // namespace duckdb
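
Both RLE decoders consume Parquet's RLE/bit-packing hybrid format. Below is a hand-decoded sketch of a tiny stream, assuming bit width 1 (as used for the define levels of a non-nested nullable column) and run headers small enough to fit in a single varint byte; the byte values are made up for illustration.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    // Stream: [0x08, 0x01, 0x03, 0b10110100]
    //  0x08 -> indicator 8, LSB 0 => repeated run of 8 >> 1 = 4 values,
    //          followed by the value in ceil(1 / 8) = 1 byte: 0x01.
    //  0x03 -> indicator 3, LSB 1 => literal run of 3 >> 1 = 1 group of 8
    //          bit-packed values, stored LSB-first in the next byte.
    std::vector<uint8_t> stream = {0x08, 0x01, 0x03, 0b10110100};
    std::vector<uint8_t> values;

    size_t pos = 0;
    while (pos < stream.size()) {
        uint8_t indicator = stream[pos++]; // varints <= 127 fit in one byte
        if ((indicator & 1) == 0) {
            uint32_t count = indicator >> 1;
            uint8_t value = stream[pos++]; // bit width 1 -> value stored in 1 byte
            values.insert(values.end(), count, value);
        } else {
            uint32_t groups = indicator >> 1;
            for (uint32_t g = 0; g < groups; g++) {
                uint8_t packed = stream[pos++];
                for (int bit = 0; bit < 8; bit++) {
                    values.push_back((packed >> bit) & 1);
                }
            }
        }
    }
    assert(values.size() == 12);
    assert(values[0] == 1 && values[4] == 0 && values[6] == 1);
    return 0;
}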

View File

@@ -0,0 +1,102 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// geo_parquet.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_writer.hpp"
#include "duckdb/common/string.hpp"
#include "duckdb/common/types/data_chunk.hpp"
#include "duckdb/common/unordered_map.hpp"
#include "duckdb/common/unordered_set.hpp"
#include "parquet_types.h"
namespace duckdb {
struct ParquetColumnSchema;
class ParquetReader;
class ColumnReader;
class ClientContext;
class ExpressionExecutor;
enum class GeoParquetColumnEncoding : uint8_t {
WKB = 1,
POINT,
LINESTRING,
POLYGON,
MULTIPOINT,
MULTILINESTRING,
MULTIPOLYGON,
};
enum class GeoParquetVersion : uint8_t {
// Write GeoParquet 1.0 metadata
// GeoParquet 1.0 has the widest support among readers and writers
V1,
// Write GeoParquet 2.0
// The GeoParquet 2.0 options is identical to GeoParquet 1.0 except the underlying storage
// of spatial columns is Parquet native geometry, where the Parquet writer will include
// native statistics according to the underlying Parquet options. Compared to 'BOTH', this will
// actually write the metadata as containing GeoParquet version 2.0.0
// However, V2 isn't standardized yet, so this option is still a bit experimental
V2,
// Write GeoParquet 1.0 metadata, with native Parquet geometry types
// This is a bit of a hold-over option for compatibility with systems that
// reject GeoParquet 2.0 metadata, but can read Parquet native geometry types as they simply ignore the extra
// logical type. DuckDB v1.4.0 falls into this category.
BOTH,
// Do not write GeoParquet metadata
// This option suppresses GeoParquet metadata; however, spatial types will be written as
// Parquet native Geometry/Geography.
NONE,
};
struct GeoParquetColumnMetadata {
// The encoding of the geometry column
GeoParquetColumnEncoding geometry_encoding;
// The statistics of the geometry column
GeometryStatsData stats;
// The crs of the geometry column (if any) in PROJJSON format
string projjson;
// Used to track the "primary" geometry column (if any)
idx_t insertion_index = 0;
};
class GeoParquetFileMetadata {
public:
explicit GeoParquetFileMetadata(GeoParquetVersion geo_parquet_version) : version(geo_parquet_version) {
}
void AddGeoParquetStats(const string &column_name, const LogicalType &type, const GeometryStatsData &stats);
void Write(duckdb_parquet::FileMetaData &file_meta_data);
// Try to read GeoParquet metadata. Returns nullptr if not found, invalid or the required spatial extension is not
// available.
static unique_ptr<GeoParquetFileMetadata> TryRead(const duckdb_parquet::FileMetaData &file_meta_data,
const ClientContext &context);
const unordered_map<string, GeoParquetColumnMetadata> &GetColumnMeta() const;
static unique_ptr<ColumnReader> CreateColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
ClientContext &context);
bool IsGeometryColumn(const string &column_name) const;
static bool IsGeoParquetConversionEnabled(const ClientContext &context);
static LogicalType GeometryType();
private:
mutex write_lock;
unordered_map<string, GeoParquetColumnMetadata> geometry_columns;
GeoParquetVersion version;
};
} // namespace duckdb

View File

@@ -0,0 +1,196 @@
[
{
"class": "ParquetColumnDefinition",
"includes": [
"parquet_reader.hpp"
],
"members": [
{
"id": 100,
"name": "field_id",
"type": "int32_t"
},
{
"id": 101,
"name": "name",
"type": "string"
},
{
"id": 103,
"name": "type",
"type": "LogicalType"
},
{
"id": 104,
"name": "default_value",
"type": "Value"
},
{
"id": 105,
"name": "identifier",
"type": "Value",
"default": "Value()"
}
],
"pointer_type": "none"
},
{
"class": "ParquetEncryptionConfig",
"includes": [
"parquet_crypto.hpp"
],
"members": [
{
"id": 100,
"name": "footer_key",
"type": "string"
},
{
"id": 101,
"name": "column_keys",
"type": "unordered_map<string, string>"
}
],
"pointer_type": "shared_ptr"
},
{
"class": "ParquetOptionsSerialization",
"includes": [
"parquet_reader.hpp"
],
"members": [
{
"id": 100,
"name": "binary_as_string",
"type": "bool",
"property": "parquet_options.binary_as_string"
},
{
"id": 101,
"name": "file_row_number",
"type": "bool",
"property": "parquet_options.file_row_number"
},
{
"id": 102,
"name": "file_options",
"type": "MultiFileOptions"
},
{
"id": 103,
"name": "schema",
"type": "vector<ParquetColumnDefinition>",
"property": "parquet_options.schema"
},
{
"id": 104,
"name": "encryption_config",
"type": "shared_ptr<ParquetEncryptionConfig>",
"default": "nullptr",
"property": "parquet_options.encryption_config"
},
{
"id": 105,
"name": "debug_use_openssl",
"type": "bool",
"default": "true",
"property": "parquet_options.debug_use_openssl"
},
{
"id": 106,
"name": "explicit_cardinality",
"type": "idx_t",
"default": "0",
"property": "parquet_options.explicit_cardinality"
},
{
"id": 107,
"name": "can_have_nan",
"type": "bool",
"default": "false",
"property": "parquet_options.can_have_nan"
}
],
"pointer_type": "none"
},
{
"class": "FieldID",
"includes": [
"parquet_field_id.hpp"
],
"members": [
{
"id": 100,
"name": "set",
"type": "bool"
},
{
"id": 101,
"name": "field_id",
"type": "int32_t"
},
{
"id": 102,
"name": "child_field_ids",
"type": "ChildFieldIDs"
}
],
"pointer_type": "none"
},
{
"class": "ChildFieldIDs",
"includes": [
"parquet_field_id.hpp"
],
"members": [
{
"id": 100,
"name": "ids",
"type": "case_insensitive_map_t<FieldID>",
"serialize_property": "ids.operator*()",
"deserialize_property": "ids.operator*()"
}
],
"pointer_type": "none"
},
{
"class": "ShreddingType",
"includes": [
"parquet_shredding.hpp"
],
"members": [
{
"id": 100,
"name": "set",
"type": "bool"
},
{
"id": 101,
"name": "type",
"type": "LogicalType"
},
{
"id": 102,
"name": "children",
"type": "ChildShreddingTypes"
}
],
"pointer_type": "none"
},
{
"class": "ChildShreddingTypes",
"includes": [
"parquet_shredding.hpp"
],
"members": [
{
"id": 100,
"name": "types",
"type": "case_insensitive_map_t<ShreddingType>",
"serialize_property": "types.operator*()",
"deserialize_property": "types.operator*()"
}
],
"pointer_type": "none"
}
]

View File

@@ -0,0 +1,61 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_bss_decoder.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "parquet_types.h"
#include "resizable_buffer.hpp"
namespace duckdb {
/// Decoder for the Byte Stream Split encoding
class BssDecoder {
public:
/// Create a decoder object. buffer/buffer_len is the encoded data.
BssDecoder(data_ptr_t buffer, uint32_t buffer_len) : buffer_(buffer, buffer_len), value_offset_(0) {
}
public:
template <typename T>
void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) {
if (buffer_.len % sizeof(T) != 0) {
duckdb::stringstream error;
error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
<< ") should be a multiple of the type size (" << sizeof(T) << ")";
throw std::runtime_error(error.str());
}
uint32_t num_buffer_values = buffer_.len / sizeof(T);
buffer_.available((value_offset_ + batch_size) * sizeof(T));
for (uint32_t byte_offset = 0; byte_offset < sizeof(T); ++byte_offset) {
data_ptr_t input_bytes = buffer_.ptr + byte_offset * num_buffer_values + value_offset_;
for (uint32_t i = 0; i < batch_size; ++i) {
values_target_ptr[byte_offset + i * sizeof(T)] = *(input_bytes + i);
}
}
value_offset_ += batch_size;
}
template <typename T>
void Skip(uint32_t batch_size) {
if (buffer_.len % sizeof(T) != 0) {
duckdb::stringstream error;
error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
<< ") should be a multiple of the type size (" << sizeof(T) << ")";
throw std::runtime_error(error.str());
}
buffer_.available((value_offset_ + batch_size) * sizeof(T));
value_offset_ += batch_size;
}
private:
ByteBuffer buffer_;
uint32_t value_offset_;
};
} // namespace duckdb
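
A standalone sketch of what BYTE_STREAM_SPLIT does for float data: byte i of every value is written contiguously as stream i, and GetBatch above gathers the bytes back per value. Toy data, no DuckDB types; the real decoder additionally tracks a value offset into the page buffer.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

int main() {
    // Byte i of every value is stored contiguously (stream 0 first, then stream 1,
    // ...), which typically compresses better than interleaved float bytes.
    std::vector<float> values = {1.5f, -2.25f, 1000.0f, 0.125f};
    const size_t n = values.size();

    // Encode: scatter the bytes of each value into sizeof(float) streams.
    std::vector<uint8_t> encoded(n * sizeof(float));
    for (size_t i = 0; i < n; i++) {
        uint8_t bytes[sizeof(float)];
        std::memcpy(bytes, &values[i], sizeof(float));
        for (size_t b = 0; b < sizeof(float); b++) {
            encoded[b * n + i] = bytes[b];
        }
    }

    // Decode: gather byte b of value i from stream b (what GetBatch does).
    std::vector<float> decoded(n);
    for (size_t i = 0; i < n; i++) {
        uint8_t bytes[sizeof(float)];
        for (size_t b = 0; b < sizeof(float); b++) {
            bytes[b] = encoded[b * n + i];
        }
        std::memcpy(&decoded[i], bytes, sizeof(float));
    }
    assert(decoded == values);
    return 0;
}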

View File

@@ -0,0 +1,47 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_bss_encoder.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "decode_utils.hpp"
namespace duckdb {
class BssEncoder {
public:
explicit BssEncoder(const idx_t total_value_count_p, const idx_t bit_width_p)
: total_value_count(total_value_count_p), bit_width(bit_width_p), count(0) {
}
public:
void BeginWrite(Allocator &allocator) {
buffer = allocator.Allocate(total_value_count * bit_width + 1);
}
template <class T>
void WriteValue(const T &value) {
D_ASSERT(sizeof(T) == bit_width);
for (idx_t i = 0; i < sizeof(T); i++) {
buffer.get()[i * total_value_count + count] = reinterpret_cast<const_data_ptr_t>(&value)[i];
}
count++;
}
void FinishWrite(WriteStream &writer) {
writer.WriteData(buffer.get(), total_value_count * bit_width);
}
private:
const idx_t total_value_count;
const idx_t bit_width;
idx_t count;
AllocatedData buffer;
};
} // namespace duckdb

View File

@@ -0,0 +1,58 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_column_schema.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "parquet_types.h"
namespace duckdb {
using duckdb_parquet::FileMetaData;
struct ParquetOptions;
enum class ParquetColumnSchemaType { COLUMN, FILE_ROW_NUMBER, GEOMETRY, EXPRESSION, VARIANT };
enum class ParquetExtraTypeInfo {
NONE,
IMPALA_TIMESTAMP,
UNIT_NS,
UNIT_MS,
UNIT_MICROS,
DECIMAL_BYTE_ARRAY,
DECIMAL_INT32,
DECIMAL_INT64,
FLOAT16
};
struct ParquetColumnSchema {
ParquetColumnSchema() = default;
ParquetColumnSchema(idx_t max_define, idx_t max_repeat, idx_t schema_index, idx_t file_index,
ParquetColumnSchemaType schema_type = ParquetColumnSchemaType::COLUMN);
ParquetColumnSchema(string name, LogicalType type, idx_t max_define, idx_t max_repeat, idx_t schema_index,
idx_t column_index, ParquetColumnSchemaType schema_type = ParquetColumnSchemaType::COLUMN);
ParquetColumnSchema(ParquetColumnSchema parent, LogicalType result_type, ParquetColumnSchemaType schema_type);
ParquetColumnSchemaType schema_type;
string name;
LogicalType type;
idx_t max_define;
idx_t max_repeat;
idx_t schema_index;
idx_t column_index;
optional_idx parent_schema_index;
uint32_t type_length = 0;
uint32_t type_scale = 0;
duckdb_parquet::Type::type parquet_type = duckdb_parquet::Type::INT32;
ParquetExtraTypeInfo type_info = ParquetExtraTypeInfo::NONE;
vector<ParquetColumnSchema> children;
unique_ptr<BaseStatistics> Stats(const FileMetaData &file_meta_data, const ParquetOptions &parquet_options,
idx_t row_group_idx_p, const vector<duckdb_parquet::ColumnChunk> &columns) const;
};
} // namespace duckdb

View File

@@ -0,0 +1,89 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_crypto.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "parquet_types.h"
#include "duckdb/common/encryption_state.hpp"
#include "duckdb/storage/object_cache.hpp"
namespace duckdb {
using duckdb_apache::thrift::TBase;
using duckdb_apache::thrift::protocol::TProtocol;
class BufferedFileWriter;
class ParquetKeys : public ObjectCacheEntry {
public:
static ParquetKeys &Get(ClientContext &context);
public:
void AddKey(const string &key_name, const string &key);
bool HasKey(const string &key_name) const;
const string &GetKey(const string &key_name) const;
public:
static string ObjectType();
string GetObjectType() override;
private:
unordered_map<string, string> keys;
};
class ParquetEncryptionConfig {
public:
explicit ParquetEncryptionConfig();
ParquetEncryptionConfig(ClientContext &context, const Value &arg);
ParquetEncryptionConfig(string footer_key);
public:
static shared_ptr<ParquetEncryptionConfig> Create(ClientContext &context, const Value &arg);
const string &GetFooterKey() const;
public:
void Serialize(Serializer &serializer) const;
static shared_ptr<ParquetEncryptionConfig> Deserialize(Deserializer &deserializer);
private:
//! The encryption key used for the footer
string footer_key;
//! Mapping from column name to key name
unordered_map<string, string> column_keys;
};
class ParquetCrypto {
public:
//! Encrypted modules
static constexpr idx_t LENGTH_BYTES = 4;
static constexpr idx_t NONCE_BYTES = 12;
static constexpr idx_t TAG_BYTES = 16;
//! Block size we encrypt/decrypt
static constexpr idx_t CRYPTO_BLOCK_SIZE = 4096;
static constexpr idx_t BLOCK_SIZE = 16;
public:
//! Decrypt and read a Thrift object from the transport protocol
static uint32_t Read(TBase &object, TProtocol &iprot, const string &key, const EncryptionUtil &encryption_util_p);
//! Encrypt and write a Thrift object to the transport protocol
static uint32_t Write(const TBase &object, TProtocol &oprot, const string &key,
const EncryptionUtil &encryption_util_p);
//! Decrypt and read a buffer
static uint32_t ReadData(TProtocol &iprot, const data_ptr_t buffer, const uint32_t buffer_size, const string &key,
const EncryptionUtil &encryption_util_p);
//! Encrypt and write a buffer to a file
static uint32_t WriteData(TProtocol &oprot, const const_data_ptr_t buffer, const uint32_t buffer_size,
const string &key, const EncryptionUtil &encryption_util_p);
public:
static void AddKey(ClientContext &context, const FunctionParameters &parameters);
static bool ValidKey(const std::string &key);
};
} // namespace duckdb
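
Assuming the usual Parquet modular-encryption GCM framing of length prefix, nonce, ciphertext and tag (the exact framing produced by ParquetCrypto::WriteData is not shown in this header), the constants above imply the on-disk size of an encrypted module as follows; the helper name is illustrative.

#include <cassert>
#include <cstdint>

// Sizes taken from the ParquetCrypto constants above.
static constexpr uint64_t LENGTH_BYTES = 4;
static constexpr uint64_t NONCE_BYTES = 12;
static constexpr uint64_t TAG_BYTES = 16;

// Assumed layout: length prefix | nonce | ciphertext | tag, where GCM ciphertext
// has the same size as the plaintext.
static uint64_t EncryptedModuleSize(uint64_t plaintext_size) {
    return LENGTH_BYTES + NONCE_BYTES + plaintext_size + TAG_BYTES;
}

int main() {
    assert(EncryptedModuleSize(1000) == 1032);
    return 0;
}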

View File

@@ -0,0 +1,163 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_dbp_decoder.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "decode_utils.hpp"
namespace duckdb {
class DbpDecoder {
public:
DbpDecoder(const data_ptr_t buffer, const uint32_t buffer_len)
: buffer_(buffer, buffer_len),
//<block size in values> <number of miniblocks in a block> <total value count> <first value>
block_size_in_values(ParquetDecodeUtils::VarintDecode<uint64_t>(buffer_)),
number_of_miniblocks_per_block(DecodeNumberOfMiniblocksPerBlock(buffer_)),
number_of_values_in_a_miniblock(block_size_in_values / number_of_miniblocks_per_block),
total_value_count(ParquetDecodeUtils::VarintDecode<uint64_t>(buffer_)),
previous_value(ParquetDecodeUtils::ZigzagToInt(ParquetDecodeUtils::VarintDecode<uint64_t>(buffer_))),
// init state to something sane
is_first_value(true), read_values(0), min_delta(NumericLimits<int64_t>::Maximum()),
miniblock_index(number_of_miniblocks_per_block - 1), list_of_bitwidths_of_miniblocks(nullptr),
miniblock_offset(number_of_values_in_a_miniblock),
unpacked_data_offset(BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) {
if (!(block_size_in_values % number_of_miniblocks_per_block == 0 &&
number_of_values_in_a_miniblock % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0)) {
throw InvalidInputException("Parquet file has invalid block sizes for DELTA_BINARY_PACKED");
}
}
ByteBuffer BufferPtr() const {
return buffer_;
}
uint64_t TotalValues() const {
return total_value_count;
}
template <typename T>
void GetBatch(const data_ptr_t target_values_ptr, const idx_t batch_size) {
if (read_values + batch_size > total_value_count) {
throw std::runtime_error("DBP decode did not find enough values");
}
read_values += batch_size;
GetBatchInternal<T>(target_values_ptr, batch_size);
}
template <class T>
void Skip(idx_t skip_count) {
if (read_values + skip_count > total_value_count) {
throw std::runtime_error("DBP decode did not find enough values");
}
read_values += skip_count;
GetBatchInternal<T, true>(nullptr, skip_count);
}
void Finalize() {
if (miniblock_offset == number_of_values_in_a_miniblock) {
return;
}
auto data = make_unsafe_uniq_array<int64_t>(number_of_values_in_a_miniblock);
GetBatchInternal<int64_t>(data_ptr_cast(data.get()), number_of_values_in_a_miniblock - miniblock_offset);
}
private:
static idx_t DecodeNumberOfMiniblocksPerBlock(ByteBuffer &buffer) {
auto res = ParquetDecodeUtils::VarintDecode<uint64_t>(buffer);
if (res == 0) {
throw InvalidInputException(
"Parquet file has invalid number of miniblocks per block for DELTA_BINARY_PACKED");
}
return res;
}
template <typename T, bool SKIP_READ = false>
void GetBatchInternal(const data_ptr_t target_values_ptr, const idx_t batch_size) {
if (batch_size == 0) {
return;
}
D_ASSERT(target_values_ptr || SKIP_READ);
T *target_values = nullptr;
if (!SKIP_READ) {
target_values = reinterpret_cast<T *>(target_values_ptr);
}
idx_t target_values_offset = 0;
if (is_first_value) {
if (!SKIP_READ) {
target_values[0] = static_cast<T>(previous_value);
}
target_values_offset++;
is_first_value = false;
}
while (target_values_offset < batch_size) {
// Copy over any remaining data
const idx_t next = MinValue(batch_size - target_values_offset,
BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE - unpacked_data_offset);
if (next != 0) {
for (idx_t i = 0; i < next; i++) {
const auto &unpacked_value = unpacked_data[unpacked_data_offset + i];
auto current_value = static_cast<T>(static_cast<uint64_t>(previous_value) +
static_cast<uint64_t>(min_delta) + unpacked_value);
if (!SKIP_READ) {
target_values[target_values_offset + i] = current_value;
}
previous_value = static_cast<int64_t>(current_value);
}
target_values_offset += next;
unpacked_data_offset += next;
continue;
}
// Move to next miniblock / block
D_ASSERT(unpacked_data_offset == BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE);
D_ASSERT(miniblock_index < number_of_miniblocks_per_block);
D_ASSERT(miniblock_offset <= number_of_values_in_a_miniblock);
if (miniblock_offset == number_of_values_in_a_miniblock) {
miniblock_offset = 0;
if (++miniblock_index == number_of_miniblocks_per_block) {
// <min delta> <list of bitwidths of miniblocks> <miniblocks>
min_delta = ParquetDecodeUtils::ZigzagToInt(ParquetDecodeUtils::VarintDecode<uint64_t>(buffer_));
buffer_.available(number_of_miniblocks_per_block);
list_of_bitwidths_of_miniblocks = buffer_.ptr;
buffer_.unsafe_inc(number_of_miniblocks_per_block);
miniblock_index = 0;
}
}
// Unpack from current miniblock
ParquetDecodeUtils::BitUnpackAligned(buffer_, unpacked_data,
BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE,
list_of_bitwidths_of_miniblocks[miniblock_index]);
unpacked_data_offset = 0;
miniblock_offset += BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE;
}
}
private:
ByteBuffer buffer_;
const idx_t block_size_in_values;
const idx_t number_of_miniblocks_per_block;
const idx_t number_of_values_in_a_miniblock;
const idx_t total_value_count;
int64_t previous_value;
bool is_first_value;
idx_t read_values;
//! Block stuff
int64_t min_delta;
idx_t miniblock_index;
bitpacking_width_t *list_of_bitwidths_of_miniblocks;
idx_t miniblock_offset;
uint64_t unpacked_data[BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE];
idx_t unpacked_data_offset;
};
} // namespace duckdb

View File

@@ -0,0 +1,229 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_dbp_encoder.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "decode_utils.hpp"
namespace duckdb {
class DbpEncoder {
private:
static constexpr uint64_t BLOCK_SIZE_IN_VALUES = 2048;
static constexpr uint64_t NUMBER_OF_MINIBLOCKS_IN_A_BLOCK = 8;
static constexpr uint64_t NUMBER_OF_VALUES_IN_A_MINIBLOCK = BLOCK_SIZE_IN_VALUES / NUMBER_OF_MINIBLOCKS_IN_A_BLOCK;
public:
explicit DbpEncoder(const idx_t total_value_count_p) : total_value_count(total_value_count_p), count(0) {
}
public:
template <class T>
void BeginWrite(WriteStream &writer, const T &first_value) {
throw InternalException("DbpEncoder should only be used with integers");
}
template <class T>
void WriteValue(WriteStream &writer, const T &value) {
throw InternalException("DbpEncoder should only be used with integers");
}
void FinishWrite(WriteStream &writer) {
if (count + block_count != total_value_count) {
throw InternalException("value count mismatch when writing DELTA_BINARY_PACKED");
}
if (block_count != 0) {
WriteBlock(writer);
}
}
private:
void BeginWriteInternal(WriteStream &writer, const int64_t &first_value) {
// <block size in values> <number of miniblocks in a block> <total value count> <first value>
// the block size is a multiple of 128; it is stored as a ULEB128 int
ParquetDecodeUtils::VarintEncode(BLOCK_SIZE_IN_VALUES, writer);
// the miniblock count per block is a divisor of the block size such that their quotient,
// the number of values in a miniblock, is a multiple of 32
static_assert(BLOCK_SIZE_IN_VALUES % NUMBER_OF_MINIBLOCKS_IN_A_BLOCK == 0 &&
NUMBER_OF_VALUES_IN_A_MINIBLOCK % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0,
"invalid block sizes for DELTA_BINARY_PACKED");
// it is stored as a ULEB128 int
ParquetDecodeUtils::VarintEncode(NUMBER_OF_MINIBLOCKS_IN_A_BLOCK, writer);
// the total value count is stored as a ULEB128 int
ParquetDecodeUtils::VarintEncode(total_value_count, writer);
// the first value is stored as a zigzag ULEB128 int
ParquetDecodeUtils::VarintEncode(ParquetDecodeUtils::IntToZigzag(first_value), writer);
// initialize
if (total_value_count != 0) {
count++;
}
previous_value = first_value;
min_delta = NumericLimits<int64_t>::Maximum();
block_count = 0;
}
void WriteValueInternal(WriteStream &writer, const int64_t &value) {
// 1. Compute the differences between consecutive elements. For the first element in the block,
// use the last element in the previous block or, in the case of the first block,
// use the first value of the whole sequence, stored in the header.
// Subtractions in steps 1) and 2) may incur signed arithmetic overflow,
// and so will the corresponding additions when decoding.
// Overflow should be allowed and handled as wrapping around in 2s complement notation
// so that the original values are correctly restituted.
// This may require explicit care in some programming languages
// (for example by doing all arithmetic in the unsigned domain).
const auto delta = static_cast<int64_t>(static_cast<uint64_t>(value) - static_cast<uint64_t>(previous_value));
previous_value = value;
// Compute the frame of reference (the minimum of the deltas in the block).
min_delta = MinValue(min_delta, delta);
// append. if block is full, write it out
data[block_count++] = delta;
if (block_count == BLOCK_SIZE_IN_VALUES) {
WriteBlock(writer);
}
}
void WriteBlock(WriteStream &writer) {
D_ASSERT(count + block_count == total_value_count || block_count == BLOCK_SIZE_IN_VALUES);
const auto number_of_miniblocks =
(block_count + NUMBER_OF_VALUES_IN_A_MINIBLOCK - 1) / NUMBER_OF_VALUES_IN_A_MINIBLOCK;
for (idx_t miniblock_idx = 0; miniblock_idx < number_of_miniblocks; miniblock_idx++) {
for (idx_t i = 0; i < NUMBER_OF_VALUES_IN_A_MINIBLOCK; i++) {
const idx_t index = miniblock_idx * NUMBER_OF_VALUES_IN_A_MINIBLOCK + i;
auto &value = data[index];
if (index < block_count) {
// 2. Compute the frame of reference (the minimum of the deltas in the block).
// Subtract this min delta from all deltas in the block.
// This guarantees that all values are non-negative.
D_ASSERT(min_delta <= value);
value = static_cast<int64_t>(static_cast<uint64_t>(value) - static_cast<uint64_t>(min_delta));
} else {
// If there are not enough values to fill the last miniblock, we pad the miniblock
// so that its length is always the number of values in a full miniblock multiplied by the bit
// width. The values of the padding bits should be zero, but readers must accept paddings consisting
// of arbitrary bits as well.
value = 0;
}
}
}
for (idx_t miniblock_idx = 0; miniblock_idx < NUMBER_OF_MINIBLOCKS_IN_A_BLOCK; miniblock_idx++) {
auto &width = list_of_bitwidths_of_miniblocks[miniblock_idx];
if (miniblock_idx < number_of_miniblocks) {
const auto src = &data[miniblock_idx * NUMBER_OF_VALUES_IN_A_MINIBLOCK];
width = BitpackingPrimitives::MinimumBitWidth(reinterpret_cast<uint64_t *>(src),
NUMBER_OF_VALUES_IN_A_MINIBLOCK);
D_ASSERT(width <= sizeof(int64_t) * 8);
} else {
// If, in the last block, less than <number of miniblocks in a block> miniblocks are needed to store the
// values, the bytes storing the bit widths of the unneeded miniblocks are still present, their value
// should be zero, but readers must accept arbitrary values as well. There are no additional padding
// bytes for the miniblock bodies though, as if their bit widths were 0 (regardless of the actual byte
// values). The reader knows when to stop reading by keeping track of the number of values read.
width = 0;
}
}
// 3. Encode the frame of reference (min delta) as a zigzag ULEB128 int
// followed by the bit widths of the miniblocks
// and the delta values (minus the min delta) bit-packed per miniblock.
// <min delta> <list of bitwidths of miniblocks> <miniblocks>
// the min delta is a zigzag ULEB128 int (we compute a minimum as we need positive integers for bit packing)
ParquetDecodeUtils::VarintEncode(ParquetDecodeUtils::IntToZigzag(min_delta), writer);
// the bitwidth of each block is stored as a byte
writer.WriteData(list_of_bitwidths_of_miniblocks, NUMBER_OF_MINIBLOCKS_IN_A_BLOCK);
// each miniblock is a list of bit packed ints according to the bit width stored at the beginning of the block
for (idx_t miniblock_idx = 0; miniblock_idx < number_of_miniblocks; miniblock_idx++) {
const auto src = &data[miniblock_idx * NUMBER_OF_VALUES_IN_A_MINIBLOCK];
const auto &width = list_of_bitwidths_of_miniblocks[miniblock_idx];
memset(data_packed, 0, sizeof(data_packed));
ParquetDecodeUtils::BitPackAligned(reinterpret_cast<uint64_t *>(src), data_packed,
NUMBER_OF_VALUES_IN_A_MINIBLOCK, width);
const auto write_size = NUMBER_OF_VALUES_IN_A_MINIBLOCK * width / 8;
#ifdef DEBUG
// immediately verify that unpacking yields the input data
int64_t verification_data[NUMBER_OF_VALUES_IN_A_MINIBLOCK];
ByteBuffer byte_buffer(data_ptr_cast(data_packed), write_size);
bitpacking_width_t bitpack_pos = 0;
ParquetDecodeUtils::BitUnpack(byte_buffer, bitpack_pos, reinterpret_cast<uint64_t *>(verification_data),
NUMBER_OF_VALUES_IN_A_MINIBLOCK, width);
for (idx_t i = 0; i < NUMBER_OF_VALUES_IN_A_MINIBLOCK; i++) {
D_ASSERT(src[i] == verification_data[i]);
}
#endif
writer.WriteData(data_packed, write_size);
}
count += block_count;
min_delta = NumericLimits<int64_t>::Maximum();
block_count = 0;
}
private:
//! Overall fields
const idx_t total_value_count;
idx_t count;
int64_t previous_value;
//! Block-specific fields
int64_t min_delta;
int64_t data[BLOCK_SIZE_IN_VALUES];
idx_t block_count;
//! Bitpacking fields
bitpacking_width_t list_of_bitwidths_of_miniblocks[NUMBER_OF_MINIBLOCKS_IN_A_BLOCK];
data_t data_packed[NUMBER_OF_VALUES_IN_A_MINIBLOCK * sizeof(int64_t)];
};
template <>
inline void DbpEncoder::BeginWrite(WriteStream &writer, const int32_t &first_value) {
BeginWriteInternal(writer, first_value);
}
template <>
inline void DbpEncoder::BeginWrite(WriteStream &writer, const int64_t &first_value) {
BeginWriteInternal(writer, first_value);
}
template <>
inline void DbpEncoder::BeginWrite(WriteStream &writer, const uint32_t &first_value) {
BeginWriteInternal(writer, first_value);
}
template <>
inline void DbpEncoder::BeginWrite(WriteStream &writer, const uint64_t &first_value) {
BeginWriteInternal(writer, first_value);
}
template <>
inline void DbpEncoder::WriteValue(WriteStream &writer, const int32_t &first_value) {
WriteValueInternal(writer, first_value);
}
template <>
inline void DbpEncoder::WriteValue(WriteStream &writer, const int64_t &first_value) {
WriteValueInternal(writer, first_value);
}
template <>
inline void DbpEncoder::WriteValue(WriteStream &writer, const uint32_t &first_value) {
WriteValueInternal(writer, first_value);
}
template <>
inline void DbpEncoder::WriteValue(WriteStream &writer, const uint64_t &first_value) {
WriteValueInternal(writer, first_value);
}
} // namespace duckdb
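
A worked sketch of steps 1-3 from the comments above, using a toy block instead of the real 2048-value blocks and leaving out the varint header and the actual bit-packing; it also shows how the decoder reverses the transform.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    std::vector<int64_t> values = {7, 5, 3, 1, 2, 3, 4, 5};

    // 1. Deltas against the previous value; the first value goes into the header.
    std::vector<int64_t> deltas;
    for (size_t i = 1; i < values.size(); i++) {
        deltas.push_back(values[i] - values[i - 1]); // {-2, -2, -2, 1, 1, 1, 1}
    }

    // 2. Frame of reference: subtract the minimum delta so every delta is >= 0.
    int64_t min_delta = *std::min_element(deltas.begin(), deltas.end()); // -2
    std::vector<uint64_t> packed_input;
    for (auto d : deltas) {
        packed_input.push_back(static_cast<uint64_t>(d - min_delta)); // {0,0,0,3,3,3,3}
    }

    // 3. The miniblock then needs only 2 bits per value (max adjusted delta is 3).
    //    The decoder reverses this: value = previous + min_delta + unpacked.
    int64_t previous = values[0];
    for (auto u : packed_input) {
        previous = previous + min_delta + static_cast<int64_t>(u);
    }
    assert(previous == values.back());
    return 0;
}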

View File

@@ -0,0 +1,55 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_decimal_utils.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
class ParquetDecimalUtils {
public:
template <class PHYSICAL_TYPE>
static PHYSICAL_TYPE ReadDecimalValue(const_data_ptr_t pointer, idx_t size, const ParquetColumnSchema &) {
PHYSICAL_TYPE res = 0;
auto res_ptr = (uint8_t *)&res;
bool positive = (*pointer & 0x80) == 0;
// numbers are stored as two's complement so some muckery is required
for (idx_t i = 0; i < MinValue<idx_t>(size, sizeof(PHYSICAL_TYPE)); i++) {
auto byte = *(pointer + (size - i - 1));
res_ptr[i] = positive ? byte : byte ^ 0xFF;
}
// Verify that there are only 0s here
if (size > sizeof(PHYSICAL_TYPE)) {
for (idx_t i = sizeof(PHYSICAL_TYPE); i < size; i++) {
auto byte = *(pointer + (size - i - 1));
if (!positive) {
byte ^= 0xFF;
}
if (byte != 0) {
throw InvalidInputException("Invalid decimal encoding in Parquet file");
}
}
}
if (!positive) {
res += 1;
return -res;
}
return res;
}
static unique_ptr<ColumnReader> CreateReader(ParquetReader &reader, const ParquetColumnSchema &schema);
};
template <>
double ParquetDecimalUtils::ReadDecimalValue(const_data_ptr_t pointer, idx_t size, const ParquetColumnSchema &schema);
} // namespace duckdb
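
ReadDecimalValue above converts big-endian two's complement bytes into a native integer. A simplified stand-in without the truncation checks (the decimal scale is applied elsewhere); the function name and test values are illustrative.

#include <cassert>
#include <cstdint>
#include <vector>

// Parquet DECIMAL values backed by BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY store the
// unscaled integer as big-endian two's complement.
static int64_t DecodeBigEndianDecimal(const std::vector<uint8_t> &bytes) {
    // Sign-extend from the most significant byte, then accumulate unsigned.
    uint64_t result = (bytes[0] & 0x80) ? ~uint64_t(0) : 0;
    for (uint8_t b : bytes) {
        result = (result << 8) | b;
    }
    return static_cast<int64_t>(result);
}

int main() {
    // DECIMAL(4,2) value -1.23 is stored as the unscaled integer -123.
    assert(DecodeBigEndianDecimal({0xFF, 0x85}) == -123);
    assert(DecodeBigEndianDecimal({0x01, 0x00}) == 256);
    return 0;
}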

View File

@@ -0,0 +1,69 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_dlba_encoder.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "parquet_dbp_encoder.hpp"
#include "duckdb/common/serializer/memory_stream.hpp"
namespace duckdb {
class DlbaEncoder {
public:
DlbaEncoder(const idx_t total_value_count_p, const idx_t total_string_size_p)
: dbp_encoder(total_value_count_p), total_string_size(total_string_size_p) {
}
public:
template <class T>
void BeginWrite(Allocator &, WriteStream &, const T &) {
throw InternalException("DlbaEncoder should only be used with strings");
}
template <class T>
void WriteValue(WriteStream &, const T &) {
throw InternalException("DlbaEncoder should only be used with strings");
}
void FinishWrite(WriteStream &writer) {
dbp_encoder.FinishWrite(writer);
writer.WriteData(buffer.get(), stream->GetPosition());
}
template <class SRC>
static idx_t GetStringSize(const SRC &) {
return 0;
}
private:
DbpEncoder dbp_encoder;
const idx_t total_string_size;
AllocatedData buffer;
unsafe_unique_ptr<MemoryStream> stream;
};
template <>
inline void DlbaEncoder::BeginWrite(Allocator &allocator, WriteStream &writer, const string_t &first_value) {
buffer = allocator.Allocate(total_string_size + 1);
stream = make_unsafe_uniq<MemoryStream>(buffer.get(), buffer.GetSize());
dbp_encoder.BeginWrite(writer, UnsafeNumericCast<int64_t>(first_value.GetSize()));
stream->WriteData(const_data_ptr_cast(first_value.GetData()), first_value.GetSize());
}
template <>
inline void DlbaEncoder::WriteValue(WriteStream &writer, const string_t &value) {
dbp_encoder.WriteValue(writer, UnsafeNumericCast<int64_t>(value.GetSize()));
stream->WriteData(const_data_ptr_cast(value.GetData()), value.GetSize());
}
template <>
inline idx_t DlbaEncoder::GetStringSize(const string_t &src_value) {
return src_value.GetSize();
}
} // namespace duckdb
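
DlbaEncoder implements DELTA_LENGTH_BYTE_ARRAY: all string lengths are written first (DELTA_BINARY_PACKED in the real format), followed by the concatenated string bytes. A sketch of the layout with a plain length vector standing in for the DBP-encoded lengths:

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> values = {"duck", "db", "parquet"};

    // Encode: lengths first, then one concatenated byte buffer.
    std::vector<uint32_t> lengths;
    std::string data;
    for (const auto &v : values) {
        lengths.push_back(static_cast<uint32_t>(v.size()));
        data += v;
    }
    assert(data == "duckdbparquet");

    // Decode: slice the concatenated buffer back using running offsets.
    std::vector<std::string> decoded;
    size_t offset = 0;
    for (auto len : lengths) {
        decoded.push_back(data.substr(offset, len));
        offset += len;
    }
    assert(decoded == values);
    return 0;
}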

View File

@@ -0,0 +1,22 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_extension.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
namespace duckdb {
class ParquetExtension : public Extension {
public:
void Load(ExtensionLoader &loader) override;
std::string Name() override;
std::string Version() const override;
};
} // namespace duckdb

View File

@@ -0,0 +1,39 @@
#pragma once
#include "duckdb/common/serializer/buffered_file_writer.hpp"
#include "duckdb/common/case_insensitive_map.hpp"
namespace duckdb {
struct FieldID;
struct ChildFieldIDs {
ChildFieldIDs();
ChildFieldIDs Copy() const;
unique_ptr<case_insensitive_map_t<FieldID>> ids;
void Serialize(Serializer &serializer) const;
static ChildFieldIDs Deserialize(Deserializer &source);
};
struct FieldID {
public:
static constexpr const auto DUCKDB_FIELD_ID = "__duckdb_field_id";
FieldID();
explicit FieldID(int32_t field_id);
FieldID Copy() const;
bool set;
int32_t field_id;
ChildFieldIDs child_field_ids;
void Serialize(Serializer &serializer) const;
static FieldID Deserialize(Deserializer &source);
public:
static void GenerateFieldIDs(ChildFieldIDs &field_ids, idx_t &field_id, const vector<string> &names,
const vector<LogicalType> &sql_types);
static void GetFieldIDs(const Value &field_ids_value, ChildFieldIDs &field_ids,
unordered_set<uint32_t> &unique_field_ids,
const case_insensitive_map_t<LogicalType> &name_to_type_map);
};
} // namespace duckdb

View File

@@ -0,0 +1,50 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_file_metadata_cache.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "duckdb/storage/object_cache.hpp"
#include "geo_parquet.hpp"
#include "parquet_types.h"
namespace duckdb {
struct CachingFileHandle;
enum class ParquetCacheValidity { VALID, INVALID, UNKNOWN };
class ParquetFileMetadataCache : public ObjectCacheEntry {
public:
ParquetFileMetadataCache(unique_ptr<duckdb_parquet::FileMetaData> file_metadata, CachingFileHandle &handle,
unique_ptr<GeoParquetFileMetadata> geo_metadata, idx_t footer_size);
~ParquetFileMetadataCache() override = default;
//! Parquet file metadata
unique_ptr<const duckdb_parquet::FileMetaData> metadata;
//! GeoParquet metadata
unique_ptr<GeoParquetFileMetadata> geo_metadata;
//! Parquet footer size
idx_t footer_size;
public:
static string ObjectType();
string GetObjectType() override;
bool IsValid(CachingFileHandle &new_handle) const;
//! Check if a cache entry is valid based ONLY on the OpenFileInfo (without doing any file system calls)
//! If the OpenFileInfo does not have enough information this can return UNKNOWN
ParquetCacheValidity IsValid(const OpenFileInfo &info) const;
private:
bool validate;
timestamp_t last_modified;
string version_tag;
};
} // namespace duckdb

View File

@@ -0,0 +1,17 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_float16.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
namespace duckdb {
float Float16ToFloat32(const uint16_t &float16_value);
} // namespace duckdb

View File

@@ -0,0 +1,41 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_metadata.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "parquet_reader.hpp"
#include "duckdb/function/function_set.hpp"
namespace duckdb {
class ParquetMetaDataFunction : public TableFunction {
public:
ParquetMetaDataFunction();
};
class ParquetSchemaFunction : public TableFunction {
public:
ParquetSchemaFunction();
};
class ParquetKeyValueMetadataFunction : public TableFunction {
public:
ParquetKeyValueMetadataFunction();
};
class ParquetFileMetadataFunction : public TableFunction {
public:
ParquetFileMetadataFunction();
};
class ParquetBloomProbeFunction : public TableFunction {
public:
ParquetBloomProbeFunction();
};
} // namespace duckdb

View File

@@ -0,0 +1,66 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_multi_file_info.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb/common/multi_file/multi_file_function.hpp"
#include "parquet_reader.hpp"
namespace duckdb {
class ParquetFileReaderOptions : public BaseFileReaderOptions {
public:
explicit ParquetFileReaderOptions(ParquetOptions options_p) : options(std::move(options_p)) {
}
explicit ParquetFileReaderOptions(ClientContext &context) : options(context) {
}
ParquetOptions options;
};
struct ParquetMultiFileInfo : MultiFileReaderInterface {
static unique_ptr<MultiFileReaderInterface> CreateInterface(ClientContext &context);
unique_ptr<BaseFileReaderOptions> InitializeOptions(ClientContext &context,
optional_ptr<TableFunctionInfo> info) override;
bool ParseCopyOption(ClientContext &context, const string &key, const vector<Value> &values,
BaseFileReaderOptions &options, vector<string> &expected_names,
vector<LogicalType> &expected_types) override;
bool ParseOption(ClientContext &context, const string &key, const Value &val, MultiFileOptions &file_options,
BaseFileReaderOptions &options) override;
void BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
MultiFileBindData &bind_data) override;
unique_ptr<TableFunctionData> InitializeBindData(MultiFileBindData &multi_file_data,
unique_ptr<BaseFileReaderOptions> options) override;
void FinalizeBindData(MultiFileBindData &multi_file_data) override;
void GetBindInfo(const TableFunctionData &bind_data, BindInfo &info) override;
optional_idx MaxThreads(const MultiFileBindData &bind_data, const MultiFileGlobalState &global_state,
FileExpandResult expand_result) override;
unique_ptr<GlobalTableFunctionState> InitializeGlobalState(ClientContext &context, MultiFileBindData &bind_data,
MultiFileGlobalState &global_state) override;
unique_ptr<LocalTableFunctionState> InitializeLocalState(ExecutionContext &, GlobalTableFunctionState &) override;
shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
BaseUnionData &union_data, const MultiFileBindData &bind_data_p) override;
shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
const OpenFileInfo &file, idx_t file_idx,
const MultiFileBindData &bind_data) override;
shared_ptr<BaseFileReader> CreateReader(ClientContext &context, const OpenFileInfo &file,
BaseFileReaderOptions &options,
const MultiFileOptions &file_options) override;
unique_ptr<NodeStatistics> GetCardinality(const MultiFileBindData &bind_data, idx_t file_count) override;
void GetVirtualColumns(ClientContext &context, MultiFileBindData &bind_data, virtual_column_map_t &result) override;
unique_ptr<MultiFileReaderInterface> Copy() override;
FileGlobInput GetGlobInput() override;
};
class ParquetScanFunction {
public:
static TableFunctionSet GetFunctionSet();
};
} // namespace duckdb

View File

@@ -0,0 +1,239 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "duckdb/storage/caching_file_system.hpp"
#include "duckdb/common/common.hpp"
#include "duckdb/common/encryption_state.hpp"
#include "duckdb/common/exception.hpp"
#include "duckdb/common/multi_file/base_file_reader.hpp"
#include "duckdb/common/multi_file/multi_file_options.hpp"
#include "duckdb/common/string_util.hpp"
#include "duckdb/common/types/data_chunk.hpp"
#include "column_reader.hpp"
#include "parquet_file_metadata_cache.hpp"
#include "parquet_rle_bp_decoder.hpp"
#include "parquet_types.h"
#include "resizable_buffer.hpp"
#include "duckdb/execution/adaptive_filter.hpp"
#include <exception>
namespace duckdb_parquet {
namespace format {
class FileMetaData;
}
} // namespace duckdb_parquet
namespace duckdb {
class Allocator;
class ClientContext;
class BaseStatistics;
class TableFilterSet;
class ParquetEncryptionConfig;
class ParquetReader;
struct ParquetReaderPrefetchConfig {
// Percentage of data in a row group span that should be scanned for enabling whole group prefetch
static constexpr double WHOLE_GROUP_PREFETCH_MINIMUM_SCAN = 0.95;
};
struct ParquetScanFilter {
ParquetScanFilter(ClientContext &context, idx_t filter_idx, TableFilter &filter);
~ParquetScanFilter();
ParquetScanFilter(ParquetScanFilter &&) = default;
idx_t filter_idx;
TableFilter &filter;
unique_ptr<TableFilterState> filter_state;
};
struct ParquetReaderScanState {
vector<idx_t> group_idx_list;
int64_t current_group;
idx_t offset_in_group;
idx_t group_offset;
unique_ptr<CachingFileHandle> file_handle;
unique_ptr<ColumnReader> root_reader;
duckdb_base_std::unique_ptr<duckdb_apache::thrift::protocol::TProtocol> thrift_file_proto;
bool finished;
SelectionVector sel;
ResizeableBuffer define_buf;
ResizeableBuffer repeat_buf;
bool prefetch_mode = false;
bool current_group_prefetched = false;
//! Adaptive filter
unique_ptr<AdaptiveFilter> adaptive_filter;
//! Table filter list
vector<ParquetScanFilter> scan_filters;
//! (optional) pointer to the PhysicalOperator for logging
optional_ptr<const PhysicalOperator> op;
};
struct ParquetColumnDefinition {
public:
static ParquetColumnDefinition FromSchemaValue(ClientContext &context, const Value &column_value);
public:
// DEPRECATED, use 'identifier' instead
int32_t field_id;
string name;
LogicalType type;
Value default_value;
Value identifier;
public:
void Serialize(Serializer &serializer) const;
static ParquetColumnDefinition Deserialize(Deserializer &deserializer);
};
struct ParquetOptions {
explicit ParquetOptions() {
}
explicit ParquetOptions(ClientContext &context);
bool binary_as_string = false;
bool variant_legacy_encoding = false;
bool file_row_number = false;
shared_ptr<ParquetEncryptionConfig> encryption_config;
bool debug_use_openssl = true;
vector<ParquetColumnDefinition> schema;
idx_t explicit_cardinality = 0;
bool can_have_nan = false; // if floats or doubles can contain NaN values
};
struct ParquetOptionsSerialization {
ParquetOptionsSerialization() = default;
ParquetOptionsSerialization(ParquetOptions parquet_options_p, MultiFileOptions file_options_p)
: parquet_options(std::move(parquet_options_p)), file_options(std::move(file_options_p)) {
}
ParquetOptions parquet_options;
MultiFileOptions file_options;
public:
void Serialize(Serializer &serializer) const;
static ParquetOptionsSerialization Deserialize(Deserializer &deserializer);
};
struct ParquetUnionData : public BaseUnionData {
explicit ParquetUnionData(OpenFileInfo file_p) : BaseUnionData(std::move(file_p)) {
}
~ParquetUnionData() override;
unique_ptr<BaseStatistics> GetStatistics(ClientContext &context, const string &name) override;
ParquetOptions options;
shared_ptr<ParquetFileMetadataCache> metadata;
unique_ptr<ParquetColumnSchema> root_schema;
};
class ParquetReader : public BaseFileReader {
public:
ParquetReader(ClientContext &context, OpenFileInfo file, ParquetOptions parquet_options,
shared_ptr<ParquetFileMetadataCache> metadata = nullptr);
~ParquetReader() override;
CachingFileSystem fs;
Allocator &allocator;
shared_ptr<ParquetFileMetadataCache> metadata;
ParquetOptions parquet_options;
unique_ptr<ParquetColumnSchema> root_schema;
shared_ptr<EncryptionUtil> encryption_util;
//! How many rows have been read from this file
atomic<idx_t> rows_read;
public:
string GetReaderType() const override {
return "Parquet";
}
shared_ptr<BaseUnionData> GetUnionData(idx_t file_idx) override;
unique_ptr<BaseStatistics> GetStatistics(ClientContext &context, const string &name) override;
bool TryInitializeScan(ClientContext &context, GlobalTableFunctionState &gstate,
LocalTableFunctionState &lstate) override;
void Scan(ClientContext &context, GlobalTableFunctionState &global_state, LocalTableFunctionState &local_state,
DataChunk &chunk) override;
void FinishFile(ClientContext &context, GlobalTableFunctionState &gstate_p) override;
double GetProgressInFile(ClientContext &context) override;
public:
void InitializeScan(ClientContext &context, ParquetReaderScanState &state, vector<idx_t> groups_to_read);
void Scan(ClientContext &context, ParquetReaderScanState &state, DataChunk &output);
idx_t NumRows() const;
idx_t NumRowGroups() const;
const duckdb_parquet::FileMetaData *GetFileMetadata() const;
uint32_t Read(duckdb_apache::thrift::TBase &object, TProtocol &iprot);
uint32_t ReadData(duckdb_apache::thrift::protocol::TProtocol &iprot, const data_ptr_t buffer,
const uint32_t buffer_size);
unique_ptr<BaseStatistics> ReadStatistics(const string &name);
CachingFileHandle &GetHandle() {
return *file_handle;
}
static unique_ptr<BaseStatistics> ReadStatistics(ClientContext &context, ParquetOptions parquet_options,
shared_ptr<ParquetFileMetadataCache> metadata, const string &name);
static unique_ptr<BaseStatistics> ReadStatistics(const ParquetUnionData &union_data, const string &name);
LogicalType DeriveLogicalType(const SchemaElement &s_ele, ParquetColumnSchema &schema) const;
void AddVirtualColumn(column_t virtual_column_id) override;
void GetPartitionStats(vector<PartitionStatistics> &result);
static void GetPartitionStats(const duckdb_parquet::FileMetaData &metadata, vector<PartitionStatistics> &result);
static bool MetadataCacheEnabled(ClientContext &context);
static shared_ptr<ParquetFileMetadataCache> GetMetadataCacheEntry(ClientContext &context, const OpenFileInfo &file);
private:
//! Construct a parquet reader but **do not** open a file, used in ReadStatistics only
ParquetReader(ClientContext &context, ParquetOptions parquet_options,
shared_ptr<ParquetFileMetadataCache> metadata);
void InitializeSchema(ClientContext &context);
bool ScanInternal(ClientContext &context, ParquetReaderScanState &state, DataChunk &output);
//! Parse the schema of the file
unique_ptr<ParquetColumnSchema> ParseSchema(ClientContext &context);
ParquetColumnSchema ParseSchemaRecursive(idx_t depth, idx_t max_define, idx_t max_repeat, idx_t &next_schema_idx,
idx_t &next_file_idx, ClientContext &context);
unique_ptr<ColumnReader> CreateReader(ClientContext &context);
unique_ptr<ColumnReader> CreateReaderRecursive(ClientContext &context, const vector<ColumnIndex> &indexes,
const ParquetColumnSchema &schema);
const duckdb_parquet::RowGroup &GetGroup(ParquetReaderScanState &state);
uint64_t GetGroupCompressedSize(ParquetReaderScanState &state);
idx_t GetGroupOffset(ParquetReaderScanState &state);
// Group span is the distance between the min page offset and the max page offset plus the max page compressed size
uint64_t GetGroupSpan(ParquetReaderScanState &state);
void PrepareRowGroupBuffer(ParquetReaderScanState &state, idx_t out_col_idx);
ParquetColumnSchema ParseColumnSchema(const SchemaElement &s_ele, idx_t max_define, idx_t max_repeat,
idx_t schema_index, idx_t column_index,
ParquetColumnSchemaType type = ParquetColumnSchemaType::COLUMN);
MultiFileColumnDefinition ParseColumnDefinition(const duckdb_parquet::FileMetaData &file_meta_data,
ParquetColumnSchema &element);
private:
unique_ptr<CachingFileHandle> file_handle;
};
} // namespace duckdb

View File

@@ -0,0 +1,158 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_rle_bp_decoder.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "decode_utils.hpp"
#include "parquet_types.h"
#include "resizable_buffer.hpp"
#include "thrift_tools.hpp"
namespace duckdb {
class RleBpDecoder {
public:
/// Create a decoder object. buffer/buffer_len is the decoded data.
/// bit_width is the width of each value (before encoding).
RleBpDecoder(data_ptr_t buffer, uint32_t buffer_len, uint32_t bit_width)
: buffer_(buffer, buffer_len), bit_width_(bit_width), current_value_(0), repeat_count_(0), literal_count_(0) {
if (bit_width >= 64) {
throw std::runtime_error("Decode bit width too large");
}
byte_encoded_len = ((bit_width_ + 7) / 8);
max_val = (uint64_t(1) << bit_width_) - 1;
}
template <class T>
bool HasRepeatedBatch(const uint32_t batch_size, const T value) {
if (repeat_count_ == 0 && literal_count_ == 0) {
NextCounts();
}
return repeat_count_ >= batch_size && current_value_ == static_cast<uint64_t>(value);
}
template <typename T>
void GetRepeatedBatch(const uint32_t batch_size, const T value) {
D_ASSERT(repeat_count_ >= batch_size && current_value_ == static_cast<uint64_t>(value));
repeat_count_ -= batch_size;
}
template <typename T>
void GetBatch(data_ptr_t values_target_ptr, const uint32_t batch_size) {
auto values = reinterpret_cast<T *>(values_target_ptr);
uint32_t values_read = 0;
while (values_read < batch_size) {
if (repeat_count_ > 0) {
auto repeat_batch = MinValue<uint32_t>(batch_size - values_read, repeat_count_);
std::fill_n(values + values_read, repeat_batch, static_cast<T>(current_value_));
repeat_count_ -= repeat_batch;
values_read += repeat_batch;
} else if (literal_count_ > 0) {
auto literal_batch = MinValue<uint32_t>(batch_size - values_read, literal_count_);
ParquetDecodeUtils::BitUnpack<T>(buffer_, bitpack_pos, values + values_read, literal_batch, bit_width_);
literal_count_ -= literal_batch;
values_read += literal_batch;
} else {
NextCounts();
}
}
D_ASSERT(values_read == batch_size);
}
void Skip(uint32_t batch_size) {
uint32_t values_skipped = 0;
while (values_skipped < batch_size) {
if (repeat_count_ > 0) {
auto repeat_batch = MinValue<uint32_t>(batch_size - values_skipped, repeat_count_);
repeat_count_ -= repeat_batch;
values_skipped += repeat_batch;
} else if (literal_count_ > 0) {
auto literal_batch = MinValue<uint32_t>(batch_size - values_skipped, literal_count_);
ParquetDecodeUtils::Skip(buffer_, bitpack_pos, literal_batch, bit_width_);
literal_count_ -= literal_batch;
values_skipped += literal_batch;
} else {
NextCounts();
}
}
D_ASSERT(values_skipped == batch_size);
}
static uint8_t ComputeBitWidth(idx_t val) {
if (val == 0) {
return 0;
}
uint8_t ret = 1;
while ((((idx_t)1u << (idx_t)ret) - 1) < val) {
ret++;
}
return ret;
}
private:
ByteBuffer buffer_;
/// Number of bits needed to encode the value. Must be less than 64.
uint32_t bit_width_;
uint64_t current_value_;
uint32_t repeat_count_;
uint32_t literal_count_;
uint8_t byte_encoded_len;
uint64_t max_val;
uint8_t bitpack_pos = 0;
/// Fills literal_count_ and repeat_count_ with the counts of the next run.
template <bool CHECKED>
void NextCountsTemplated() {
// Read the next run's indicator int, it could be a literal or repeated run.
// The int is encoded as a vlq-encoded value.
if (bitpack_pos != 0) {
if (CHECKED) {
buffer_.inc(1);
} else {
buffer_.unsafe_inc(1);
}
bitpack_pos = 0;
}
auto indicator_value = ParquetDecodeUtils::VarintDecode<uint32_t, CHECKED>(buffer_);
// lsb indicates if it is a literal run or repeated run
bool is_literal = indicator_value & 1;
if (is_literal) {
literal_count_ = (indicator_value >> 1) * 8;
} else {
repeat_count_ = indicator_value >> 1;
// (ARROW-4018) this is not big-endian compatible, lol
current_value_ = 0;
if (CHECKED) {
buffer_.available(byte_encoded_len);
}
for (auto i = 0; i < byte_encoded_len; i++) {
auto next_byte = Load<uint8_t>(buffer_.ptr + i);
current_value_ |= (next_byte << (i * 8));
}
buffer_.unsafe_inc(byte_encoded_len);
// sanity check
if (repeat_count_ > 0 && current_value_ > max_val) {
throw std::runtime_error("Payload value bigger than allowed. Corrupted file?");
}
}
}
void NextCounts() {
if (buffer_.check_available(byte_encoded_len + sizeof(uint32_t) + 2)) {
NextCountsTemplated<false>();
} else {
NextCountsTemplated<true>();
}
}
};
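// Example (illustrative sketch): decoding a single RLE run with RleBpDecoder. The two bytes
// below encode "the value 3 repeated 8 times" at a bit width of 2: the varint indicator
// (8 << 1) | 0 = 0x10 marks a repeated run of length 8, and the next byte carries the value,
// padded up to a whole byte. ExampleDecodeRleRun is a made-up helper, not part of the reader.
inline void ExampleDecodeRleRun() {
	uint8_t encoded[] = {0x10, 0x03};
	RleBpDecoder decoder(encoded, sizeof(encoded), 2);
	uint8_t values[8];
	decoder.GetBatch<uint8_t>(values, 8); // values now holds {3, 3, 3, 3, 3, 3, 3, 3}
}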
} // namespace duckdb

View File

@@ -0,0 +1,155 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_rle_bp_encoder.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "decode_utils.hpp"
namespace duckdb {
class RleBpEncoder {
public:
explicit RleBpEncoder(uint32_t bit_width_p) : bit_width(bit_width_p), byte_width((bit_width + 7) / 8) {
}
public:
void BeginWrite() {
rle_count = 0;
bp_block_count = 0;
}
void WriteValue(WriteStream &writer, const uint32_t &value) {
if (bp_block_count != 0) {
// We already committed to a BP run
D_ASSERT(rle_count == 0);
bp_block[bp_block_count++] = value;
if (bp_block_count == BP_BLOCK_SIZE) {
WriteRun(writer);
}
return;
}
if (rle_count == 0) {
// Starting fresh, try for an RLE run first
rle_value = value;
rle_count = 1;
return;
}
// We're trying for an RLE run
if (rle_value == value) {
// Same as current RLE value
rle_count++;
return;
}
// Value differs from current RLE value
if (rle_count >= MINIMUM_RLE_COUNT) {
// We have enough values for an RLE run
WriteRun(writer);
rle_value = value;
rle_count = 1;
return;
}
// Not enough values, convert and commit to a BP run
D_ASSERT(bp_block_count == 0);
for (idx_t i = 0; i < rle_count; i++) {
bp_block[bp_block_count++] = rle_value;
}
bp_block[bp_block_count++] = value;
rle_count = 0;
}
void WriteMany(WriteStream &writer, uint32_t value, idx_t count) {
if (rle_count != 0) {
// If an RLE run is going on, write a single value to either finish it or convert to BP
WriteValue(writer, value);
count--;
}
if (bp_block_count != 0) {
// If a BP run is going on, finish it
while (bp_block_count != 0 && count > 0) {
WriteValue(writer, value);
count--;
}
}
// Set remaining as current RLE run
rle_value = value;
rle_count += count;
}
void FinishWrite(WriteStream &writer) {
WriteRun(writer);
}
private:
//! Meta information
uint32_t bit_width;
uint32_t byte_width;
//! RLE stuff
static constexpr idx_t MINIMUM_RLE_COUNT = 4;
uint32_t rle_value;
idx_t rle_count;
//! BP stuff
static constexpr idx_t BP_BLOCK_SIZE = 256;
static_assert(BP_BLOCK_SIZE % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0,
"BP_BLOCK_SIZE must be divisible by BITPACKING_ALGORITHM_GROUP_SIZE");
uint32_t bp_block[BP_BLOCK_SIZE] = {0};
uint32_t bp_block_packed[BP_BLOCK_SIZE] = {0};
idx_t bp_block_count;
private:
void WriteRun(WriteStream &writer) {
if (rle_count != 0) {
WriteCurrentBlockRLE(writer);
} else {
WriteCurrentBlockBP(writer);
}
}
void WriteCurrentBlockRLE(WriteStream &writer) {
ParquetDecodeUtils::VarintEncode(rle_count << 1 | 0, writer); // (... | 0) signals RLE run
D_ASSERT(rle_value >> (byte_width * 8) == 0);
switch (byte_width) {
case 1:
writer.Write<uint8_t>(rle_value);
break;
case 2:
writer.Write<uint16_t>(rle_value);
break;
case 3:
writer.Write<uint8_t>(rle_value & 0xFF);
writer.Write<uint8_t>((rle_value >> 8) & 0xFF);
writer.Write<uint8_t>((rle_value >> 16) & 0xFF);
break;
case 4:
writer.Write<uint32_t>(rle_value);
break;
default:
throw InternalException("unsupported byte width for RLE encoding");
}
rle_count = 0;
}
void WriteCurrentBlockBP(WriteStream &writer) {
if (bp_block_count == 0) {
return;
}
ParquetDecodeUtils::VarintEncode(BP_BLOCK_SIZE / 8 << 1 | 1, writer); // (... | 1) signals BP run
ParquetDecodeUtils::BitPackAligned(bp_block, data_ptr_cast(bp_block_packed), BP_BLOCK_SIZE, bit_width);
writer.WriteData(data_ptr_cast(bp_block_packed), BP_BLOCK_SIZE * bit_width / 8);
bp_block_count = 0;
}
};
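// Example (illustrative sketch): writing definition levels with RleBpEncoder. Runs of at least
// MINIMUM_RLE_COUNT equal values become RLE runs; shorter runs are folded into a bit-packed
// block. ExampleEncodeDefineLevels is a made-up helper; any WriteStream implementation works
// as the sink.
inline void ExampleEncodeDefineLevels(WriteStream &out) {
	RleBpEncoder encoder(1); // bit width 1: levels are either 0 (NULL) or 1 (valid)
	encoder.BeginWrite();
	encoder.WriteMany(out, 1, 100); // 100 valid rows, buffered as a single RLE run
	encoder.WriteValue(out, 0);     // a NULL row; the differing value flushes the run of 100
	encoder.FinishWrite(out);       // flushes the trailing run holding the single 0
}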
} // namespace duckdb

View File

@@ -0,0 +1,49 @@
#pragma once
#include "duckdb/common/serializer/buffered_file_writer.hpp"
#include "duckdb/common/case_insensitive_map.hpp"
#include "duckdb/common/types/variant.hpp"
namespace duckdb {
struct ShreddingType;
struct ChildShreddingTypes {
public:
ChildShreddingTypes();
public:
ChildShreddingTypes Copy() const;
public:
void Serialize(Serializer &serializer) const;
static ChildShreddingTypes Deserialize(Deserializer &source);
public:
unique_ptr<case_insensitive_map_t<ShreddingType>> types;
};
struct ShreddingType {
public:
ShreddingType();
explicit ShreddingType(const LogicalType &type);
public:
ShreddingType Copy() const;
public:
void Serialize(Serializer &serializer) const;
static ShreddingType Deserialize(Deserializer &source);
public:
static ShreddingType GetShreddingTypes(const Value &val);
void AddChild(const string &name, ShreddingType &&child);
optional_ptr<const ShreddingType> GetChild(const string &name) const;
public:
bool set = false;
LogicalType type;
ChildShreddingTypes children;
};
} // namespace duckdb

View File

@@ -0,0 +1,111 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_statistics.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "duckdb/storage/statistics/base_statistics.hpp"
#include "parquet_types.h"
#include "resizable_buffer.hpp"
namespace duckdb {
using duckdb_parquet::ColumnChunk;
using duckdb_parquet::SchemaElement;
struct LogicalType;
struct ParquetColumnSchema;
class ResizeableBuffer;
struct ParquetStatisticsUtils {
static unique_ptr<BaseStatistics> TransformColumnStatistics(const ParquetColumnSchema &reader,
const vector<ColumnChunk> &columns, bool can_have_nan);
static Value ConvertValue(const LogicalType &type, const ParquetColumnSchema &schema_ele, const std::string &stats);
static bool BloomFilterSupported(const LogicalTypeId &type_id);
static bool BloomFilterExcludes(const TableFilter &filter, const duckdb_parquet::ColumnMetaData &column_meta_data,
duckdb_apache::thrift::protocol::TProtocol &file_proto, Allocator &allocator);
static unique_ptr<BaseStatistics> CreateNumericStats(const LogicalType &type, const ParquetColumnSchema &schema_ele,
const duckdb_parquet::Statistics &parquet_stats);
private:
static Value ConvertValueInternal(const LogicalType &type, const ParquetColumnSchema &schema_ele,
const std::string &stats);
};
class ParquetBloomFilter {
static constexpr const idx_t DEFAULT_BLOCK_COUNT = 32; // 4k filter
public:
ParquetBloomFilter(idx_t num_entries, double bloom_filter_false_positive_ratio);
ParquetBloomFilter(unique_ptr<ResizeableBuffer> data_p);
void FilterInsert(uint64_t x);
bool FilterCheck(uint64_t x);
void Shrink(idx_t new_block_count);
double OneRatio();
ResizeableBuffer *Get();
private:
unique_ptr<ResizeableBuffer> data;
idx_t block_count;
};
// see https://github.com/apache/parquet-format/blob/master/BloomFilter.md
struct ParquetBloomBlock {
struct ParquetBloomMaskResult {
uint8_t bit_set[8] = {0};
};
uint32_t block[8] = {0};
static bool check_bit(uint32_t &x, const uint8_t i) {
D_ASSERT(i < 32);
return (x >> i) & (uint32_t)1;
}
static void set_bit(uint32_t &x, const uint8_t i) {
D_ASSERT(i < 32);
x |= (uint32_t)1 << i;
D_ASSERT(check_bit(x, i));
}
static ParquetBloomMaskResult Mask(uint32_t x) {
static const uint32_t parquet_bloom_salt[8] = {0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};
ParquetBloomMaskResult result;
for (idx_t i = 0; i < 8; i++) {
result.bit_set[i] = (x * parquet_bloom_salt[i]) >> 27;
}
return result;
}
static void BlockInsert(ParquetBloomBlock &b, uint32_t x) {
auto masked = Mask(x);
for (idx_t i = 0; i < 8; i++) {
set_bit(b.block[i], masked.bit_set[i]);
D_ASSERT(check_bit(b.block[i], masked.bit_set[i]));
}
}
static bool BlockCheck(ParquetBloomBlock &b, uint32_t x) {
auto masked = Mask(x);
for (idx_t i = 0; i < 8; i++) {
if (!check_bit(b.block[i], masked.bit_set[i])) {
return false;
}
}
return true;
}
};
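// Example (illustrative sketch): each ParquetBloomBlock is a 256-bit split block. Mask derives
// eight bit positions from a 32-bit key using the salt constants above, BlockInsert sets them
// and BlockCheck tests them. Picking which block of the filter a hash lands in is handled by
// ParquetBloomFilter and is not shown here; ExampleBlockProbe is a made-up helper.
inline bool ExampleBlockProbe(ParquetBloomBlock &block, uint64_t hash) {
	auto key = static_cast<uint32_t>(hash); // the lower 32 bits select the bits within the block
	ParquetBloomBlock::BlockInsert(block, key);
	return ParquetBloomBlock::BlockCheck(block, key); // always true once the key is inserted
}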
} // namespace duckdb

View File

@@ -0,0 +1,621 @@
#pragma once
namespace duckdb {
class StripeStreams {
public:
virtual ~StripeStreams() = default;
/**
* get column selector for current stripe reading session
* @return column selector will hold column projection info
*/
virtual const dwio::common::ColumnSelector &getColumnSelector() const = 0;
// Get row reader options
virtual const dwio::common::RowReaderOptions &getRowReaderOptions() const = 0;
/**
* Get the encoding for the given column for this stripe.
*/
virtual const proto::ColumnEncoding &getEncoding(const EncodingKey &) const = 0;
/**
* Get the stream for the given column/kind in this stripe.
* @param streamId stream identifier object
* @param throwIfNotFound fail if a stream is required and not found
* @return the new stream
*/
virtual unique_ptr<SeekableInputStream> getStream(const StreamIdentifier &si, bool throwIfNotFound) const = 0;
/**
* visit all streams of given node and execute visitor logic
* return number of streams visited
*/
virtual uint32_t visitStreamsOfNode(uint32_t node,
std::function<void(const StreamInformation &)> visitor) const = 0;
/**
* Get the value of useVInts for the given column in this stripe.
* Defaults to true.
* @param streamId stream identifier
*/
virtual bool getUseVInts(const StreamIdentifier &streamId) const = 0;
/**
* Get the memory pool for this reader.
*/
virtual memory::MemoryPool &getMemoryPool() const = 0;
/**
* Get the RowGroupIndex.
* @return a vector of RowIndex belonging to the stripe
*/
virtual unique_ptr<proto::RowIndex> getRowGroupIndex(const StreamIdentifier &si) const = 0;
/**
* Get stride index provider which is used by string dictionary reader to
* get the row index stride index where next() happens
*/
virtual const StrideIndexProvider &getStrideIndexProvider() const = 0;
};
class ColumnReader {
public:
ColumnReader(const EncodingKey &ek, StripeStreams &stripe);
virtual ~ColumnReader() = default;
/**
* Skip number of specified rows.
* @param numValues the number of values to skip
* @return the number of non-null values skipped
*/
virtual uint64_t skip(uint64_t numValues);
/**
* Read the next group of values into a RowVector.
* @param numValues the number of values to read
* @param vector to read into
*/
virtual void next(uint64_t numValues, VectorPtr &result, const uint64_t *nulls = nullptr) = 0;
};
class SelectiveColumnReader : public ColumnReader {
public:
static constexpr uint64_t kStringBufferSize = 16 * 1024;
SelectiveColumnReader(const EncodingKey &ek, StripeStreams &stripe, common::ScanSpec *scanSpec);
/**
* Read the next group of values into a RowVector.
* @param numValues the number of values to read
* @param vector to read into
*/
void next(uint64_t /*numValues*/, VectorPtr & /*result*/, const uint64_t * /*incomingNulls*/) override {
DATALIB_CHECK(false) << "next() is only defined in SelectiveStructColumnReader";
}
// Creates a reader for the given stripe.
static unique_ptr<SelectiveColumnReader> build(const std::shared_ptr<const dwio::common::TypeWithId> &requestedType,
const std::shared_ptr<const dwio::common::TypeWithId> &dataType,
StripeStreams &stripe, common::ScanSpec *scanSpec,
uint32_t sequence = 0);
// Seeks to offset and reads the rows in 'rows' and applies
// filters and value processing as given by 'scanSpec supplied at
// construction. 'offset' is relative to start of stripe. 'rows' are
// relative to 'offset', so that row 0 is the 'offset'th row from
// start of stripe. 'rows' is expected to stay constant
// between this and the next call to read.
virtual void read(vector_size_t offset, RowSet rows, const uint64_t *incomingNulls) = 0;
// Extracts the values at 'rows' into '*result'. May rewrite or
// reallocate '*result'. 'rows' must be the same set or a subset of
// 'rows' passed to the last 'read().
virtual void getValues(RowSet rows, VectorPtr *result) = 0;
// Returns the rows that were selected/visited by the last
// read(). If 'this' has no filter, returns 'rows' passed to last
// read().
const RowSet outputRows() const {
if (scanSpec_->hasFilter()) {
return outputRows_;
}
return inputRows_;
}
// Advances to 'offset', so that the next item to be read is the
// offset-th from the start of stripe.
void seekTo(vector_size_t offset, bool readsNullsOnly);
// The below functions are called from ColumnVisitor to fill the result set.
inline void addOutputRow(vector_size_t row) {
outputRows_.push_back(row);
}
template <typename T>
inline void addNull() {
DATALIB_DCHECK(rawResultNulls_ && rawValues_ && (numValues_ + 1) * sizeof(T) < rawSize_);
anyNulls_ = true;
bits::setBit(rawResultNulls_, numValues_);
reinterpret_cast<T *>(rawValues_)[numValues_] = T();
numValues_++;
}
template <typename T>
inline void addValue(const T value) {
// @lint-ignore-every HOWTOEVEN ConstantArgumentPassByValue
static_assert(std::is_pod<T>::value, "General case of addValue is only for primitive types");
DATALIB_DCHECK(rawValues_ && (numValues_ + 1) * sizeof(T) < rawSize_);
reinterpret_cast<T *>(rawValues_)[numValues_] = value;
numValues_++;
}
void dropResults(vector_size_t count) {
outputRows_.resize(outputRows_.size() - count);
numValues_ -= count;
}
common::ScanSpec *scanSpec() const {
return scanSpec_;
}
auto readOffset() const {
return readOffset_;
}
void setReadOffset(vector_size_t readOffset) {
readOffset_ = readOffset;
}
protected:
static constexpr int8_t kNoValueSize = -1;
template <typename T>
void ensureValuesCapacity(vector_size_t numRows);
void prepareNulls(vector_size_t numRows, bool needNulls);
template <typename T>
void filterNulls(RowSet rows, bool isNull, bool extractValues);
template <typename T>
void prepareRead(vector_size_t offset, RowSet rows, const uint64_t *incomingNulls);
void setOutputRows(RowSet rows) {
outputRows_.resize(rows.size());
if (!rows.size()) {
return;
}
memcpy(outputRows_.data(), &rows[0], rows.size() * sizeof(vector_size_t));
}
template <typename T, typename TVector>
void getFlatValues(RowSet rows, VectorPtr *result);
template <typename T, typename TVector>
void compactScalarValues(RowSet rows);
void addStringValue(folly::StringPiece value);
// Specification of filters, value extraction, pruning etc. The
// spec is assigned at construction and the contents may change at
// run time based on adaptation. Owned by caller.
common::ScanSpec *const scanSpec_;
// Row number after last read row, relative to stripe start.
vector_size_t readOffset_ = 0;
// The rows to process in read(). References memory supplied by
// caller. The values must remain live until the next call to read().
RowSet inputRows_;
// Rows passing the filter in readWithVisitor. Must stay
// constant between consecutive calls to read().
vector<vector_size_t> outputRows_;
// The row number corresponding to each element in 'values_'
vector<vector_size_t> valueRows_;
// The set of all nulls in the range of read(). Created when first
// needed and then reused. Not returned to callers.
BufferPtr nullsInReadRange_;
// Nulls buffer for readWithVisitor. Not set if no nulls. 'numValues'
// is the index of the first non-set bit.
BufferPtr resultNulls_;
uint64_t *rawResultNulls_ = nullptr;
// Buffer for gathering scalar values in readWithVisitor.
BufferPtr values_;
// Writable content in 'values'
void *rawValues_ = nullptr;
vector_size_t numValues_ = 0;
// Size of fixed width value in 'rawValues'. For integers, values
// are read at 64 bit width and can be compacted or extracted at a
// different width.
int8_t valueSize_ = kNoValueSize;
// Buffers backing the StringViews in 'values' when reading strings.
vector<BufferPtr> stringBuffers_;
// Writable contents of 'stringBuffers_.back()'.
char *rawStringBuffer_ = nullptr;
// Total writable bytes in 'rawStringBuffer_'.
int32_t rawStringSize_ = 0;
// Number of written bytes in 'rawStringBuffer_'.
uint32_t rawStringUsed_ = 0;
// True if last read() added any nulls.
bool anyNulls_ = false;
// True if all values in scope for last read() are null.
bool allNull_ = false;
};
struct ExtractValues {
static constexpr bool kSkipNulls = false;
bool acceptsNulls() const {
return true;
}
template <typename V>
void addValue(vector_size_t /*rowIndex*/, V /*value*/) {
}
void addNull(vector_size_t /*rowIndex*/) {
}
};
class Filter {
protected:
Filter(bool deterministic, bool nullAllowed, FilterKind kind)
: nullAllowed_(nullAllowed), deterministic_(deterministic), kind_(kind) {
}
public:
virtual ~Filter() = default;
// Templates parametrized on filter need to know determinism at compile
// time. If this is false, deterministic() will be consulted at
// runtime.
static constexpr bool deterministic = true;
FilterKind kind() const {
return kind_;
}
virtual unique_ptr<Filter> clone() const = 0;
/**
* A filter becomes non-deterministic when applies to nested column,
* e.g. a[1] > 10 is non-deterministic because > 10 filter applies only to
* some positions, e.g. first entry in a set of entries that correspond to a
* single top-level position.
*/
virtual bool isDeterministic() const {
return deterministic_;
}
/**
* When a filter applied to a nested column fails, the whole top-level
* position should fail. To enable this functionality, the filter keeps track
* of the boundaries of top-level positions and allows the caller to find out
* where the current top-level position started and how far it continues.
* @return number of positions from the start of the current top-level
* position up to the current position (excluding current position)
*/
virtual int getPrecedingPositionsToFail() const {
return 0;
}
/**
* @return number of positions remaining until the end of the current
* top-level position
*/
virtual int getSucceedingPositionsToFail() const {
return 0;
}
virtual bool testNull() const {
return nullAllowed_;
}
/**
* Used to apply is [not] null filters to complex types, e.g.
* a[1] is null AND a[3] is not null, where a is an array(array(T)).
*
* In these case, the exact values are not known, but it is known whether they
* are null or not. Furthermore, for some positions only nulls are allowed
* (a[1] is null), for others only non-nulls (a[3] is not null), and for the
* rest both are allowed (a[2] and a[N], where N > 3).
*/
virtual bool testNonNull() const {
DWIO_RAISE("not supported");
}
virtual bool testInt64(int64_t /* unused */) const {
DWIO_RAISE("not supported");
}
virtual bool testDouble(double /* unused */) const {
DWIO_RAISE("not supported");
}
virtual bool testFloat(float /* unused */) const {
DWIO_RAISE("not supported");
}
virtual bool testBool(bool /* unused */) const {
DWIO_RAISE("not supported");
}
virtual bool testBytes(const char * /* unused */, int32_t /* unused */) const {
DWIO_RAISE("not supported");
}
/**
* Filters like string equality and IN, as well as conditions on cardinality
* of lists and maps can be at least partly decided by looking at lengths
* alone. If this is false, then no further checks are needed. If true,
* eventual filters on the data itself need to be evaluated.
*/
virtual bool testLength(int32_t /* unused */) const {
DWIO_RAISE("not supported");
}
protected:
const bool nullAllowed_;
private:
const bool deterministic_;
const FilterKind kind_;
};
// Template parameter for controlling filtering and action on a set of rows.
template <typename T, typename TFilter, typename ExtractValues, bool isDense>
class ColumnVisitor {
public:
using FilterType = TFilter;
static constexpr bool dense = isDense;
ColumnVisitor(TFilter &filter, SelectiveColumnReader *reader, const RowSet &rows, ExtractValues values)
: filter_(filter), reader_(reader), allowNulls_(!TFilter::deterministic || filter.testNull()), rows_(&rows[0]),
numRows_(rows.size()), rowIndex_(0), values_(values) {
}
bool allowNulls() {
if (ExtractValues::kSkipNulls && TFilter::deterministic) {
return false;
}
return allowNulls_ && values_.acceptsNulls();
}
vector_size_t start() {
return isDense ? 0 : rowAt(0);
}
// Tests for a null value and processes it. If the value is not
// null, returns 0 and has no effect. If the value is null, advances
// to the next non-null value in 'rows_'. Returns the number of
// values (not including nulls) to skip to get to the next non-null.
// If there is no next non-null in 'rows_', sets 'atEnd'. If 'atEnd'
// is set and a non-zero skip is returned, the caller must perform
// the skip before returning.
FOLLY_ALWAYS_INLINE vector_size_t checkAndSkipNulls(const uint64_t *nulls, vector_size_t &current, bool &atEnd) {
auto testRow = currentRow();
// Check that the caller and the visitor are in sync about current row.
DATALIB_DCHECK(current == testRow);
uint32_t nullIndex = testRow >> 6;
uint64_t nullWord = nulls[nullIndex];
if (!nullWord) {
return 0;
}
uint8_t nullBit = testRow & 63;
if ((nullWord & (1UL << nullBit)) == 0) {
return 0;
}
// We have a null. We find the next non-null.
if (++rowIndex_ >= numRows_) {
atEnd = true;
return 0;
}
auto rowOfNullWord = testRow - nullBit;
if (isDense) {
if (nullBit == 63) {
nullBit = 0;
rowOfNullWord += 64;
nullWord = nulls[++nullIndex];
} else {
++nullBit;
// set all the bits below the row to null.
nullWord |= f4d::bits::lowMask(nullBit);
}
for (;;) {
auto nextNonNull = count_trailing_zeros(~nullWord);
if (rowOfNullWord + nextNonNull >= numRows_) {
// Nulls all the way to the end.
atEnd = true;
return 0;
}
if (nextNonNull < 64) {
DATALIB_CHECK(rowIndex_ <= rowOfNullWord + nextNonNull);
rowIndex_ = rowOfNullWord + nextNonNull;
current = currentRow();
return 0;
}
rowOfNullWord += 64;
nullWord = nulls[++nullIndex];
}
} else {
// Sparse row numbers. We find the first non-null and count
// how many non-nulls on rows not in 'rows_' we skipped.
int32_t toSkip = 0;
nullWord |= f4d::bits::lowMask(nullBit);
for (;;) {
testRow = currentRow();
while (testRow >= rowOfNullWord + 64) {
toSkip += __builtin_popcountll(~nullWord);
nullWord = nulls[++nullIndex];
rowOfNullWord += 64;
}
// testRow is inside nullWord. See if non-null.
nullBit = testRow & 63;
if ((nullWord & (1UL << nullBit)) == 0) {
toSkip += __builtin_popcountll(~nullWord & f4d::bits::lowMask(nullBit));
current = testRow;
return toSkip;
}
if (++rowIndex_ >= numRows_) {
// We end with a null. Add the non-nulls below the final null.
toSkip += __builtin_popcountll(~nullWord & f4d::bits::lowMask(testRow - rowOfNullWord));
atEnd = true;
return toSkip;
}
}
}
}
vector_size_t processNull(bool &atEnd) {
vector_size_t previous = currentRow();
if (filter_.testNull()) {
filterPassedForNull();
} else {
filterFailed();
}
if (++rowIndex_ >= numRows_) {
atEnd = true;
return rows_[numRows_ - 1] - previous;
}
if (TFilter::deterministic && isDense) {
return 0;
}
return currentRow() - previous - 1;
}
FOLLY_ALWAYS_INLINE vector_size_t process(T value, bool &atEnd) {
if (!TFilter::deterministic) {
auto previous = currentRow();
if (common::applyFilter(filter_, value)) {
filterPassed(value);
} else {
filterFailed();
}
if (++rowIndex_ >= numRows_) {
atEnd = true;
return rows_[numRows_ - 1] - previous;
}
return currentRow() - previous - 1;
}
// The filter passes or fails and we go to the next row if any.
if (common::applyFilter(filter_, value)) {
filterPassed(value);
} else {
filterFailed();
}
if (++rowIndex_ >= numRows_) {
atEnd = true;
return 0;
}
if (isDense) {
return 0;
}
return currentRow() - rows_[rowIndex_ - 1] - 1;
}
inline vector_size_t rowAt(vector_size_t index) {
if (isDense) {
return index;
}
return rows_[index];
}
vector_size_t currentRow() {
if (isDense) {
return rowIndex_;
}
return rows_[rowIndex_];
}
vector_size_t numRows() {
return numRows_;
}
void filterPassed(T value) {
addResult(value);
if (!std::is_same<TFilter, common::AlwaysTrue>::value) {
addOutputRow(currentRow());
}
}
inline void filterPassedForNull() {
addNull();
if (!std::is_same<TFilter, common::AlwaysTrue>::value) {
addOutputRow(currentRow());
}
}
FOLLY_ALWAYS_INLINE void filterFailed();
inline void addResult(T value);
inline void addNull();
inline void addOutputRow(vector_size_t row);
protected:
TFilter &filter_;
SelectiveColumnReader *reader_;
const bool allowNulls_;
const vector_size_t *rows_;
vector_size_t numRows_;
vector_size_t rowIndex_;
ExtractValues values_;
};
} // namespace duckdb

View File

@@ -0,0 +1,44 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_timestamp.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
namespace duckdb {
struct Int96 {
uint32_t value[3];
};
timestamp_t ImpalaTimestampToTimestamp(const Int96 &raw_ts);
timestamp_ns_t ImpalaTimestampToTimestampNS(const Int96 &raw_ts);
Int96 TimestampToImpalaTimestamp(timestamp_t &ts);
timestamp_t ParquetTimestampMicrosToTimestamp(const int64_t &raw_ts);
timestamp_t ParquetTimestampMsToTimestamp(const int64_t &raw_ts);
timestamp_t ParquetTimestampNsToTimestamp(const int64_t &raw_ts);
timestamp_ns_t ParquetTimestampMsToTimestampNs(const int64_t &raw_ms);
timestamp_ns_t ParquetTimestampUsToTimestampNs(const int64_t &raw_us);
timestamp_ns_t ParquetTimestampNsToTimestampNs(const int64_t &raw_ns);
date_t ParquetIntToDate(const int32_t &raw_date);
dtime_t ParquetMsIntToTime(const int32_t &raw_millis);
dtime_t ParquetIntToTime(const int64_t &raw_micros);
dtime_t ParquetNsIntToTime(const int64_t &raw_nanos);
dtime_ns_t ParquetMsIntToTimeNs(const int32_t &raw_millis);
dtime_ns_t ParquetUsIntToTimeNs(const int64_t &raw_micros);
dtime_ns_t ParquetIntToTimeNs(const int64_t &raw_nanos);
dtime_tz_t ParquetIntToTimeMsTZ(const int32_t &raw_millis);
dtime_tz_t ParquetIntToTimeTZ(const int64_t &raw_micros);
dtime_tz_t ParquetIntToTimeNsTZ(const int64_t &raw_nanos);
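// Example (illustrative sketch): the arithmetic behind the Int96 conversions, written out for
// clarity under the usual Impala layout assumption: value[0] and value[1] hold the nanoseconds
// elapsed within the day (low word first) and value[2] holds the Julian day number, with
// Julian day 2440588 corresponding to the Unix epoch. ExampleInt96ToMicrosSinceEpoch is a
// made-up helper, not the conversion used by the extension.
inline int64_t ExampleInt96ToMicrosSinceEpoch(const Int96 &raw_ts) {
	int64_t nanos_of_day = (static_cast<int64_t>(raw_ts.value[1]) << 32) | raw_ts.value[0];
	int64_t days_since_epoch = static_cast<int64_t>(raw_ts.value[2]) - 2440588;
	return days_since_epoch * 86400000000LL + nanos_of_day / 1000;
}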
} // namespace duckdb

View File

@@ -0,0 +1,182 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "duckdb/common/common.hpp"
#include "duckdb/common/optional_idx.hpp"
#include "duckdb/common/encryption_state.hpp"
#include "duckdb/common/exception.hpp"
#include "duckdb/common/mutex.hpp"
#include "duckdb/common/atomic.hpp"
#include "duckdb/common/serializer/buffered_file_writer.hpp"
#include "duckdb/common/types/column/column_data_collection.hpp"
#include "duckdb/function/copy_function.hpp"
#include "parquet_statistics.hpp"
#include "column_writer.hpp"
#include "parquet_field_id.hpp"
#include "parquet_shredding.hpp"
#include "parquet_types.h"
#include "geo_parquet.hpp"
#include "writer/parquet_write_stats.hpp"
#include "thrift/protocol/TCompactProtocol.h"
namespace duckdb {
class FileSystem;
class FileOpener;
class ParquetEncryptionConfig;
class ParquetStatsAccumulator;
class Serializer;
class Deserializer;
class ColumnWriterStatistics;
struct CopyFunctionFileStatistics;
struct PreparedRowGroup {
duckdb_parquet::RowGroup row_group;
vector<unique_ptr<ColumnWriterState>> states;
};
struct ParquetBloomFilterEntry {
unique_ptr<ParquetBloomFilter> bloom_filter;
idx_t row_group_idx;
idx_t column_idx;
};
enum class ParquetVersion : uint8_t {
V1 = 1, //! Excludes DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY, BYTE_STREAM_SPLIT
V2 = 2, //! Includes the encodings above
};
class ParquetWriter {
public:
ParquetWriter(ClientContext &context, FileSystem &fs, string file_name, vector<LogicalType> types,
vector<string> names, duckdb_parquet::CompressionCodec::type codec, ChildFieldIDs field_ids,
ShreddingType shredding_types, const vector<pair<string, string>> &kv_metadata,
shared_ptr<ParquetEncryptionConfig> encryption_config, optional_idx dictionary_size_limit,
idx_t string_dictionary_page_size_limit, bool enable_bloom_filters,
double bloom_filter_false_positive_ratio, int64_t compression_level, bool debug_use_openssl,
ParquetVersion parquet_version, GeoParquetVersion geoparquet_version);
~ParquetWriter();
public:
void PrepareRowGroup(ColumnDataCollection &buffer, PreparedRowGroup &result);
void FlushRowGroup(PreparedRowGroup &row_group);
void Flush(ColumnDataCollection &buffer);
void Finalize();
static duckdb_parquet::Type::type DuckDBTypeToParquetType(const LogicalType &duckdb_type);
static void SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele,
bool allow_geometry);
ClientContext &GetContext() {
return context;
}
duckdb_apache::thrift::protocol::TProtocol *GetProtocol() {
return protocol.get();
}
duckdb_parquet::CompressionCodec::type GetCodec() {
return codec;
}
duckdb_parquet::Type::type GetType(idx_t schema_idx) {
return file_meta_data.schema[schema_idx].type;
}
LogicalType GetSQLType(idx_t schema_idx) const {
return sql_types[schema_idx];
}
BufferedFileWriter &GetWriter() {
return *writer;
}
idx_t FileSize() {
return total_written;
}
optional_idx DictionarySizeLimit() const {
return dictionary_size_limit;
}
idx_t StringDictionaryPageSizeLimit() const {
return string_dictionary_page_size_limit;
}
bool EnableBloomFilters() const {
return enable_bloom_filters;
}
double BloomFilterFalsePositiveRatio() const {
return bloom_filter_false_positive_ratio;
}
int64_t CompressionLevel() const {
return compression_level;
}
idx_t NumberOfRowGroups() {
return num_row_groups;
}
ParquetVersion GetParquetVersion() const {
return parquet_version;
}
GeoParquetVersion GetGeoParquetVersion() const {
return geoparquet_version;
}
const string &GetFileName() const {
return file_name;
}
uint32_t Write(const duckdb_apache::thrift::TBase &object);
uint32_t WriteData(const const_data_ptr_t buffer, const uint32_t buffer_size);
GeoParquetFileMetadata &GetGeoParquetData();
static bool TryGetParquetType(const LogicalType &duckdb_type,
optional_ptr<duckdb_parquet::Type::type> type = nullptr);
void BufferBloomFilter(idx_t col_idx, unique_ptr<ParquetBloomFilter> bloom_filter);
void SetWrittenStatistics(CopyFunctionFileStatistics &written_stats);
void FlushColumnStats(idx_t col_idx, duckdb_parquet::ColumnChunk &chunk,
optional_ptr<ColumnWriterStatistics> writer_stats);
private:
void GatherWrittenStatistics();
private:
ClientContext &context;
string file_name;
vector<LogicalType> sql_types;
vector<string> column_names;
duckdb_parquet::CompressionCodec::type codec;
ChildFieldIDs field_ids;
ShreddingType shredding_types;
shared_ptr<ParquetEncryptionConfig> encryption_config;
optional_idx dictionary_size_limit;
idx_t string_dictionary_page_size_limit;
bool enable_bloom_filters;
double bloom_filter_false_positive_ratio;
int64_t compression_level;
bool debug_use_openssl;
shared_ptr<EncryptionUtil> encryption_util;
ParquetVersion parquet_version;
GeoParquetVersion geoparquet_version;
vector<ParquetColumnSchema> column_schemas;
unique_ptr<BufferedFileWriter> writer;
//! Atomics to reduce contention when rotating writes to multiple Parquet files
atomic<idx_t> total_written;
atomic<idx_t> num_row_groups;
std::shared_ptr<duckdb_apache::thrift::protocol::TProtocol> protocol;
duckdb_parquet::FileMetaData file_meta_data;
std::mutex lock;
vector<unique_ptr<ColumnWriter>> column_writers;
unique_ptr<GeoParquetFileMetadata> geoparquet_data;
vector<ParquetBloomFilterEntry> bloom_filters;
optional_ptr<CopyFunctionFileStatistics> written_stats;
unique_ptr<ParquetStatsAccumulator> stats_accumulator;
};
} // namespace duckdb

View File

@@ -0,0 +1,70 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/boolean_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
struct BooleanParquetValueConversion;
class BooleanColumnReader : public TemplatedColumnReader<bool, BooleanParquetValueConversion> {
public:
static constexpr const PhysicalType TYPE = PhysicalType::BOOL;
public:
BooleanColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
: TemplatedColumnReader<bool, BooleanParquetValueConversion>(reader, schema), byte_pos(0) {
}
uint8_t byte_pos;
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override {
byte_pos = 0;
TemplatedColumnReader<bool, BooleanParquetValueConversion>::InitializeRead(row_group_idx_p, columns,
protocol_p);
}
void ResetPage() override {
byte_pos = 0;
}
};
struct BooleanParquetValueConversion {
template <bool CHECKED>
static bool PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
auto &byte_pos = reader.Cast<BooleanColumnReader>().byte_pos;
bool ret = (*plain_data.ptr >> byte_pos) & 1;
if (++byte_pos == 8) {
byte_pos = 0;
if (CHECKED) {
plain_data.inc(1);
} else {
plain_data.unsafe_inc(1);
}
}
return ret;
}
template <bool CHECKED>
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
PlainRead<CHECKED>(plain_data, reader);
}
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
return plain_data.check_available((count + 7) / 8);
}
static idx_t PlainConstantSize() {
return 0;
}
};
} // namespace duckdb

View File

@@ -0,0 +1,46 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/callback_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
#include "parquet_reader.hpp"
namespace duckdb {
template <class PARQUET_PHYSICAL_TYPE, class DUCKDB_PHYSICAL_TYPE,
DUCKDB_PHYSICAL_TYPE (*FUNC)(const PARQUET_PHYSICAL_TYPE &input)>
class CallbackColumnReader
: public TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
CallbackParquetValueConversion<PARQUET_PHYSICAL_TYPE, DUCKDB_PHYSICAL_TYPE, FUNC>> {
using BaseType =
TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
CallbackParquetValueConversion<PARQUET_PHYSICAL_TYPE, DUCKDB_PHYSICAL_TYPE, FUNC>>;
public:
static constexpr const PhysicalType TYPE = PhysicalType::INVALID;
public:
CallbackColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
: TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
CallbackParquetValueConversion<PARQUET_PHYSICAL_TYPE, DUCKDB_PHYSICAL_TYPE, FUNC>>(
reader, schema) {
}
protected:
void Dictionary(shared_ptr<ResizeableBuffer> dictionary_data, idx_t num_entries) {
BaseType::AllocateDict(num_entries * sizeof(DUCKDB_PHYSICAL_TYPE));
auto dict_ptr = (DUCKDB_PHYSICAL_TYPE *)this->dict->ptr;
for (idx_t i = 0; i < num_entries; i++) {
dict_ptr[i] = FUNC(dictionary_data->read<PARQUET_PHYSICAL_TYPE>());
}
}
};
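// Example (illustrative sketch): CallbackColumnReader binds a plain conversion function at
// compile time. A hypothetical reader that widens Parquet INT32 values to int64_t could be
// declared as below; WidenInt32 and WideningInt32Reader are made-up names for the example,
// but the callback readers in this extension are instantiated the same way.
inline int64_t WidenInt32(const int32_t &input) {
	return static_cast<int64_t>(input);
}
using WideningInt32Reader = CallbackColumnReader<int32_t, int64_t, WidenInt32>;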
} // namespace duckdb

View File

@@ -0,0 +1,65 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/decimal_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
#include "parquet_reader.hpp"
#include "parquet_decimal_utils.hpp"
namespace duckdb {
template <class DUCKDB_PHYSICAL_TYPE, bool FIXED_LENGTH>
struct DecimalParquetValueConversion {
template <bool CHECKED>
static DUCKDB_PHYSICAL_TYPE PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
idx_t byte_len;
if (FIXED_LENGTH) {
byte_len = reader.Schema().type_length;
} else {
byte_len = plain_data.read<uint32_t>();
}
plain_data.available(byte_len);
auto res = ParquetDecimalUtils::ReadDecimalValue<DUCKDB_PHYSICAL_TYPE>(const_data_ptr_cast(plain_data.ptr),
byte_len, reader.Schema());
plain_data.inc(byte_len);
return res;
}
template <bool CHECKED>
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
uint32_t decimal_len = FIXED_LENGTH ? reader.Schema().type_length : plain_data.read<uint32_t>();
plain_data.inc(decimal_len);
}
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
return true;
}
static idx_t PlainConstantSize() {
return 0;
}
};
template <class DUCKDB_PHYSICAL_TYPE, bool FIXED_LENGTH>
class DecimalColumnReader
: public TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
DecimalParquetValueConversion<DUCKDB_PHYSICAL_TYPE, FIXED_LENGTH>> {
using BaseType =
TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE, DecimalParquetValueConversion<DUCKDB_PHYSICAL_TYPE, FIXED_LENGTH>>;
public:
DecimalColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
: TemplatedColumnReader<DUCKDB_PHYSICAL_TYPE,
DecimalParquetValueConversion<DUCKDB_PHYSICAL_TYPE, FIXED_LENGTH>>(reader, schema) {
}
};
} // namespace duckdb

View File

@@ -0,0 +1,56 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/expression_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "duckdb/execution/expression_executor.hpp"
namespace duckdb {
//! A column reader that executes an expression over a child reader
class ExpressionColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::INVALID;
public:
ExpressionColumnReader(ClientContext &context, unique_ptr<ColumnReader> child_reader, unique_ptr<Expression> expr,
const ParquetColumnSchema &schema);
ExpressionColumnReader(ClientContext &context, unique_ptr<ColumnReader> child_reader, unique_ptr<Expression> expr,
unique_ptr<ParquetColumnSchema> owned_schema);
unique_ptr<ColumnReader> child_reader;
DataChunk intermediate_chunk;
unique_ptr<Expression> expr;
ExpressionExecutor executor;
// If this reader was created on top of a child reader, after-the-fact, the schema needs to live somewhere
unique_ptr<ParquetColumnSchema> owned_schema;
public:
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override;
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) override;
void Skip(idx_t num_values) override;
idx_t GroupRowsAvailable() override;
uint64_t TotalCompressedSize() override {
return child_reader->TotalCompressedSize();
}
idx_t FileOffset() const override {
return child_reader->FileOffset();
}
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override {
child_reader->RegisterPrefetch(transport, allow_merge);
}
};
} // namespace duckdb

View File

@@ -0,0 +1,67 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/interval_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
#include "parquet_reader.hpp"
namespace duckdb {
//===--------------------------------------------------------------------===//
// Interval Column Reader
//===--------------------------------------------------------------------===//
struct IntervalValueConversion {
static constexpr const idx_t PARQUET_INTERVAL_SIZE = 12;
static interval_t ReadParquetInterval(const_data_ptr_t input) {
interval_t result;
result.months = Load<int32_t>(input);
result.days = Load<int32_t>(input + sizeof(uint32_t));
result.micros = int64_t(Load<uint32_t>(input + sizeof(uint32_t) * 2)) * 1000;
return result;
}
template <bool CHECKED>
static interval_t PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
if (CHECKED) {
plain_data.available(PARQUET_INTERVAL_SIZE);
}
auto res = ReadParquetInterval(const_data_ptr_cast(plain_data.ptr));
plain_data.unsafe_inc(PARQUET_INTERVAL_SIZE);
return res;
}
template <bool CHECKED>
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
if (CHECKED) {
plain_data.inc(PARQUET_INTERVAL_SIZE);
} else {
plain_data.unsafe_inc(PARQUET_INTERVAL_SIZE);
}
}
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
return plain_data.check_available(count * PARQUET_INTERVAL_SIZE);
}
static idx_t PlainConstantSize() {
return 0;
}
};
class IntervalColumnReader : public TemplatedColumnReader<interval_t, IntervalValueConversion> {
public:
IntervalColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
: TemplatedColumnReader<interval_t, IntervalValueConversion>(reader, schema) {
}
};
} // namespace duckdb

View File

@@ -0,0 +1,62 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/list_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
class ListColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::LIST;
public:
ListColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
unique_ptr<ColumnReader> child_column_reader_p);
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out) override;
void ApplyPendingSkips(data_ptr_t define_out, data_ptr_t repeat_out) override;
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override {
child_column_reader->InitializeRead(row_group_idx_p, columns, protocol_p);
}
idx_t GroupRowsAvailable() override {
return child_column_reader->GroupRowsAvailable() + overflow_child_count;
}
uint64_t TotalCompressedSize() override {
return child_column_reader->TotalCompressedSize();
}
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override {
child_column_reader->RegisterPrefetch(transport, allow_merge);
}
protected:
template <class OP>
idx_t ReadInternal(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out,
optional_ptr<Vector> result_out);
private:
unique_ptr<ColumnReader> child_column_reader;
ResizeableBuffer child_defines;
ResizeableBuffer child_repeats;
uint8_t *child_defines_ptr;
uint8_t *child_repeats_ptr;
VectorCache read_cache;
Vector read_vector;
idx_t overflow_child_count;
};
} // namespace duckdb

View File

@@ -0,0 +1,38 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/null_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "duckdb/common/helper.hpp"
namespace duckdb {
class NullColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::INVALID;
public:
NullColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema) : ColumnReader(reader, schema) {};
shared_ptr<ResizeableBuffer> dict;
public:
void Plain(ByteBuffer &plain_data, uint8_t *defines, uint64_t num_values, idx_t result_offset,
Vector &result) override {
(void)defines;
(void)plain_data;
auto &result_mask = FlatVector::Validity(result);
for (idx_t row_idx = 0; row_idx < num_values; row_idx++) {
result_mask.SetInvalid(row_idx + result_offset);
}
}
};
} // namespace duckdb

View File

@@ -0,0 +1,52 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/row_number_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb/common/limits.hpp"
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
//! Reads a file-absolute row number as a virtual column that's not actually stored in the file
class RowNumberColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::INT64;
public:
RowNumberColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema);
public:
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) override;
void Filter(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out,
const TableFilter &filter, TableFilterState &filter_state, SelectionVector &sel,
idx_t &approved_tuple_count, bool is_first_filter) override;
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override;
void Skip(idx_t num_values) override {
row_group_offset += num_values;
}
idx_t GroupRowsAvailable() override {
return NumericLimits<idx_t>::Maximum();
};
uint64_t TotalCompressedSize() override {
return 0;
}
idx_t FileOffset() const override {
return 0;
}
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override {
}
private:
idx_t row_group_offset;
};
} // namespace duckdb

View File

@@ -0,0 +1,91 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/string_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
class StringColumnReader : public ColumnReader {
enum class StringColumnType : uint8_t { VARCHAR, JSON, OTHER };
static StringColumnType GetStringColumnType(const LogicalType &type) {
if (type.IsJSONType()) {
return StringColumnType::JSON;
}
if (type.id() == LogicalTypeId::VARCHAR) {
return StringColumnType::VARCHAR;
}
return StringColumnType::OTHER;
}
public:
static constexpr const PhysicalType TYPE = PhysicalType::VARCHAR;
public:
StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema);
idx_t fixed_width_string_length;
const StringColumnType string_column_type;
public:
static void VerifyString(const char *str_data, uint32_t str_len, const bool isVarchar);
void VerifyString(const char *str_data, uint32_t str_len);
static void ReferenceBlock(Vector &result, shared_ptr<ResizeableBuffer> &block);
protected:
void Plain(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values, idx_t result_offset,
Vector &result) override {
throw NotImplementedException("StringColumnReader can only read plain data from a shared buffer");
}
void Plain(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values, idx_t result_offset,
Vector &result) override;
void PlainSkip(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values) override;
void PlainSelect(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values, Vector &result,
const SelectionVector &sel, idx_t count) override;
bool SupportsDirectFilter() const override {
return true;
}
bool SupportsDirectSelect() const override {
return true;
}
};
struct StringParquetValueConversion {
template <bool CHECKED>
static string_t PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
auto &scr = reader.Cast<StringColumnReader>();
uint32_t str_len =
scr.fixed_width_string_length == 0 ? plain_data.read<uint32_t>() : scr.fixed_width_string_length;
plain_data.available(str_len);
auto plain_str = char_ptr_cast(plain_data.ptr);
scr.VerifyString(plain_str, str_len);
auto ret_str = string_t(plain_str, str_len);
plain_data.inc(str_len);
return ret_str;
}
template <bool CHECKED>
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
auto &scr = reader.Cast<StringColumnReader>();
uint32_t str_len =
scr.fixed_width_string_length == 0 ? plain_data.read<uint32_t>() : scr.fixed_width_string_length;
plain_data.inc(str_len);
}
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
return false;
}
static idx_t PlainConstantSize() {
return 0;
}
};
} // namespace duckdb

View File

@@ -0,0 +1,39 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/struct_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
class StructColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::STRUCT;
public:
StructColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
vector<unique_ptr<ColumnReader>> child_readers_p);
vector<unique_ptr<ColumnReader>> child_readers;
public:
ColumnReader &GetChildReader(idx_t child_idx);
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override;
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) override;
void Skip(idx_t num_values) override;
idx_t GroupRowsAvailable() override;
uint64_t TotalCompressedSize() override;
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override;
};
} // namespace duckdb

View File

@@ -0,0 +1,110 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/templated_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "duckdb/common/helper.hpp"
namespace duckdb {
template <class VALUE_TYPE>
struct TemplatedParquetValueConversion {
template <bool CHECKED>
static VALUE_TYPE PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
if (CHECKED) {
return plain_data.read<VALUE_TYPE>();
} else {
return plain_data.unsafe_read<VALUE_TYPE>();
}
}
template <bool CHECKED>
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
if (CHECKED) {
plain_data.inc(sizeof(VALUE_TYPE));
} else {
plain_data.unsafe_inc(sizeof(VALUE_TYPE));
}
}
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
return plain_data.check_available(count * sizeof(VALUE_TYPE));
}
static idx_t PlainConstantSize() {
return sizeof(VALUE_TYPE);
}
};
template <class VALUE_TYPE, class VALUE_CONVERSION>
class TemplatedColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::INVALID;
public:
TemplatedColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema) : ColumnReader(reader, schema) {
}
shared_ptr<ResizeableBuffer> dict;
public:
void AllocateDict(idx_t size) {
if (!dict) {
dict = make_shared_ptr<ResizeableBuffer>(GetAllocator(), size);
} else {
dict->resize(GetAllocator(), size);
}
}
void Plain(ByteBuffer &plain_data, uint8_t *defines, uint64_t num_values, idx_t result_offset,
Vector &result) override {
PlainTemplated<VALUE_TYPE, VALUE_CONVERSION>(plain_data, defines, num_values, result_offset, result);
}
void PlainSkip(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values) override {
PlainSkipTemplated<VALUE_CONVERSION>(plain_data, defines, num_values);
}
bool SupportsDirectFilter() const override {
return true;
}
};
template <class PARQUET_PHYSICAL_TYPE, class DUCKDB_PHYSICAL_TYPE,
DUCKDB_PHYSICAL_TYPE (*FUNC)(const PARQUET_PHYSICAL_TYPE &input)>
struct CallbackParquetValueConversion {
template <bool CHECKED>
static DUCKDB_PHYSICAL_TYPE PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
if (CHECKED) {
return FUNC(plain_data.read<PARQUET_PHYSICAL_TYPE>());
} else {
return FUNC(plain_data.unsafe_read<PARQUET_PHYSICAL_TYPE>());
}
}
template <bool CHECKED>
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
if (CHECKED) {
plain_data.inc(sizeof(PARQUET_PHYSICAL_TYPE));
} else {
plain_data.unsafe_inc(sizeof(PARQUET_PHYSICAL_TYPE));
}
}
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
return plain_data.check_available(count * sizeof(PARQUET_PHYSICAL_TYPE));
}
static idx_t PlainConstantSize() {
return 0;
}
};
} // namespace duckdb
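
The CHECKED flag above is what lets a reader do one bounds check per batch instead of one per value. A minimal usage sketch (illustrative, not part of this commit; the helper name is made up) of driving the conversion policy directly, assuming the ByteBuffer API from resizable_buffer.hpp further down:

// Sketch only: batch-read int32 PLAIN values through TemplatedParquetValueConversion.
static void ReadPlainInt32s(ByteBuffer &plain_data, ColumnReader &reader, idx_t count, int32_t *out) {
	using Conversion = TemplatedParquetValueConversion<int32_t>;
	if (Conversion::PlainAvailable(plain_data, count)) {
		// one up-front bounds check, then unchecked reads in the hot loop
		for (idx_t i = 0; i < count; i++) {
			out[i] = Conversion::PlainRead<false>(plain_data, reader);
		}
	} else {
		// buffer may be too short - fall back to per-value checked reads
		for (idx_t i = 0; i < count; i++) {
			out[i] = Conversion::PlainRead<true>(plain_data, reader);
		}
	}
}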


@@ -0,0 +1,60 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/uuid_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "templated_column_reader.hpp"
#include "parquet_reader.hpp"
#include "duckdb/common/types/uuid.hpp"
namespace duckdb {
struct UUIDValueConversion {
static hugeint_t ReadParquetUUID(const_data_ptr_t input) {
// Use the utility function from BaseUUID
return BaseUUID::FromBlob(input);
}
template <bool CHECKED>
static hugeint_t PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
if (CHECKED) {
plain_data.available(sizeof(hugeint_t));
}
auto res = ReadParquetUUID(const_data_ptr_cast(plain_data.ptr));
plain_data.unsafe_inc(sizeof(hugeint_t));
return res;
}
template <bool CHECKED>
static void PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
if (CHECKED) {
plain_data.inc(sizeof(hugeint_t));
} else {
plain_data.unsafe_inc(sizeof(hugeint_t));
}
}
static bool PlainAvailable(const ByteBuffer &plain_data, const idx_t count) {
return plain_data.check_available(count * sizeof(hugeint_t));
}
static idx_t PlainConstantSize() {
return 0;
}
};
class UUIDColumnReader : public TemplatedColumnReader<hugeint_t, UUIDValueConversion> {
public:
UUIDColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
: TemplatedColumnReader<hugeint_t, UUIDValueConversion>(reader, schema) {
}
};
} // namespace duckdb


@@ -0,0 +1,148 @@
#pragma once
#include "duckdb/common/types/string_type.hpp"
#include "duckdb/common/types/value.hpp"
#include "reader/variant/variant_value.hpp"
using namespace duckdb_yyjson;
namespace duckdb {
//! ------------ Metadata ------------
struct VariantMetadataHeader {
public:
static VariantMetadataHeader FromHeaderByte(uint8_t byte);
public:
//! The version of the protocol used (only '1' supported for now)
uint8_t version;
//! Number of bytes per dictionary size and offset field
uint8_t offset_size;
//! Whether dictionary strings are sorted and unique
bool sorted_strings = false;
};
struct VariantMetadata {
public:
explicit VariantMetadata(const string_t &metadata);
public:
const string_t &metadata;
public:
VariantMetadataHeader header;
const_data_ptr_t offsets;
const_data_ptr_t bytes;
//! The json object keys have to be null-terminated
//! But we don't receive them null-terminated
vector<string> strings;
};
//! ------------ Value ------------
enum class VariantBasicType : uint8_t { PRIMITIVE = 0, SHORT_STRING = 1, OBJECT = 2, ARRAY = 3, INVALID };
enum class VariantPrimitiveType : uint8_t {
NULL_TYPE = 0,
BOOLEAN_TRUE = 1,
BOOLEAN_FALSE = 2,
INT8 = 3,
INT16 = 4,
INT32 = 5,
INT64 = 6,
DOUBLE = 7,
DECIMAL4 = 8,
DECIMAL8 = 9,
DECIMAL16 = 10,
DATE = 11,
TIMESTAMP_MICROS = 12,
TIMESTAMP_NTZ_MICROS = 13,
FLOAT = 14,
BINARY = 15,
STRING = 16,
TIME_NTZ_MICROS = 17,
TIMESTAMP_NANOS = 18,
TIMESTAMP_NTZ_NANOS = 19,
UUID = 20,
INVALID
};
struct VariantValueMetadata {
public:
VariantValueMetadata() {
}
public:
static VariantValueMetadata FromHeaderByte(uint8_t byte);
static VariantBasicType VariantBasicTypeFromByte(uint8_t byte) {
if (byte >= static_cast<uint8_t>(VariantBasicType::INVALID)) {
throw NotImplementedException("Variant BasicType (%d) is not supported", byte);
}
return static_cast<VariantBasicType>(byte);
}
static VariantPrimitiveType VariantPrimitiveTypeFromByte(uint8_t byte) {
if (byte >= static_cast<uint8_t>(VariantPrimitiveType::INVALID)) {
throw NotImplementedException("Variant PrimitiveType (%d) is not supported", byte);
}
return static_cast<VariantPrimitiveType>(byte);
}
public:
VariantBasicType basic_type;
public:
//! Primitive Type header
VariantPrimitiveType primitive_type;
public:
//! Short String header
uint8_t string_size;
public:
//! Object header | Array header
//! Size in bytes for each 'field_offset' entry
uint32_t field_offset_size;
//! Size in bytes for each 'field_id' entry
uint32_t field_id_size;
//! Whether the number of elements is encoded in 1 byte (false) or 4 bytes (true)
bool is_large;
};
struct VariantDecodeResult {
public:
VariantDecodeResult() = default;
~VariantDecodeResult() {
if (doc) {
yyjson_mut_doc_free(doc);
}
if (data) {
free(data);
}
}
public:
yyjson_mut_doc *doc = nullptr;
char *data = nullptr;
};
class VariantBinaryDecoder {
public:
VariantBinaryDecoder() = delete;
public:
static VariantValue Decode(const VariantMetadata &metadata, const_data_ptr_t data);
public:
static VariantValue PrimitiveTypeDecode(const VariantValueMetadata &value_metadata, const_data_ptr_t data);
static VariantValue ShortStringDecode(const VariantValueMetadata &value_metadata, const_data_ptr_t data);
static VariantValue ObjectDecode(const VariantMetadata &metadata, const VariantValueMetadata &value_metadata,
const_data_ptr_t data);
static VariantValue ArrayDecode(const VariantMetadata &metadata, const VariantValueMetadata &value_metadata,
const_data_ptr_t data);
};
} // namespace duckdb
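
FromHeaderByte for both headers is only declared here; the bit layout lives in the implementation. As a hedged reference, this is roughly what the decoding looks like under the Parquet Variant encoding spec (the shift/mask constants below are an assumption based on that spec, not copied from this commit):

// Sketch only: metadata and value header decoding per the Variant spec.
static VariantMetadataHeader DecodeMetadataHeader(uint8_t byte) {
	VariantMetadataHeader header;
	header.version = byte & 0x0F;                     // low 4 bits: version (only 1 is valid)
	header.sorted_strings = ((byte >> 4) & 0x1) != 0; // bit 4: dictionary is sorted and unique
	header.offset_size = ((byte >> 6) & 0x3) + 1;     // bits 6-7: offset width in bytes, minus one
	return header;
}

static VariantValueMetadata DecodeValueHeader(uint8_t byte) {
	VariantValueMetadata value;
	value.basic_type = VariantValueMetadata::VariantBasicTypeFromByte(byte & 0x3); // low 2 bits
	const uint8_t type_header = byte >> 2;                                         // remaining 6 bits
	switch (value.basic_type) {
	case VariantBasicType::PRIMITIVE:
		value.primitive_type = VariantValueMetadata::VariantPrimitiveTypeFromByte(type_header);
		break;
	case VariantBasicType::SHORT_STRING:
		value.string_size = type_header; // inline string of 0-63 bytes follows
		break;
	default:
		// OBJECT/ARRAY: field_offset_size, field_id_size and is_large are packed into type_header
		break;
	}
	return value;
}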


@@ -0,0 +1,24 @@
#pragma once
#include "reader/variant/variant_value.hpp"
#include "reader/variant/variant_binary_decoder.hpp"
namespace duckdb {
class VariantShreddedConversion {
public:
VariantShreddedConversion() = delete;
public:
static vector<VariantValue> Convert(Vector &metadata, Vector &group, idx_t offset, idx_t length, idx_t total_size,
bool is_field);
static vector<VariantValue> ConvertShreddedLeaf(Vector &metadata, Vector &value, Vector &typed_value, idx_t offset,
idx_t length, idx_t total_size, const bool is_field);
static vector<VariantValue> ConvertShreddedArray(Vector &metadata, Vector &value, Vector &typed_value, idx_t offset,
idx_t length, idx_t total_size, const bool is_field);
static vector<VariantValue> ConvertShreddedObject(Vector &metadata, Vector &value, Vector &typed_value,
idx_t offset, idx_t length, idx_t total_size,
const bool is_field);
};
} // namespace duckdb


@@ -0,0 +1,54 @@
#pragma once
#include "duckdb/common/map.hpp"
#include "duckdb/common/vector.hpp"
#include "duckdb/common/types/value.hpp"
#include "yyjson.hpp"
using namespace duckdb_yyjson;
namespace duckdb {
enum class VariantValueType : uint8_t { PRIMITIVE, OBJECT, ARRAY, MISSING };
struct VariantValue {
public:
VariantValue() : value_type(VariantValueType::MISSING) {
}
explicit VariantValue(VariantValueType type) : value_type(type) {
}
explicit VariantValue(Value &&val) : value_type(VariantValueType::PRIMITIVE), primitive_value(std::move(val)) {
}
// Delete copy constructor and copy assignment operator
VariantValue(const VariantValue &) = delete;
VariantValue &operator=(const VariantValue &) = delete;
// Default move constructor and move assignment operator
VariantValue(VariantValue &&) noexcept = default;
VariantValue &operator=(VariantValue &&) noexcept = default;
public:
bool IsNull() const {
return value_type == VariantValueType::PRIMITIVE && primitive_value.IsNull();
}
bool IsMissing() const {
return value_type == VariantValueType::MISSING;
}
public:
void AddChild(const string &key, VariantValue &&val);
void AddItem(VariantValue &&val);
public:
yyjson_mut_val *ToJSON(ClientContext &context, yyjson_mut_doc *doc) const;
public:
VariantValueType value_type;
//! FIXME: how can we get a deterministic child order for a partially shredded object?
map<string, VariantValue> object_children;
vector<VariantValue> array_items;
Value primitive_value;
};
} // namespace duckdb
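
VariantValue is a small move-only tree: objects keep their children in a map, arrays in a vector, and primitives in a duckdb Value. A short illustrative sketch (not part of the commit) of building a nested value with the declared API:

// Sketch: build {"id": 42, "tags": ["a", NULL]} as a VariantValue tree.
static VariantValue BuildExampleVariant() {
	VariantValue obj(VariantValueType::OBJECT);
	obj.AddChild("id", VariantValue(Value::INTEGER(42)));

	VariantValue tags(VariantValueType::ARRAY);
	tags.AddItem(VariantValue(Value("a")));
	tags.AddItem(VariantValue(Value())); // SQL NULL: IsNull() is true, IsMissing() is false
	obj.AddChild("tags", std::move(tags));
	return obj; // copy is deleted, so the tree is moved out
}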


@@ -0,0 +1,44 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// reader/variant_column_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_reader.hpp"
#include "reader/templated_column_reader.hpp"
namespace duckdb {
class VariantColumnReader : public ColumnReader {
public:
static constexpr const PhysicalType TYPE = PhysicalType::VARCHAR;
public:
VariantColumnReader(ClientContext &context, ParquetReader &reader, const ParquetColumnSchema &schema,
vector<unique_ptr<ColumnReader>> child_readers_p);
ClientContext &context;
vector<unique_ptr<ColumnReader>> child_readers;
public:
ColumnReader &GetChildReader(idx_t child_idx);
void InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) override;
idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) override;
void Skip(idx_t num_values) override;
idx_t GroupRowsAvailable() override;
uint64_t TotalCompressedSize() override;
void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) override;
protected:
idx_t metadata_reader_idx;
idx_t value_reader_idx;
};
} // namespace duckdb


@@ -0,0 +1,114 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// resizable_buffer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "duckdb/common/allocator.hpp"
#include <exception>
namespace duckdb {
class ByteBuffer { // on to the 10 thousandth impl
public:
ByteBuffer() {};
ByteBuffer(data_ptr_t ptr, uint64_t len) : ptr(ptr), len(len) {};
data_ptr_t ptr = nullptr;
uint64_t len = 0;
public:
void inc(const uint64_t increment) {
available(increment);
unsafe_inc(increment);
}
void unsafe_inc(const uint64_t increment) {
len -= increment;
ptr += increment;
}
template <class T>
T read() {
available(sizeof(T));
return unsafe_read<T>();
}
template <class T>
T unsafe_read() {
T val = unsafe_get<T>();
unsafe_inc(sizeof(T));
return val;
}
template <class T>
T get() {
available(sizeof(T));
return unsafe_get<T>();
}
template <class T>
T unsafe_get() {
return Load<T>(ptr);
}
void copy_to(char *dest, const uint64_t len) const {
available(len);
unsafe_copy_to(dest, len);
}
void unsafe_copy_to(char *dest, const uint64_t len) const {
std::memcpy(dest, ptr, len);
}
void zero() const {
std::memset(ptr, 0, len);
}
void available(const uint64_t req_len) const {
if (!check_available(req_len)) {
throw std::runtime_error("Out of buffer");
}
}
bool check_available(const uint64_t req_len) const {
return req_len <= len;
}
};
class ResizeableBuffer : public ByteBuffer {
public:
ResizeableBuffer() {
}
ResizeableBuffer(Allocator &allocator, const uint64_t new_size) {
resize(allocator, new_size);
}
void resize(Allocator &allocator, const uint64_t new_size) {
len = new_size;
if (new_size == 0) {
return;
}
if (new_size > alloc_len) {
alloc_len = NextPowerOfTwo(new_size);
allocated_data.Reset(); // Have to reset before allocating new buffer (otherwise we use ~2x the memory)
allocated_data = allocator.Allocate(alloc_len);
ptr = allocated_data.get();
}
}
void reset() {
ptr = allocated_data.get();
len = alloc_len;
}
private:
AllocatedData allocated_data;
idx_t alloc_len = 0;
};
} // namespace duckdb
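
ByteBuffer pairs every accessor with an unchecked variant, and ResizeableBuffer only reallocates when a resize crosses the current power-of-two capacity. A minimal sketch of the intended pattern (illustrative; assumes an Allocator instance), checking bounds once and then using the unsafe reads:

// Sketch: one bounds check up front, then unchecked reads that advance ptr/len.
static uint64_t SumUint32s(Allocator &allocator, const uint32_t *values, idx_t count) {
	ResizeableBuffer buffer(allocator, count * sizeof(uint32_t));
	std::memcpy(buffer.ptr, values, count * sizeof(uint32_t));

	uint64_t sum = 0;
	if (buffer.check_available(count * sizeof(uint32_t))) {
		for (idx_t i = 0; i < count; i++) {
			sum += buffer.unsafe_read<uint32_t>(); // advances ptr and shrinks len
		}
	}
	buffer.reset(); // ptr/len point at the full allocation again
	return sum;
}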


@@ -0,0 +1,228 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// thrift_tools.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include <list>
#include "thrift/protocol/TCompactProtocol.h"
#include "thrift/transport/TBufferTransports.h"
#include "duckdb.hpp"
#include "duckdb/storage/caching_file_system.hpp"
#include "duckdb/common/file_system.hpp"
#include "duckdb/common/allocator.hpp"
namespace duckdb {
// A ReadHead for prefetching data in a specific range
struct ReadHead {
ReadHead(idx_t location, uint64_t size) : location(location), size(size) {};
// Hint info
idx_t location;
uint64_t size;
// Current info
BufferHandle buffer_handle;
data_ptr_t buffer_ptr;
bool data_isset = false;
idx_t GetEnd() const {
return size + location;
}
};
// Comparator for ReadHeads that are either overlapping, adjacent, or within ALLOW_GAP bytes from each other
struct ReadHeadComparator {
static constexpr uint64_t ALLOW_GAP = 1 << 14; // 16 KiB
bool operator()(const ReadHead *a, const ReadHead *b) const {
auto a_start = a->location;
auto a_end = a->location + a->size;
auto b_start = b->location;
if (a_end <= NumericLimits<idx_t>::Maximum() - ALLOW_GAP) {
a_end += ALLOW_GAP;
}
return a_start < b_start && a_end < b_start;
}
};
// Two-step read ahead buffer
// 1: register all ranges that will be read, merging ranges that are consecutive
// 2: prefetch all registered ranges
struct ReadAheadBuffer {
explicit ReadAheadBuffer(CachingFileHandle &file_handle_p) : file_handle(file_handle_p) {
}
// The list of read heads
std::list<ReadHead> read_heads;
// Set for merging consecutive ranges
std::set<ReadHead *, ReadHeadComparator> merge_set;
CachingFileHandle &file_handle;
idx_t total_size = 0;
// Add a read head to the prefetching list
void AddReadHead(idx_t pos, uint64_t len, bool merge_buffers = true) {
// Attempt to merge with existing
if (merge_buffers) {
ReadHead new_read_head {pos, len};
auto lookup_set = merge_set.find(&new_read_head);
if (lookup_set != merge_set.end()) {
auto existing_head = *lookup_set;
auto new_start = MinValue<idx_t>(existing_head->location, new_read_head.location);
auto new_length = MaxValue<idx_t>(existing_head->GetEnd(), new_read_head.GetEnd()) - new_start;
existing_head->location = new_start;
existing_head->size = new_length;
return;
}
}
read_heads.emplace_front(ReadHead(pos, len));
total_size += len;
auto &read_head = read_heads.front();
if (merge_buffers) {
merge_set.insert(&read_head);
}
if (read_head.GetEnd() > file_handle.GetFileSize()) {
throw std::runtime_error("Prefetch registered for bytes outside file: " + file_handle.GetPath() +
", attempted range: [" + std::to_string(pos) + ", " +
std::to_string(read_head.GetEnd()) +
"), file size: " + std::to_string(file_handle.GetFileSize()));
}
}
// Returns the relevant read head
ReadHead *GetReadHead(idx_t pos) {
for (auto &read_head : read_heads) {
if (pos >= read_head.location && pos < read_head.GetEnd()) {
return &read_head;
}
}
return nullptr;
}
// Prefetch all read heads
void Prefetch() {
for (auto &read_head : read_heads) {
if (read_head.GetEnd() > file_handle.GetFileSize()) {
throw std::runtime_error("Prefetch registered requested for bytes outside file");
}
read_head.buffer_handle = file_handle.Read(read_head.buffer_ptr, read_head.size, read_head.location);
D_ASSERT(read_head.buffer_handle.IsValid());
read_head.data_isset = true;
}
}
};
class ThriftFileTransport : public duckdb_apache::thrift::transport::TVirtualTransport<ThriftFileTransport> {
public:
static constexpr uint64_t PREFETCH_FALLBACK_BUFFERSIZE = 1000000;
ThriftFileTransport(CachingFileHandle &file_handle_p, bool prefetch_mode_p)
: file_handle(file_handle_p), location(0), size(file_handle.GetFileSize()),
ra_buffer(ReadAheadBuffer(file_handle)), prefetch_mode(prefetch_mode_p) {
}
uint32_t read(uint8_t *buf, uint32_t len) {
auto prefetch_buffer = ra_buffer.GetReadHead(location);
if (prefetch_buffer != nullptr && location - prefetch_buffer->location + len <= prefetch_buffer->size) {
D_ASSERT(location - prefetch_buffer->location + len <= prefetch_buffer->size);
if (!prefetch_buffer->data_isset) {
prefetch_buffer->buffer_handle =
file_handle.Read(prefetch_buffer->buffer_ptr, prefetch_buffer->size, prefetch_buffer->location);
D_ASSERT(prefetch_buffer->buffer_handle.IsValid());
prefetch_buffer->data_isset = true;
}
D_ASSERT(prefetch_buffer->buffer_handle.IsValid());
memcpy(buf, prefetch_buffer->buffer_ptr + location - prefetch_buffer->location, len);
} else if (prefetch_mode && len < PREFETCH_FALLBACK_BUFFERSIZE && len > 0) {
Prefetch(location, MinValue<uint64_t>(PREFETCH_FALLBACK_BUFFERSIZE, file_handle.GetFileSize() - location));
auto prefetch_buffer_fallback = ra_buffer.GetReadHead(location);
D_ASSERT(location - prefetch_buffer_fallback->location + len <= prefetch_buffer_fallback->size);
memcpy(buf, prefetch_buffer_fallback->buffer_ptr + location - prefetch_buffer_fallback->location, len);
} else {
// No prefetch, do a regular (non-caching) read
file_handle.GetFileHandle().Read(context, buf, len, location);
}
location += len;
return len;
}
// Prefetch a single buffer
void Prefetch(idx_t pos, uint64_t len) {
RegisterPrefetch(pos, len, false);
FinalizeRegistration();
PrefetchRegistered();
}
// Register a buffer for prefetching
void RegisterPrefetch(idx_t pos, uint64_t len, bool can_merge = true) {
ra_buffer.AddReadHead(pos, len, can_merge);
}
// Prevents any further merges, should be called before PrefetchRegistered
void FinalizeRegistration() {
ra_buffer.merge_set.clear();
}
// Prefetch all previously registered ranges
void PrefetchRegistered() {
ra_buffer.Prefetch();
}
void ClearPrefetch() {
ra_buffer.read_heads.clear();
ra_buffer.merge_set.clear();
}
void Skip(idx_t skip_count) {
location += skip_count;
}
bool HasPrefetch() const {
return !ra_buffer.read_heads.empty() || !ra_buffer.merge_set.empty();
}
void SetLocation(idx_t location_p) {
location = location_p;
}
idx_t GetLocation() const {
return location;
}
optional_ptr<ReadHead> GetReadHead(idx_t pos) {
return ra_buffer.GetReadHead(pos);
}
idx_t GetSize() const {
return size;
}
private:
QueryContext context;
CachingFileHandle &file_handle;
idx_t location;
idx_t size;
// Multi-buffer prefetch
ReadAheadBuffer ra_buffer;
// Whether the prefetch mode is enabled. In this mode the DirectIO flag of the handle will be set and the parquet
// reader will manage the read buffering.
bool prefetch_mode;
};
} // namespace duckdb
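
The transport implements the two-step protocol described above: first register every byte range (overlapping or near-adjacent ranges, within the 16 KiB ALLOW_GAP, are merged into one read head), then issue the reads. An illustrative call sequence (the offsets and lengths are made up for the sketch):

// Sketch: two-step prefetch of two nearby ranges before Thrift deserialization.
static void PrefetchExample(ThriftFileTransport &transport) {
	// Step 1: register ranges; the second starts within ALLOW_GAP of the first,
	// so ReadAheadBuffer merges them into a single read head.
	transport.RegisterPrefetch(/*pos=*/4096, /*len=*/8192, /*can_merge=*/true);
	transport.RegisterPrefetch(/*pos=*/12800, /*len=*/4096, /*can_merge=*/true);

	// Step 2: stop merging, then fire the (merged) reads.
	transport.FinalizeRegistration();
	transport.PrefetchRegistered();

	// read() calls inside these ranges are now served from the prefetched buffer.
	transport.SetLocation(4096);
	uint8_t header[64];
	transport.read(header, sizeof(header));
}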


@@ -0,0 +1,34 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/array_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "writer/list_column_writer.hpp"
namespace duckdb {
class ArrayColumnWriter : public ListColumnWriter {
public:
ArrayColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
unique_ptr<ColumnWriter> child_writer_p, bool can_have_nulls)
: ListColumnWriter(writer, column_schema, std::move(schema_path_p), std::move(child_writer_p), can_have_nulls) {
}
~ArrayColumnWriter() override = default;
public:
void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) override;
void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) override;
void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
protected:
void WriteArrayState(ListColumnWriterState &state, idx_t array_size, uint16_t first_repeat_level,
idx_t define_value, const bool is_empty = false);
};
} // namespace duckdb


@@ -0,0 +1,33 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/boolean_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "writer/primitive_column_writer.hpp"
namespace duckdb {
class BooleanColumnWriter : public PrimitiveColumnWriter {
public:
BooleanColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
bool can_have_nulls);
~BooleanColumnWriter() override = default;
public:
unique_ptr<ColumnWriterStatistics> InitializeStatsState() override;
void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p, ColumnWriterPageState *state_p,
Vector &input_column, idx_t chunk_start, idx_t chunk_end) override;
unique_ptr<ColumnWriterPageState> InitializePageState(PrimitiveColumnWriterState &state, idx_t page_idx) override;
void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) override;
idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state) const override;
};
} // namespace duckdb


@@ -0,0 +1,30 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/decimal_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "writer/primitive_column_writer.hpp"
namespace duckdb {
class FixedDecimalColumnWriter : public PrimitiveColumnWriter {
public:
FixedDecimalColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
vector<string> schema_path_p, bool can_have_nulls);
~FixedDecimalColumnWriter() override = default;
public:
unique_ptr<ColumnWriterStatistics> InitializeStatsState() override;
void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p, ColumnWriterPageState *page_state,
Vector &input_column, idx_t chunk_start, idx_t chunk_end) override;
idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state) const override;
};
} // namespace duckdb


@@ -0,0 +1,50 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/enum_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "writer/primitive_column_writer.hpp"
namespace duckdb {
class EnumWriterPageState;
class EnumColumnWriter : public PrimitiveColumnWriter {
public:
EnumColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
bool can_have_nulls);
~EnumColumnWriter() override = default;
uint32_t bit_width;
public:
unique_ptr<ColumnWriterStatistics> InitializeStatsState() override;
void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p, ColumnWriterPageState *page_state_p,
Vector &input_column, idx_t chunk_start, idx_t chunk_end) override;
unique_ptr<ColumnWriterPageState> InitializePageState(PrimitiveColumnWriterState &state, idx_t page_idx) override;
void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) override;
duckdb_parquet::Encoding::type GetEncoding(PrimitiveColumnWriterState &state) override;
bool HasDictionary(PrimitiveColumnWriterState &state) override;
idx_t DictionarySize(PrimitiveColumnWriterState &state_p) override;
void FlushDictionary(PrimitiveColumnWriterState &state, ColumnWriterStatistics *stats_p) override;
idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state) const override;
private:
template <class T>
void WriteEnumInternal(WriteStream &temp_writer, Vector &input_column, idx_t chunk_start, idx_t chunk_end,
EnumWriterPageState &page_state);
};
} // namespace duckdb


@@ -0,0 +1,52 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/list_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_writer.hpp"
namespace duckdb {
class ListColumnWriterState : public ColumnWriterState {
public:
ListColumnWriterState(duckdb_parquet::RowGroup &row_group, idx_t col_idx) : row_group(row_group), col_idx(col_idx) {
}
~ListColumnWriterState() override = default;
duckdb_parquet::RowGroup &row_group;
idx_t col_idx;
unique_ptr<ColumnWriterState> child_state;
idx_t parent_index = 0;
};
class ListColumnWriter : public ColumnWriter {
public:
ListColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
unique_ptr<ColumnWriter> child_writer_p, bool can_have_nulls)
: ColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
child_writers.push_back(std::move(child_writer_p));
}
~ListColumnWriter() override = default;
public:
unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override;
bool HasAnalyze() override;
void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) override;
void FinalizeAnalyze(ColumnWriterState &state) override;
void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) override;
void BeginWrite(ColumnWriterState &state) override;
void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
void FinalizeWrite(ColumnWriterState &state) override;
protected:
ColumnWriter &GetChildWriter();
};
} // namespace duckdb


@@ -0,0 +1,326 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/parquet_write_operators.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "writer/parquet_write_stats.hpp"
#include "zstd/common/xxhash.hpp"
#include "duckdb/common/types/uhugeint.hpp"
#include "duckdb/common/types/uuid.hpp"
namespace duckdb {
struct BaseParquetOperator {
template <class SRC, class TGT>
static void WriteToStream(const TGT &input, WriteStream &ser) {
ser.WriteData(const_data_ptr_cast(&input), sizeof(TGT));
}
template <class SRC, class TGT>
static constexpr idx_t WriteSize(const TGT &input) {
return sizeof(TGT);
}
template <class SRC, class TGT>
static uint64_t XXHash64(const TGT &target_value) {
return duckdb_zstd::XXH64(&target_value, sizeof(target_value), 0);
}
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return nullptr;
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
}
template <class SRC, class TGT>
static idx_t GetRowSize(const Vector &, idx_t) {
return sizeof(TGT);
}
};
struct ParquetCastOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return TGT(input);
}
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<NumericStatisticsState<SRC, TGT, BaseParquetOperator>>();
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
auto &numeric_stats = stats->Cast<NumericStatisticsState<SRC, TGT, BaseParquetOperator>>();
if (LessThan::Operation(target_value, numeric_stats.min)) {
numeric_stats.min = target_value;
}
if (GreaterThan::Operation(target_value, numeric_stats.max)) {
numeric_stats.max = target_value;
}
}
};
struct FloatingPointOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return TGT(input);
}
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<FloatingPointStatisticsState<SRC, TGT, BaseParquetOperator>>();
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
auto &numeric_stats = stats->Cast<FloatingPointStatisticsState<SRC, TGT, BaseParquetOperator>>();
if (Value::IsNan(target_value)) {
numeric_stats.has_nan = true;
} else {
if (LessThan::Operation(target_value, numeric_stats.min)) {
numeric_stats.min = target_value;
}
if (GreaterThan::Operation(target_value, numeric_stats.max)) {
numeric_stats.max = target_value;
}
}
}
};
struct ParquetTimestampNSOperator : public ParquetCastOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return TGT(input);
}
};
struct ParquetTimestampSOperator : public ParquetCastOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return Timestamp::FromEpochSecondsPossiblyInfinite(input).value;
}
};
// We will need a different operator for GEOGRAPHY later, so we define a base geo operator
struct ParquetBaseGeoOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return input;
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
auto &geo_stats = stats->Cast<GeoStatisticsState>();
geo_stats.Update(target_value);
}
template <class SRC, class TGT>
static void WriteToStream(const TGT &target_value, WriteStream &ser) {
ser.Write<uint32_t>(target_value.GetSize());
ser.WriteData(const_data_ptr_cast(target_value.GetData()), target_value.GetSize());
}
template <class SRC, class TGT>
static idx_t WriteSize(const TGT &target_value) {
return sizeof(uint32_t) + target_value.GetSize();
}
template <class SRC, class TGT>
static uint64_t XXHash64(const TGT &target_value) {
return duckdb_zstd::XXH64(target_value.GetData(), target_value.GetSize(), 0);
}
template <class SRC, class TGT>
static idx_t GetRowSize(const Vector &vector, idx_t index) {
// This needs to add the 4 bytes (just like WriteSize) otherwise we underestimate and we have to realloc
// This seriously harms performance, mostly by making it very inconsistent (see internal issue #4990)
return sizeof(uint32_t) + FlatVector::GetData<string_t>(vector)[index].GetSize();
}
};
struct ParquetGeometryOperator : public ParquetBaseGeoOperator {
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<GeoStatisticsState>();
}
};
struct ParquetBaseStringOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return input;
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
auto &string_stats = stats->Cast<StringStatisticsState>();
string_stats.Update(target_value);
}
template <class SRC, class TGT>
static void WriteToStream(const TGT &target_value, WriteStream &ser) {
ser.Write<uint32_t>(target_value.GetSize());
ser.WriteData(const_data_ptr_cast(target_value.GetData()), target_value.GetSize());
}
template <class SRC, class TGT>
static idx_t WriteSize(const TGT &target_value) {
return sizeof(uint32_t) + target_value.GetSize();
}
template <class SRC, class TGT>
static uint64_t XXHash64(const TGT &target_value) {
return duckdb_zstd::XXH64(target_value.GetData(), target_value.GetSize(), 0);
}
template <class SRC, class TGT>
static idx_t GetRowSize(const Vector &vector, idx_t index) {
// This needs to add the 4 bytes (just like WriteSize) otherwise we underestimate and we have to realloc
// This seriously harms performance, mostly by making it very inconsistent (see internal issue #4990)
return sizeof(uint32_t) + FlatVector::GetData<string_t>(vector)[index].GetSize();
}
};
struct ParquetBlobOperator : public ParquetBaseStringOperator {
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<StringStatisticsState>(LogicalTypeId::BLOB);
}
};
struct ParquetStringOperator : public ParquetBaseStringOperator {
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<StringStatisticsState>();
}
};
struct ParquetIntervalTargetType {
static constexpr const idx_t PARQUET_INTERVAL_SIZE = 12;
data_t bytes[PARQUET_INTERVAL_SIZE];
};
struct ParquetIntervalOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
if (input.days < 0 || input.months < 0 || input.micros < 0) {
throw IOException("Parquet files do not support negative intervals");
}
TGT result;
Store<uint32_t>(input.months, result.bytes);
Store<uint32_t>(input.days, result.bytes + sizeof(uint32_t));
Store<uint32_t>(input.micros / 1000, result.bytes + sizeof(uint32_t) * 2);
return result;
}
template <class SRC, class TGT>
static void WriteToStream(const TGT &target_value, WriteStream &ser) {
ser.WriteData(target_value.bytes, ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE);
}
template <class SRC, class TGT>
static constexpr idx_t WriteSize(const TGT &target_value) {
return ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE;
}
template <class SRC, class TGT>
static uint64_t XXHash64(const TGT &target_value) {
return duckdb_zstd::XXH64(target_value.bytes, ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE, 0);
}
};
struct ParquetUUIDTargetType {
static constexpr const idx_t PARQUET_UUID_SIZE = 16;
data_t bytes[PARQUET_UUID_SIZE];
};
struct ParquetUUIDOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
TGT result;
// Use the utility function from BaseUUID
BaseUUID::ToBlob(input, result.bytes);
return result;
}
template <class SRC, class TGT>
static void WriteToStream(const TGT &target_value, WriteStream &ser) {
ser.WriteData(target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE);
}
template <class SRC, class TGT>
static constexpr idx_t WriteSize(const TGT &target_value) {
return ParquetUUIDTargetType::PARQUET_UUID_SIZE;
}
template <class SRC, class TGT>
static uint64_t XXHash64(const TGT &target_value) {
return duckdb_zstd::XXH64(target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE, 0);
}
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<UUIDStatisticsState>();
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats_p, TGT target_value) {
auto &stats = stats_p->Cast<UUIDStatisticsState>();
if (!stats.has_stats || memcmp(target_value.bytes, stats.min, ParquetUUIDTargetType::PARQUET_UUID_SIZE) < 0) {
memcpy(stats.min, target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE);
}
if (!stats.has_stats || memcmp(target_value.bytes, stats.max, ParquetUUIDTargetType::PARQUET_UUID_SIZE) > 0) {
memcpy(stats.max, target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE);
}
stats.has_stats = true;
}
};
struct ParquetTimeTZOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return input.time().micros;
}
};
struct ParquetHugeintOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return Hugeint::Cast<double>(input);
}
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<ColumnWriterStatistics>();
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
}
};
struct ParquetUhugeintOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return Uhugeint::Cast<double>(input);
}
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<ColumnWriterStatistics>();
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
}
};
} // namespace duckdb
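
ParquetIntervalOperator is the one operator above with a non-trivial byte layout: it packs months, days and milliseconds as three little-endian uint32 values into Parquet's fixed 12-byte INTERVAL. A small worked example (illustrative; the helper name is made up, and the byte comment assumes a little-endian target):

// Sketch: 1 month, 2 days, 3 seconds -> {1, 2, 3000} packed into 12 bytes.
static void IntervalWriteExample(WriteStream &ser) {
	interval_t value;
	value.months = 1;
	value.days = 2;
	value.micros = 3000000; // micros are stored as milliseconds (micros / 1000)

	auto target = ParquetIntervalOperator::Operation<interval_t, ParquetIntervalTargetType>(value);
	// target.bytes: 01 00 00 00 | 02 00 00 00 | B8 0B 00 00   (3000 == 0x0BB8)
	ParquetIntervalOperator::WriteToStream<interval_t, ParquetIntervalTargetType>(target, ser);
}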


@@ -0,0 +1,305 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/parquet_write_stats.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_writer.hpp"
#include "geo_parquet.hpp"
namespace duckdb {
class ColumnWriterStatistics {
public:
virtual ~ColumnWriterStatistics();
virtual bool HasStats();
virtual string GetMin();
virtual string GetMax();
virtual string GetMinValue();
virtual string GetMaxValue();
virtual bool CanHaveNaN();
virtual bool HasNaN();
virtual bool MinIsExact();
virtual bool MaxIsExact();
virtual bool HasGeoStats();
virtual optional_ptr<GeometryStatsData> GetGeoStats();
virtual void WriteGeoStats(duckdb_parquet::GeospatialStatistics &stats);
public:
template <class TARGET>
TARGET &Cast() {
DynamicCastCheck<TARGET>(this);
return reinterpret_cast<TARGET &>(*this);
}
template <class TARGET>
const TARGET &Cast() const {
D_ASSERT(dynamic_cast<const TARGET *>(this));
return reinterpret_cast<const TARGET &>(*this);
}
};
template <class SRC, class T, class OP>
class NumericStatisticsState : public ColumnWriterStatistics {
public:
NumericStatisticsState() : min(NumericLimits<T>::Maximum()), max(NumericLimits<T>::Minimum()) {
}
T min;
T max;
public:
bool HasStats() override {
return min <= max;
}
string GetMin() override {
return NumericLimits<SRC>::IsSigned() ? GetMinValue() : string();
}
string GetMax() override {
return NumericLimits<SRC>::IsSigned() ? GetMaxValue() : string();
}
string GetMinValue() override {
return HasStats() ? string(char_ptr_cast(&min), sizeof(T)) : string();
}
string GetMaxValue() override {
return HasStats() ? string(char_ptr_cast(&max), sizeof(T)) : string();
}
};
template <class SRC, class T, class OP>
class FloatingPointStatisticsState : public NumericStatisticsState<SRC, T, OP> {
public:
bool has_nan = false;
public:
bool CanHaveNaN() override {
return true;
}
bool HasNaN() override {
return has_nan;
}
};
class StringStatisticsState : public ColumnWriterStatistics {
static constexpr const idx_t MAX_STRING_STATISTICS_SIZE = 256;
public:
explicit StringStatisticsState(LogicalTypeId type = LogicalTypeId::VARCHAR)
: type(type), has_stats(false), min_truncated(false), max_truncated(false), min(), max() {
}
LogicalTypeId type;
bool has_stats;
bool min_truncated;
bool max_truncated;
bool failed_truncate = false;
string min;
string max;
public:
bool HasStats() override {
return has_stats;
}
void Update(const string_t &val) {
if (failed_truncate) {
return;
}
if (!has_stats || LessThan::Operation(val, string_t(min))) {
if (val.GetSize() > MAX_STRING_STATISTICS_SIZE) {
// string value exceeds our max string stats size - truncate
min = TruncateMin(val, MAX_STRING_STATISTICS_SIZE);
min_truncated = true;
} else {
min = val.GetString();
min_truncated = false;
}
}
if (!has_stats || GreaterThan::Operation(val, string_t(max))) {
if (val.GetSize() > MAX_STRING_STATISTICS_SIZE) {
// string value exceeds our max string stats size - truncate
if (!TryTruncateMax(val, MAX_STRING_STATISTICS_SIZE, max)) {
// we failed to truncate - this can happen in some edge cases
// skip stats for this column
failed_truncate = true;
has_stats = false;
min = string();
max = string();
return;
}
max_truncated = true;
} else {
max = val.GetString();
max_truncated = false;
}
}
has_stats = true;
}
static inline bool IsCharacter(char c) {
return (c & 0xc0) != 0x80;
}
string TruncateMin(string_t str, idx_t max_size) {
// truncate a string for the min value
// since 'AAA' < 'AAAA', we can just truncate the string
D_ASSERT(str.GetSize() > max_size);
if (type == LogicalTypeId::BLOB) {
// for blobs - just truncate directly
return string(str.GetData(), max_size);
}
D_ASSERT(type == LogicalTypeId::VARCHAR);
// for varchar we need to truncate to a valid UTF8 string - so we need to truncate to the last valid UTF8 byte
auto str_data = str.GetData();
for (; max_size > 0; max_size--) {
if (IsCharacter(str_data[max_size])) {
break;
}
}
return string(str_data, max_size);
}
bool TryTruncateMax(string_t str, idx_t max_size, string &result, data_t max_byte) {
auto data = const_data_ptr_cast(str.GetData());
// find the last position in the string which we can increment for the truncation
// if ALL characters are above the max byte we cannot truncate
idx_t increment_pos;
for (increment_pos = max_size; increment_pos > 0; increment_pos--) {
idx_t str_idx = increment_pos - 1;
if (data[str_idx] < max_byte) {
// found the increment position
break;
}
}
if (increment_pos == 0) {
// all characters are above the max byte - we cannot truncate - return false
return false;
}
// set up the result string - we don't care about anything after the increment pos
result = string(str.GetData(), increment_pos);
// actually increment
result[increment_pos - 1]++;
return true;
}
bool TryTruncateMax(string_t str, idx_t max_size, string &result) {
// truncate a string for the max value
// since 'XXX' < 'XXXX', we need to "increment" a byte to get a correct max value
// i.e. we need to generate 'XXY' as a string
// note that this is not necessarily always possible
D_ASSERT(str.GetSize() > max_size);
if (type == LogicalTypeId::BLOB) {
// for blobs we can always increment bytes - we just can't increment past the max of a single byte (2^8)
return TryTruncateMax(str, max_size, result, static_cast<data_t>(0xFF));
}
D_ASSERT(type == LogicalTypeId::VARCHAR);
// for varchar the situation is more complex - we need to truncate to a valid UTF8 string and increment
// for now we only increment ASCII characters (characters below 0x7F)
return TryTruncateMax(str, max_size, result, static_cast<data_t>(0x7F));
}
string GetMin() override {
return GetMinValue();
}
string GetMax() override {
return GetMaxValue();
}
string GetMinValue() override {
return HasStats() ? min : string();
}
string GetMaxValue() override {
return HasStats() ? max : string();
}
bool MinIsExact() override {
return !min_truncated;
}
bool MaxIsExact() override {
return !max_truncated;
}
};
class UUIDStatisticsState : public ColumnWriterStatistics {
public:
bool has_stats = false;
data_t min[16] = {0};
data_t max[16] = {0};
public:
bool HasStats() override {
return has_stats;
}
string GetMin() override {
return GetMinValue();
}
string GetMax() override {
return GetMaxValue();
}
string GetMinValue() override {
return HasStats() ? string(char_ptr_cast(min), 16) : string();
}
string GetMaxValue() override {
return HasStats() ? string(char_ptr_cast(max), 16) : string();
}
};
class GeoStatisticsState final : public ColumnWriterStatistics {
public:
explicit GeoStatisticsState() : has_stats(false) {
geo_stats.SetEmpty();
}
bool has_stats;
GeometryStatsData geo_stats;
public:
void Update(const string_t &val) {
geo_stats.Update(val);
has_stats = true;
}
bool HasGeoStats() override {
return has_stats;
}
optional_ptr<GeometryStatsData> GetGeoStats() override {
return geo_stats;
}
void WriteGeoStats(duckdb_parquet::GeospatialStatistics &stats) override {
const auto &types = geo_stats.types;
const auto &bbox = geo_stats.extent;
if (bbox.HasXY()) {
stats.__isset.bbox = true;
stats.bbox.xmin = bbox.x_min;
stats.bbox.xmax = bbox.x_max;
stats.bbox.ymin = bbox.y_min;
stats.bbox.ymax = bbox.y_max;
if (bbox.HasZ()) {
stats.bbox.__isset.zmin = true;
stats.bbox.__isset.zmax = true;
stats.bbox.zmin = bbox.z_min;
stats.bbox.zmax = bbox.z_max;
}
if (bbox.HasM()) {
stats.bbox.__isset.mmin = true;
stats.bbox.__isset.mmax = true;
stats.bbox.mmin = bbox.m_min;
stats.bbox.mmax = bbox.m_max;
}
}
stats.__isset.geospatial_types = true;
stats.geospatial_types = types.ToWKBList();
}
};
} // namespace duckdb
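
The string truncation rules above are easiest to see on a concrete value: a min bound can simply be cut, while a max bound has to be cut and have its last kept byte incremented so it still upper-bounds the original. A hedged sketch (illustrative; it calls the helpers directly with an 8-byte limit instead of the real 256-byte MAX_STRING_STATISTICS_SIZE):

// Sketch: min/max truncation for an over-long VARCHAR statistics value.
static void StringStatsTruncationExample() {
	StringStatisticsState stats; // VARCHAR rules: UTF-8 aware, ASCII-only max increments
	const string_t long_value("abcdefghij", 10);

	// Min just cuts: "abcdefgh" <= "abcdefghij", so it is a valid (inexact) lower bound.
	auto min_bound = stats.TruncateMin(long_value, 8); // "abcdefgh"

	// Max cuts and bumps the last kept byte: "abcdefgi" > "abcdefghij".
	string max_bound;
	const bool ok = stats.TryTruncateMax(long_value, 8, max_bound); // true, "abcdefgi"
	(void)min_bound;
	(void)max_bound;
	(void)ok;
}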


@@ -0,0 +1,115 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/primitive_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_writer.hpp"
#include "writer/parquet_write_stats.hpp"
#include "duckdb/common/serializer/memory_stream.hpp"
#include "parquet_statistics.hpp"
namespace duckdb {
struct PageInformation {
idx_t offset = 0;
idx_t row_count = 0;
idx_t empty_count = 0;
idx_t estimated_page_size = 0;
idx_t null_count = 0;
};
struct PageWriteInformation {
duckdb_parquet::PageHeader page_header;
unique_ptr<MemoryStream> temp_writer;
unique_ptr<ColumnWriterPageState> page_state;
idx_t write_page_idx = 0;
idx_t write_count = 0;
idx_t max_write_count = 0;
size_t compressed_size;
data_ptr_t compressed_data;
AllocatedData compressed_buf;
};
class PrimitiveColumnWriterState : public ColumnWriterState {
public:
PrimitiveColumnWriterState(ParquetWriter &writer_p, duckdb_parquet::RowGroup &row_group, idx_t col_idx)
: writer(writer_p), row_group(row_group), col_idx(col_idx) {
page_info.emplace_back();
}
~PrimitiveColumnWriterState() override = default;
ParquetWriter &writer;
duckdb_parquet::RowGroup &row_group;
idx_t col_idx;
vector<PageInformation> page_info;
vector<PageWriteInformation> write_info;
unique_ptr<ColumnWriterStatistics> stats_state;
idx_t current_page = 0;
unique_ptr<ParquetBloomFilter> bloom_filter;
};
//! Base class for writing non-compound types (ex. numerics, strings)
class PrimitiveColumnWriter : public ColumnWriter {
public:
PrimitiveColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path,
bool can_have_nulls);
~PrimitiveColumnWriter() override = default;
//! We limit the uncompressed page size to 100MB
//! The max size in Parquet is 2GB, but we choose a more conservative limit
static constexpr const idx_t MAX_UNCOMPRESSED_PAGE_SIZE = 104857600ULL;
//! Dictionary pages must be below 2GB. Unlike data pages, there's only one dictionary page.
//! For this reason we go with a much higher, but still conservative, upper bound of 1GB.
static constexpr const idx_t MAX_UNCOMPRESSED_DICT_PAGE_SIZE = 1073741824ULL;
public:
unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override;
void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) override;
void BeginWrite(ColumnWriterState &state) override;
void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
void FinalizeWrite(ColumnWriterState &state) override;
protected:
static void WriteLevels(Allocator &allocator, WriteStream &temp_writer, const unsafe_vector<uint16_t> &levels,
idx_t max_value, idx_t start_offset, idx_t count, optional_idx null_count = optional_idx());
virtual duckdb_parquet::Encoding::type GetEncoding(PrimitiveColumnWriterState &state);
void NextPage(PrimitiveColumnWriterState &state);
void FlushPage(PrimitiveColumnWriterState &state);
//! Initializes the state used to track statistics during writing. Only used for scalar types.
virtual unique_ptr<ColumnWriterStatistics> InitializeStatsState();
//! Initialize the writer for a specific page. Only used for scalar types.
virtual unique_ptr<ColumnWriterPageState> InitializePageState(PrimitiveColumnWriterState &state, idx_t page_idx);
//! Flushes the writer for a specific page. Only used for scalar types.
virtual void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state);
//! Retrieves the row size of a vector at the specified location. Only used for scalar types.
virtual idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state) const;
//! Writes a (subset of a) vector to the specified serializer. Only used for scalar types.
virtual void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats, ColumnWriterPageState *page_state,
Vector &vector, idx_t chunk_start, idx_t chunk_end) = 0;
virtual bool HasDictionary(PrimitiveColumnWriterState &state_p) {
return false;
}
//! The number of elements in the dictionary
virtual idx_t DictionarySize(PrimitiveColumnWriterState &state_p);
void WriteDictionary(PrimitiveColumnWriterState &state, unique_ptr<MemoryStream> temp_writer, idx_t row_count);
virtual void FlushDictionary(PrimitiveColumnWriterState &state, ColumnWriterStatistics *stats);
void SetParquetStatistics(PrimitiveColumnWriterState &state, duckdb_parquet::ColumnChunk &column);
void RegisterToRowGroup(duckdb_parquet::RowGroup &row_group);
};
} // namespace duckdb
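
PrimitiveColumnWriter and the compound writers above all implement the same lifecycle. The call order below is inferred from the method names and the analyze/prepare/write split, so treat it as a hedged sketch of how a ColumnWriter is driven for one column of a row group, not as the actual driver code:

// Sketch (assumed call order): drive a ColumnWriter for a single column chunk.
static void DriveColumnWriter(ColumnWriter &writer, duckdb_parquet::RowGroup &row_group, Vector &column_data,
                              idx_t count) {
	auto state = writer.InitializeWriteState(row_group);
	if (writer.HasAnalyze()) {
		writer.Analyze(*state, nullptr, column_data, count); // e.g. dictionary/encoding selection
		writer.FinalizeAnalyze(*state);
	}
	writer.Prepare(*state, nullptr, column_data, count, false); // definition/repetition levels, page layout
	writer.BeginWrite(*state);
	writer.Write(*state, column_data, count);                   // serialize the values into pages
	writer.FinalizeWrite(*state);                               // flush pages, dictionary and statistics
}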


@@ -0,0 +1,37 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/struct_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_writer.hpp"
namespace duckdb {
class StructColumnWriter : public ColumnWriter {
public:
StructColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
vector<unique_ptr<ColumnWriter>> child_writers_p, bool can_have_nulls)
: ColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
child_writers = std::move(child_writers_p);
}
~StructColumnWriter() override = default;
public:
unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override;
bool HasAnalyze() override;
void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) override;
void FinalizeAnalyze(ColumnWriterState &state) override;
void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) override;
void BeginWrite(ColumnWriterState &state) override;
void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
void FinalizeWrite(ColumnWriterState &state) override;
};
} // namespace duckdb


@@ -0,0 +1,444 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/templated_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "writer/primitive_column_writer.hpp"
#include "writer/parquet_write_operators.hpp"
#include "parquet_dbp_encoder.hpp"
#include "parquet_dlba_encoder.hpp"
#include "parquet_rle_bp_encoder.hpp"
#include "duckdb/common/primitive_dictionary.hpp"
namespace duckdb {
template <class SRC, class TGT, class OP = ParquetCastOperator, bool ALL_VALID>
static void TemplatedWritePlain(Vector &col, ColumnWriterStatistics *stats, const idx_t chunk_start,
const idx_t chunk_end, const ValidityMask &mask, WriteStream &ser) {
static constexpr bool COPY_DIRECTLY_FROM_VECTOR = ALL_VALID && std::is_same<SRC, TGT>::value &&
std::is_arithmetic<TGT>::value &&
std::is_same<OP, ParquetCastOperator>::value;
const auto *const ptr = FlatVector::GetData<SRC>(col);
TGT local_write[STANDARD_VECTOR_SIZE];
idx_t local_write_count = 0;
for (idx_t r = chunk_start; r < chunk_end; r++) {
if (!ALL_VALID && !mask.RowIsValid(r)) {
continue;
}
TGT target_value = OP::template Operation<SRC, TGT>(ptr[r]);
OP::template HandleStats<SRC, TGT>(stats, target_value);
if (COPY_DIRECTLY_FROM_VECTOR) {
continue;
}
if (std::is_arithmetic<TGT>::value) {
local_write[local_write_count++] = target_value;
if (local_write_count == STANDARD_VECTOR_SIZE) {
ser.WriteData(data_ptr_cast(local_write), local_write_count * sizeof(TGT));
local_write_count = 0;
}
} else {
OP::template WriteToStream<SRC, TGT>(target_value, ser);
}
}
if (COPY_DIRECTLY_FROM_VECTOR) {
ser.WriteData(const_data_ptr_cast(&ptr[chunk_start]), (chunk_end - chunk_start) * sizeof(TGT));
return;
}
if (std::is_arithmetic<TGT>::value) {
ser.WriteData(data_ptr_cast(local_write), local_write_count * sizeof(TGT));
}
// Else we already wrote to stream
}
template <class SRC, class TGT, class OP>
class StandardColumnWriterState : public PrimitiveColumnWriterState {
public:
StandardColumnWriterState(ParquetWriter &writer, duckdb_parquet::RowGroup &row_group, idx_t col_idx)
: PrimitiveColumnWriterState(writer, row_group, col_idx),
dictionary(BufferAllocator::Get(writer.GetContext()),
writer.DictionarySizeLimit().IsValid() ? writer.DictionarySizeLimit().GetIndex()
: NumericCast<idx_t>(row_group.num_rows) / 5,
writer.StringDictionaryPageSizeLimit()),
encoding(duckdb_parquet::Encoding::PLAIN) {
}
~StandardColumnWriterState() override = default;
// analysis state for integer values for DELTA_BINARY_PACKED/DELTA_LENGTH_BYTE_ARRAY
idx_t total_value_count = 0;
idx_t total_string_size = 0;
uint32_t key_bit_width = 0;
PrimitiveDictionary<SRC, TGT, OP> dictionary;
duckdb_parquet::Encoding::type encoding;
};
template <class SRC, class TGT, class OP>
class StandardWriterPageState : public ColumnWriterPageState {
public:
explicit StandardWriterPageState(const idx_t total_value_count, const idx_t total_string_size,
duckdb_parquet::Encoding::type encoding_p,
const PrimitiveDictionary<SRC, TGT, OP> &dictionary_p)
: encoding(encoding_p), dbp_initialized(false), dbp_encoder(total_value_count), dlba_initialized(false),
dlba_encoder(total_value_count, total_string_size), bss_initialized(false),
bss_encoder(total_value_count, sizeof(TGT)), dictionary(dictionary_p), dict_written_value(false),
dict_bit_width(RleBpDecoder::ComputeBitWidth(dictionary.GetSize())), dict_encoder(dict_bit_width) {
}
duckdb_parquet::Encoding::type encoding;
bool dbp_initialized;
DbpEncoder dbp_encoder;
bool dlba_initialized;
DlbaEncoder dlba_encoder;
bool bss_initialized;
BssEncoder bss_encoder;
const PrimitiveDictionary<SRC, TGT, OP> &dictionary;
bool dict_written_value;
uint32_t dict_bit_width;
RleBpEncoder dict_encoder;
};
template <class SRC, class TGT, class OP = ParquetCastOperator>
class StandardColumnWriter : public PrimitiveColumnWriter {
public:
StandardColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
vector<string> schema_path_p, // NOLINT
bool can_have_nulls)
: PrimitiveColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
}
~StandardColumnWriter() override = default;
public:
unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override {
auto result = make_uniq<StandardColumnWriterState<SRC, TGT, OP>>(writer, row_group, row_group.columns.size());
result->encoding = duckdb_parquet::Encoding::RLE_DICTIONARY;
RegisterToRowGroup(row_group);
return std::move(result);
}
unique_ptr<ColumnWriterPageState> InitializePageState(PrimitiveColumnWriterState &state_p,
idx_t page_idx) override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
const auto &page_info = state_p.page_info[page_idx];
auto result = make_uniq<StandardWriterPageState<SRC, TGT, OP>>(
page_info.row_count - (page_info.empty_count + page_info.null_count), state.total_string_size,
state.encoding, state.dictionary);
return std::move(result);
}
void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) override {
auto &page_state = state_p->Cast<StandardWriterPageState<SRC, TGT, OP>>();
switch (page_state.encoding) {
case duckdb_parquet::Encoding::DELTA_BINARY_PACKED:
if (!page_state.dbp_initialized) {
page_state.dbp_encoder.BeginWrite(temp_writer, 0);
}
page_state.dbp_encoder.FinishWrite(temp_writer);
break;
case duckdb_parquet::Encoding::RLE_DICTIONARY:
D_ASSERT(page_state.dict_bit_width != 0);
if (!page_state.dict_written_value) {
// all values are null
// just write the bit width
temp_writer.Write<uint8_t>(page_state.dict_bit_width);
return;
}
page_state.dict_encoder.FinishWrite(temp_writer);
break;
case duckdb_parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY:
if (!page_state.dlba_initialized) {
page_state.dlba_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()), temp_writer,
string_t(""));
}
page_state.dlba_encoder.FinishWrite(temp_writer);
break;
case duckdb_parquet::Encoding::BYTE_STREAM_SPLIT:
if (!page_state.bss_initialized) {
page_state.bss_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()));
}
page_state.bss_encoder.FinishWrite(temp_writer);
break;
case duckdb_parquet::Encoding::PLAIN:
break;
default:
throw InternalException("Unknown encoding");
}
}
duckdb_parquet::Encoding::type GetEncoding(PrimitiveColumnWriterState &state_p) override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
return state.encoding;
}
bool HasAnalyze() override {
return true;
}
void Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
auto data_ptr = FlatVector::GetData<SRC>(vector);
idx_t vector_index = 0;
const bool check_parent_empty = parent && !parent->is_empty.empty();
const idx_t parent_index = state.definition_levels.size();
D_ASSERT(!check_parent_empty || parent_index < parent->is_empty.size());
const idx_t vcount =
check_parent_empty ? parent->definition_levels.size() - state.definition_levels.size() : count;
const auto &validity = FlatVector::Validity(vector);
if (!check_parent_empty && validity.AllValid()) {
// Fast path
for (; vector_index < vcount; vector_index++) {
const auto &src_value = data_ptr[vector_index];
state.dictionary.Insert(src_value);
state.total_value_count++;
state.total_string_size += DlbaEncoder::GetStringSize(src_value);
}
} else {
for (idx_t i = 0; i < vcount; i++) {
if (check_parent_empty && parent->is_empty[parent_index + i]) {
continue;
}
if (validity.RowIsValid(vector_index)) {
const auto &src_value = data_ptr[vector_index];
state.dictionary.Insert(src_value);
state.total_value_count++;
state.total_string_size += DlbaEncoder::GetStringSize(src_value);
}
vector_index++;
}
}
}
void FinalizeAnalyze(ColumnWriterState &state_p) override {
const auto type = writer.GetType(SchemaIndex());
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
if (state.dictionary.GetSize() == 0 || state.dictionary.IsFull()) {
state.dictionary.Reset();
if (writer.GetParquetVersion() == ParquetVersion::V1) {
// Can't do the cool stuff for V1
state.encoding = duckdb_parquet::Encoding::PLAIN;
} else {
// If we aren't doing dictionary encoding, these encodings are virtually always better than PLAIN
switch (type) {
case duckdb_parquet::Type::type::INT32:
case duckdb_parquet::Type::type::INT64:
state.encoding = duckdb_parquet::Encoding::DELTA_BINARY_PACKED;
break;
case duckdb_parquet::Type::type::BYTE_ARRAY:
state.encoding = duckdb_parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY;
break;
case duckdb_parquet::Type::type::FLOAT:
case duckdb_parquet::Type::type::DOUBLE:
state.encoding = duckdb_parquet::Encoding::BYTE_STREAM_SPLIT;
break;
default:
state.encoding = duckdb_parquet::Encoding::PLAIN;
}
}
} else {
state.key_bit_width = RleBpDecoder::ComputeBitWidth(state.dictionary.GetSize());
}
}
unique_ptr<ColumnWriterStatistics> InitializeStatsState() override {
return OP::template InitializeStats<SRC, TGT>();
}
bool HasDictionary(PrimitiveColumnWriterState &state_p) override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
return state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY;
}
idx_t DictionarySize(PrimitiveColumnWriterState &state_p) override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
return state.dictionary.GetSize();
}
void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats, ColumnWriterPageState *page_state_p,
Vector &input_column, idx_t chunk_start, idx_t chunk_end) override {
const auto &mask = FlatVector::Validity(input_column);
if (mask.AllValid()) {
WriteVectorInternal<true>(temp_writer, stats, page_state_p, input_column, chunk_start, chunk_end);
} else {
WriteVectorInternal<false>(temp_writer, stats, page_state_p, input_column, chunk_start, chunk_end);
}
}
void FlushDictionary(PrimitiveColumnWriterState &state_p, ColumnWriterStatistics *stats) override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY);
if (writer.EnableBloomFilters()) {
state.bloom_filter =
make_uniq<ParquetBloomFilter>(state.dictionary.GetSize(), writer.BloomFilterFalsePositiveRatio());
}
state.dictionary.IterateValues([&](const SRC &src_value, const TGT &tgt_value) {
// update the statistics
OP::template HandleStats<SRC, TGT>(stats, tgt_value);
if (state.bloom_filter) {
// update the bloom filter
auto hash = OP::template XXHash64<SRC, TGT>(tgt_value);
state.bloom_filter->FilterInsert(hash);
}
});
// flush the dictionary page and add it to the to-be-written pages
WriteDictionary(state, state.dictionary.GetTargetMemoryStream(), state.dictionary.GetSize());
// bloom filter will be queued for writing in ParquetWriter::BufferBloomFilter one level up
}
idx_t GetRowSize(const Vector &vector, const idx_t index,
const PrimitiveColumnWriterState &state_p) const override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY) {
return (state.key_bit_width + 7) / 8;
} else {
return OP::template GetRowSize<SRC, TGT>(vector, index);
}
}
private:
template <bool ALL_VALID>
void WriteVectorInternal(WriteStream &temp_writer, ColumnWriterStatistics *stats,
ColumnWriterPageState *page_state_p, Vector &input_column, idx_t chunk_start,
idx_t chunk_end) {
auto &page_state = page_state_p->Cast<StandardWriterPageState<SRC, TGT, OP>>();
const auto &mask = FlatVector::Validity(input_column);
const auto *data_ptr = FlatVector::GetData<SRC>(input_column);
switch (page_state.encoding) {
case duckdb_parquet::Encoding::RLE_DICTIONARY: {
idx_t r = chunk_start;
if (!page_state.dict_written_value) {
// find first non-null value
for (; r < chunk_end; r++) {
if (!mask.RowIsValid(r)) {
continue;
}
// write the bit-width as a one-byte entry and initialize writer
temp_writer.Write<uint8_t>(page_state.dict_bit_width);
page_state.dict_encoder.BeginWrite();
page_state.dict_written_value = true;
break;
}
}
for (; r < chunk_end; r++) {
if (!ALL_VALID && !mask.RowIsValid(r)) {
continue;
}
const auto &src_value = data_ptr[r];
const auto value_index = page_state.dictionary.GetIndex(src_value);
page_state.dict_encoder.WriteValue(temp_writer, value_index);
}
break;
}
case duckdb_parquet::Encoding::DELTA_BINARY_PACKED: {
idx_t r = chunk_start;
if (!page_state.dbp_initialized) {
// find first non-null value
for (; r < chunk_end; r++) {
if (!mask.RowIsValid(r)) {
continue;
}
const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
OP::template HandleStats<SRC, TGT>(stats, target_value);
page_state.dbp_encoder.BeginWrite(temp_writer, target_value);
page_state.dbp_initialized = true;
r++; // skip over
break;
}
}
for (; r < chunk_end; r++) {
if (!ALL_VALID && !mask.RowIsValid(r)) {
continue;
}
const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
OP::template HandleStats<SRC, TGT>(stats, target_value);
page_state.dbp_encoder.WriteValue(temp_writer, target_value);
}
break;
}
case duckdb_parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY: {
idx_t r = chunk_start;
if (!page_state.dlba_initialized) {
// find first non-null value
for (; r < chunk_end; r++) {
if (!mask.RowIsValid(r)) {
continue;
}
const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
OP::template HandleStats<SRC, TGT>(stats, target_value);
page_state.dlba_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()), temp_writer,
target_value);
page_state.dlba_initialized = true;
r++; // skip over
break;
}
}
for (; r < chunk_end; r++) {
if (!ALL_VALID && !mask.RowIsValid(r)) {
continue;
}
const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
OP::template HandleStats<SRC, TGT>(stats, target_value);
page_state.dlba_encoder.WriteValue(temp_writer, target_value);
}
break;
}
case duckdb_parquet::Encoding::BYTE_STREAM_SPLIT: {
if (!page_state.bss_initialized) {
page_state.bss_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()));
page_state.bss_initialized = true;
}
for (idx_t r = chunk_start; r < chunk_end; r++) {
if (!ALL_VALID && !mask.RowIsValid(r)) {
continue;
}
const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
OP::template HandleStats<SRC, TGT>(stats, target_value);
page_state.bss_encoder.WriteValue(target_value);
}
break;
}
case duckdb_parquet::Encoding::PLAIN: {
D_ASSERT(page_state.encoding == duckdb_parquet::Encoding::PLAIN);
if (mask.AllValid()) {
TemplatedWritePlain<SRC, TGT, OP, true>(input_column, stats, chunk_start, chunk_end, mask, temp_writer);
} else {
TemplatedWritePlain<SRC, TGT, OP, false>(input_column, stats, chunk_start, chunk_end, mask,
temp_writer);
}
break;
}
default:
throw InternalException("Unknown encoding");
}
}
};
} // namespace duckdb
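Editorial note on the dictionary path above: GetRowSize charges (key_bit_width + 7) / 8 bytes per row, where key_bit_width is derived from the dictionary size via RleBpDecoder::ComputeBitWidth. The following is a minimal standalone sketch of that sizing rule only; the helper name is made up and it is not the extension's actual ComputeBitWidth implementation.

#include <cstdint>
#include <cstdio>

// Illustrative helper: smallest bit width whose range covers `dict_size` dictionary keys.
static uint32_t ApproxKeyBitWidth(uint64_t dict_size) {
	uint32_t bits = 0;
	while ((1ULL << bits) < dict_size) {
		bits++;
	}
	return bits;
}

int main() {
	// A dictionary with 1000 distinct values needs 10-bit keys,
	// so the per-row size estimate is (10 + 7) / 8 = 2 bytes.
	const uint32_t bits = ApproxKeyBitWidth(1000);
	std::printf("bit width: %u, estimated bytes per row: %u\n", bits, (bits + 7) / 8);
	return 0;
}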

View File

@@ -0,0 +1,30 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/variant_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "struct_column_writer.hpp"
#include "duckdb/planner/expression/bound_function_expression.hpp"
namespace duckdb {
class VariantColumnWriter : public StructColumnWriter {
public:
VariantColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
vector<unique_ptr<ColumnWriter>> child_writers_p, bool can_have_nulls)
: StructColumnWriter(writer, column_schema, std::move(schema_path_p), std::move(child_writers_p),
can_have_nulls) {
}
~VariantColumnWriter() override = default;
public:
static ScalarFunction GetTransformFunction();
static LogicalType TransformTypedValueRecursive(const LogicalType &type);
};
} // namespace duckdb

View File

@@ -0,0 +1,33 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// zstd_file_system.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "duckdb/common/compressed_file_system.hpp"
namespace duckdb {
class ZStdFileSystem : public CompressedFileSystem {
public:
unique_ptr<FileHandle> OpenCompressedFile(QueryContext context, unique_ptr<FileHandle> handle, bool write) override;
std::string GetName() const override {
return "ZStdFileSystem";
}
unique_ptr<StreamWrapper> CreateStream() override;
idx_t InBufferSize() override;
idx_t OutBufferSize() override;
static int64_t DefaultCompressionLevel();
static int64_t MinimumCompressionLevel();
static int64_t MaximumCompressionLevel();
};
} // namespace duckdb

View File

@@ -0,0 +1,84 @@
import os
# list all include directories
include_directories = [
os.path.sep.join(x.split('/'))
for x in [
'extension/parquet/include',
'third_party/parquet',
'third_party/thrift',
'third_party/lz4',
'third_party/brotli/include',
'third_party/brotli/common',
'third_party/brotli/dec',
'third_party/brotli/enc',
'third_party/snappy',
'third_party/mbedtls',
'third_party/mbedtls/include',
'third_party/zstd/include',
]
]
prefix = os.path.join('extension', 'parquet')
def list_files_recursive(rootdir, suffix):
file_list = []
for root, _, files in os.walk(rootdir):
file_list += [os.path.join(root, f) for f in files if f.endswith(suffix)]
return file_list
source_files = list_files_recursive(prefix, '.cpp')
# parquet/thrift/snappy
source_files += [
os.path.sep.join(x.split('/'))
for x in [
'third_party/parquet/parquet_types.cpp',
'third_party/thrift/thrift/protocol/TProtocol.cpp',
'third_party/thrift/thrift/transport/TTransportException.cpp',
'third_party/thrift/thrift/transport/TBufferTransports.cpp',
'third_party/snappy/snappy.cc',
'third_party/snappy/snappy-sinksource.cc',
]
]
# lz4
source_files += [os.path.sep.join(x.split('/')) for x in ['third_party/lz4/lz4.cpp']]
# brotli
source_files += [
os.path.sep.join(x.split('/'))
for x in [
'third_party/brotli/common/constants.cpp',
'third_party/brotli/common/context.cpp',
'third_party/brotli/common/dictionary.cpp',
'third_party/brotli/common/platform.cpp',
'third_party/brotli/common/shared_dictionary.cpp',
'third_party/brotli/common/transform.cpp',
'third_party/brotli/dec/bit_reader.cpp',
'third_party/brotli/dec/decode.cpp',
'third_party/brotli/dec/huffman.cpp',
'third_party/brotli/dec/state.cpp',
'third_party/brotli/enc/backward_references.cpp',
'third_party/brotli/enc/backward_references_hq.cpp',
'third_party/brotli/enc/bit_cost.cpp',
'third_party/brotli/enc/block_splitter.cpp',
'third_party/brotli/enc/brotli_bit_stream.cpp',
'third_party/brotli/enc/cluster.cpp',
'third_party/brotli/enc/command.cpp',
'third_party/brotli/enc/compound_dictionary.cpp',
'third_party/brotli/enc/compress_fragment.cpp',
'third_party/brotli/enc/compress_fragment_two_pass.cpp',
'third_party/brotli/enc/dictionary_hash.cpp',
'third_party/brotli/enc/encode.cpp',
'third_party/brotli/enc/encoder_dict.cpp',
'third_party/brotli/enc/entropy_encode.cpp',
'third_party/brotli/enc/fast_log.cpp',
'third_party/brotli/enc/histogram.cpp',
'third_party/brotli/enc/literal_cost.cpp',
'third_party/brotli/enc/memory.cpp',
'third_party/brotli/enc/metablock.cpp',
'third_party/brotli/enc/static_dict.cpp',
'third_party/brotli/enc/utf8_util.cpp',
]
]

View File

@@ -0,0 +1,406 @@
#include "parquet_crypto.hpp"
#include "mbedtls_wrapper.hpp"
#include "thrift_tools.hpp"
#include "duckdb/common/exception/conversion_exception.hpp"
#include "duckdb/common/helper.hpp"
#include "duckdb/common/types/blob.hpp"
#include "duckdb/storage/arena_allocator.hpp"
namespace duckdb {
ParquetKeys &ParquetKeys::Get(ClientContext &context) {
auto &cache = ObjectCache::GetObjectCache(context);
if (!cache.Get<ParquetKeys>(ParquetKeys::ObjectType())) {
cache.Put(ParquetKeys::ObjectType(), make_shared_ptr<ParquetKeys>());
}
return *cache.Get<ParquetKeys>(ParquetKeys::ObjectType());
}
void ParquetKeys::AddKey(const string &key_name, const string &key) {
keys[key_name] = key;
}
bool ParquetKeys::HasKey(const string &key_name) const {
return keys.find(key_name) != keys.end();
}
const string &ParquetKeys::GetKey(const string &key_name) const {
D_ASSERT(HasKey(key_name));
return keys.at(key_name);
}
string ParquetKeys::ObjectType() {
return "parquet_keys";
}
string ParquetKeys::GetObjectType() {
return ObjectType();
}
ParquetEncryptionConfig::ParquetEncryptionConfig() {
}
ParquetEncryptionConfig::ParquetEncryptionConfig(string footer_key_p) : footer_key(std::move(footer_key_p)) {
}
ParquetEncryptionConfig::ParquetEncryptionConfig(ClientContext &context, const Value &arg) {
if (arg.type().id() != LogicalTypeId::STRUCT) {
throw BinderException("Parquet encryption_config must be of type STRUCT");
}
const auto &child_types = StructType::GetChildTypes(arg.type());
auto &children = StructValue::GetChildren(arg);
const auto &keys = ParquetKeys::Get(context);
for (idx_t i = 0; i < StructType::GetChildCount(arg.type()); i++) {
auto &struct_key = child_types[i].first;
if (StringUtil::Lower(struct_key) == "footer_key") {
const auto footer_key_name = StringValue::Get(children[i].DefaultCastAs(LogicalType::VARCHAR));
if (!keys.HasKey(footer_key_name)) {
throw BinderException(
"No key with name \"%s\" exists. Add it with PRAGMA add_parquet_key('<key_name>','<key>');",
footer_key_name);
}
// footer key name provided - read the key from the already-fetched key registry
footer_key = keys.GetKey(footer_key_name);
} else if (StringUtil::Lower(struct_key) == "footer_key_value") {
footer_key = StringValue::Get(children[i].DefaultCastAs(LogicalType::BLOB));
} else if (StringUtil::Lower(struct_key) == "column_keys") {
throw NotImplementedException("Parquet encryption_config column_keys not yet implemented");
} else {
throw BinderException("Unknown key in encryption_config \"%s\"", struct_key);
}
}
}
shared_ptr<ParquetEncryptionConfig> ParquetEncryptionConfig::Create(ClientContext &context, const Value &arg) {
return shared_ptr<ParquetEncryptionConfig>(new ParquetEncryptionConfig(context, arg));
}
const string &ParquetEncryptionConfig::GetFooterKey() const {
return footer_key;
}
using duckdb_apache::thrift::protocol::TCompactProtocolFactoryT;
using duckdb_apache::thrift::transport::TTransport;
//! Encryption wrapper for a transport protocol
class EncryptionTransport : public TTransport {
public:
EncryptionTransport(TProtocol &prot_p, const string &key, const EncryptionUtil &encryption_util_p)
: prot(prot_p), trans(*prot.getTransport()),
aes(encryption_util_p.CreateEncryptionState(EncryptionTypes::GCM, key.size())),
allocator(Allocator::DefaultAllocator(), ParquetCrypto::CRYPTO_BLOCK_SIZE) {
Initialize(key);
}
bool isOpen() const override {
return trans.isOpen();
}
void open() override {
trans.open();
}
void close() override {
trans.close();
}
void write_virt(const uint8_t *buf, uint32_t len) override {
memcpy(allocator.Allocate(len), buf, len);
}
uint32_t Finalize() {
// Write length
const auto ciphertext_length = allocator.SizeInBytes();
const uint32_t total_length = ParquetCrypto::NONCE_BYTES + ciphertext_length + ParquetCrypto::TAG_BYTES;
trans.write(const_data_ptr_cast(&total_length), ParquetCrypto::LENGTH_BYTES);
// Write nonce at beginning of encrypted chunk
trans.write(nonce, ParquetCrypto::NONCE_BYTES);
data_t aes_buffer[ParquetCrypto::CRYPTO_BLOCK_SIZE];
auto current = allocator.GetTail();
// Loop through the whole chunk
while (current != nullptr) {
for (idx_t pos = 0; pos < current->current_position; pos += ParquetCrypto::CRYPTO_BLOCK_SIZE) {
auto next = MinValue<idx_t>(current->current_position - pos, ParquetCrypto::CRYPTO_BLOCK_SIZE);
auto write_size =
aes->Process(current->data.get() + pos, next, aes_buffer, ParquetCrypto::CRYPTO_BLOCK_SIZE);
trans.write(aes_buffer, write_size);
}
current = current->prev;
}
// Finalize the last encrypted data
data_t tag[ParquetCrypto::TAG_BYTES];
auto write_size = aes->Finalize(aes_buffer, 0, tag, ParquetCrypto::TAG_BYTES);
trans.write(aes_buffer, write_size);
// Write tag for verification
trans.write(tag, ParquetCrypto::TAG_BYTES);
return ParquetCrypto::LENGTH_BYTES + total_length;
}
private:
void Initialize(const string &key) {
// Generate Nonce
aes->GenerateRandomData(nonce, ParquetCrypto::NONCE_BYTES);
// Initialize Encryption
aes->InitializeEncryption(nonce, ParquetCrypto::NONCE_BYTES, reinterpret_cast<const_data_ptr_t>(key.data()),
key.size());
}
private:
//! Protocol and corresponding transport that we're wrapping
TProtocol &prot;
TTransport &trans;
//! AES context and buffers
shared_ptr<EncryptionState> aes;
//! Nonce created by Initialize()
data_t nonce[ParquetCrypto::NONCE_BYTES];
//! Arena Allocator to fully materialize in memory before encrypting
ArenaAllocator allocator;
};
//! Decryption wrapper for a transport protocol
class DecryptionTransport : public TTransport {
public:
DecryptionTransport(TProtocol &prot_p, const string &key, const EncryptionUtil &encryption_util_p)
: prot(prot_p), trans(*prot.getTransport()),
aes(encryption_util_p.CreateEncryptionState(EncryptionTypes::GCM, key.size())), read_buffer_size(0),
read_buffer_offset(0) {
Initialize(key);
}
uint32_t read_virt(uint8_t *buf, uint32_t len) override {
const uint32_t result = len;
if (len > transport_remaining - ParquetCrypto::TAG_BYTES + read_buffer_size - read_buffer_offset) {
throw InvalidInputException("Too many bytes requested from crypto buffer");
}
while (len != 0) {
if (read_buffer_offset == read_buffer_size) {
ReadBlock(buf);
}
const auto next = MinValue(read_buffer_size - read_buffer_offset, len);
read_buffer_offset += next;
buf += next;
len -= next;
}
return result;
}
uint32_t Finalize() {
if (read_buffer_offset != read_buffer_size) {
throw InternalException("DecryptionTransport::Finalize was called with bytes remaining in read buffer: \n"
"read buffer offset: %d, read buffer size: %d",
read_buffer_offset, read_buffer_size);
}
data_t computed_tag[ParquetCrypto::TAG_BYTES];
transport_remaining -= trans.read(computed_tag, ParquetCrypto::TAG_BYTES);
aes->Finalize(read_buffer, 0, computed_tag, ParquetCrypto::TAG_BYTES);
if (transport_remaining != 0) {
throw InvalidInputException("Encoded ciphertext length differs from actual ciphertext length");
}
return ParquetCrypto::LENGTH_BYTES + total_bytes;
}
AllocatedData ReadAll() {
D_ASSERT(transport_remaining == total_bytes - ParquetCrypto::NONCE_BYTES);
auto result = Allocator::DefaultAllocator().Allocate(transport_remaining - ParquetCrypto::TAG_BYTES);
read_virt(result.get(), transport_remaining - ParquetCrypto::TAG_BYTES);
Finalize();
return result;
}
private:
void Initialize(const string &key) {
// Read encoded length (don't add to read_bytes)
data_t length_buf[ParquetCrypto::LENGTH_BYTES];
trans.read(length_buf, ParquetCrypto::LENGTH_BYTES);
total_bytes = Load<uint32_t>(length_buf);
transport_remaining = total_bytes;
// Read nonce and initialize AES
transport_remaining -= trans.read(nonce, ParquetCrypto::NONCE_BYTES);
// check whether context is initialized
aes->InitializeDecryption(nonce, ParquetCrypto::NONCE_BYTES, reinterpret_cast<const_data_ptr_t>(key.data()),
key.size());
}
void ReadBlock(uint8_t *buf) {
// Read from transport into read_buffer at one AES block size offset (up to the tag)
read_buffer_size = MinValue(ParquetCrypto::CRYPTO_BLOCK_SIZE, transport_remaining - ParquetCrypto::TAG_BYTES);
transport_remaining -= trans.read(read_buffer + ParquetCrypto::BLOCK_SIZE, read_buffer_size);
// Decrypt from read_buffer + block size into read_buffer start (decryption can trail behind in same buffer)
#ifdef DEBUG
auto size = aes->Process(read_buffer + ParquetCrypto::BLOCK_SIZE, read_buffer_size, buf,
ParquetCrypto::CRYPTO_BLOCK_SIZE + ParquetCrypto::BLOCK_SIZE);
D_ASSERT(size == read_buffer_size);
#else
aes->Process(read_buffer + ParquetCrypto::BLOCK_SIZE, read_buffer_size, buf,
ParquetCrypto::CRYPTO_BLOCK_SIZE + ParquetCrypto::BLOCK_SIZE);
#endif
read_buffer_offset = 0;
}
private:
//! Protocol and corresponding transport that we're wrapping
TProtocol &prot;
TTransport &trans;
//! AES context and buffers
shared_ptr<EncryptionState> aes;
//! We read/decrypt big blocks at a time
data_t read_buffer[ParquetCrypto::CRYPTO_BLOCK_SIZE + ParquetCrypto::BLOCK_SIZE];
uint32_t read_buffer_size;
uint32_t read_buffer_offset;
//! Remaining bytes to read, set by Initialize(), decremented by ReadBlock()
uint32_t total_bytes;
uint32_t transport_remaining;
//! Nonce read by Initialize()
data_t nonce[ParquetCrypto::NONCE_BYTES];
};
class SimpleReadTransport : public TTransport {
public:
explicit SimpleReadTransport(data_ptr_t read_buffer_p, uint32_t read_buffer_size_p)
: read_buffer(read_buffer_p), read_buffer_size(read_buffer_size_p), read_buffer_offset(0) {
}
uint32_t read_virt(uint8_t *buf, uint32_t len) override {
const auto remaining = read_buffer_size - read_buffer_offset;
if (len > remaining) {
return remaining;
}
memcpy(buf, read_buffer + read_buffer_offset, len);
read_buffer_offset += len;
return len;
}
private:
const data_ptr_t read_buffer;
const uint32_t read_buffer_size;
uint32_t read_buffer_offset;
};
uint32_t ParquetCrypto::Read(TBase &object, TProtocol &iprot, const string &key,
const EncryptionUtil &encryption_util_p) {
TCompactProtocolFactoryT<DecryptionTransport> tproto_factory;
auto dprot =
tproto_factory.getProtocol(duckdb_base_std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
auto &dtrans = reinterpret_cast<DecryptionTransport &>(*dprot->getTransport());
// We have to read the whole thing, otherwise thrift throws an error before we realize the decryption is wrong
auto all = dtrans.ReadAll();
TCompactProtocolFactoryT<SimpleReadTransport> tsimple_proto_factory;
auto simple_prot =
tsimple_proto_factory.getProtocol(duckdb_base_std::make_shared<SimpleReadTransport>(all.get(), all.GetSize()));
// Read the object
object.read(simple_prot.get());
return ParquetCrypto::LENGTH_BYTES + ParquetCrypto::NONCE_BYTES + all.GetSize() + ParquetCrypto::TAG_BYTES;
}
uint32_t ParquetCrypto::Write(const TBase &object, TProtocol &oprot, const string &key,
const EncryptionUtil &encryption_util_p) {
// Create encryption protocol
TCompactProtocolFactoryT<EncryptionTransport> tproto_factory;
auto eprot =
tproto_factory.getProtocol(duckdb_base_std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
auto &etrans = reinterpret_cast<EncryptionTransport &>(*eprot->getTransport());
// Write the object in memory
object.write(eprot.get());
// Encrypt and write to oprot
return etrans.Finalize();
}
uint32_t ParquetCrypto::ReadData(TProtocol &iprot, const data_ptr_t buffer, const uint32_t buffer_size,
const string &key, const EncryptionUtil &encryption_util_p) {
// Create decryption protocol
TCompactProtocolFactoryT<DecryptionTransport> tproto_factory;
auto dprot =
tproto_factory.getProtocol(duckdb_base_std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
auto &dtrans = reinterpret_cast<DecryptionTransport &>(*dprot->getTransport());
// Read buffer
dtrans.read(buffer, buffer_size);
// Verify AES tag and read length
return dtrans.Finalize();
}
uint32_t ParquetCrypto::WriteData(TProtocol &oprot, const const_data_ptr_t buffer, const uint32_t buffer_size,
const string &key, const EncryptionUtil &encryption_util_p) {
// FIXME: we know the size upfront so we could do a streaming write instead of this
// Create encryption protocol
TCompactProtocolFactoryT<EncryptionTransport> tproto_factory;
auto eprot =
tproto_factory.getProtocol(duckdb_base_std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
auto &etrans = reinterpret_cast<EncryptionTransport &>(*eprot->getTransport());
// Write the data in memory
etrans.write(buffer, buffer_size);
// Encrypt and write to oprot
return etrans.Finalize();
}
bool ParquetCrypto::ValidKey(const std::string &key) {
switch (key.size()) {
case 16:
case 24:
case 32:
return true;
default:
return false;
}
}
static string Base64Decode(const string &key) {
auto result_size = Blob::FromBase64Size(key);
auto output = duckdb::unique_ptr<unsigned char[]>(new unsigned char[result_size]);
Blob::FromBase64(key, output.get(), result_size);
string decoded_key(reinterpret_cast<const char *>(output.get()), result_size);
return decoded_key;
}
void ParquetCrypto::AddKey(ClientContext &context, const FunctionParameters &parameters) {
const auto &key_name = StringValue::Get(parameters.values[0]);
const auto &key = StringValue::Get(parameters.values[1]);
auto &keys = ParquetKeys::Get(context);
if (ValidKey(key)) {
keys.AddKey(key_name, key);
} else {
string decoded_key;
try {
decoded_key = Base64Decode(key);
} catch (const ConversionException &e) {
throw InvalidInputException("Invalid AES key. Not a plain AES key NOR a base64 encoded string");
}
if (!ValidKey(decoded_key)) {
throw InvalidInputException(
"Invalid AES key. Must have a length of 128, 192, or 256 bits (16, 24, or 32 bytes)");
}
keys.AddKey(key_name, decoded_key);
}
}
} // namespace duckdb
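For orientation, a hedged sketch of how the pieces above are typically exercised end to end through DuckDB's C++ API: PRAGMA add_parquet_key lands in ParquetCrypto::AddKey/ValidKey, and the footer_key entry of an encryption_config is resolved against ParquetKeys in ParquetEncryptionConfig. The key value and file names below are placeholders, and the exact option spelling is an assumption rather than something confirmed by this excerpt.

#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);

	// Register a named key; it must be 16, 24, or 32 bytes (or base64 of such a key),
	// matching ParquetCrypto::ValidKey above. The key here is a 32-byte placeholder.
	con.Query("PRAGMA add_parquet_key('key256', '01234567891123456789212345678931')");

	// Write and read an encrypted file; footer_key names the registered key and is
	// looked up via ParquetKeys in ParquetEncryptionConfig.
	con.Query("COPY (SELECT 42 AS i) TO 'encrypted.parquet' "
	          "(ENCRYPTION_CONFIG {footer_key: 'key256'})");
	auto result = con.Query("SELECT * FROM read_parquet('encrypted.parquet', "
	                        "encryption_config = {footer_key: 'key256'})");
	result->Print();
	return 0;
}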

File diff suppressed because it is too large

View File

@@ -0,0 +1,180 @@
#include "parquet_field_id.hpp"
#include "duckdb/common/exception/binder_exception.hpp"
namespace duckdb {
constexpr const char *FieldID::DUCKDB_FIELD_ID;
ChildFieldIDs::ChildFieldIDs() : ids(make_uniq<case_insensitive_map_t<FieldID>>()) {
}
ChildFieldIDs ChildFieldIDs::Copy() const {
ChildFieldIDs result;
for (const auto &id : *ids) {
result.ids->emplace(id.first, id.second.Copy());
}
return result;
}
FieldID::FieldID() : set(false) {
}
FieldID::FieldID(int32_t field_id_p) : set(true), field_id(field_id_p) {
}
FieldID FieldID::Copy() const {
auto result = set ? FieldID(field_id) : FieldID();
result.child_field_ids = child_field_ids.Copy();
return result;
}
static case_insensitive_map_t<LogicalType> GetChildNameToTypeMap(const LogicalType &type) {
case_insensitive_map_t<LogicalType> name_to_type_map;
switch (type.id()) {
case LogicalTypeId::LIST:
name_to_type_map.emplace("element", ListType::GetChildType(type));
break;
case LogicalTypeId::MAP:
name_to_type_map.emplace("key", MapType::KeyType(type));
name_to_type_map.emplace("value", MapType::ValueType(type));
break;
case LogicalTypeId::STRUCT:
for (auto &child_type : StructType::GetChildTypes(type)) {
if (child_type.first == FieldID::DUCKDB_FIELD_ID) {
throw BinderException("Cannot have column named \"%s\" with FIELD_IDS", FieldID::DUCKDB_FIELD_ID);
}
name_to_type_map.emplace(child_type);
}
break;
default: // LCOV_EXCL_START
throw InternalException("Unexpected type in GetChildNameToTypeMap");
} // LCOV_EXCL_STOP
return name_to_type_map;
}
static void GetChildNamesAndTypes(const LogicalType &type, vector<string> &child_names,
vector<LogicalType> &child_types) {
switch (type.id()) {
case LogicalTypeId::LIST:
child_names.emplace_back("element");
child_types.emplace_back(ListType::GetChildType(type));
break;
case LogicalTypeId::MAP:
child_names.emplace_back("key");
child_names.emplace_back("value");
child_types.emplace_back(MapType::KeyType(type));
child_types.emplace_back(MapType::ValueType(type));
break;
case LogicalTypeId::STRUCT:
for (auto &child_type : StructType::GetChildTypes(type)) {
child_names.emplace_back(child_type.first);
child_types.emplace_back(child_type.second);
}
break;
default: // LCOV_EXCL_START
throw InternalException("Unexpected type in GetChildNamesAndTypes");
} // LCOV_EXCL_STOP
}
void FieldID::GenerateFieldIDs(ChildFieldIDs &field_ids, idx_t &field_id, const vector<string> &names,
const vector<LogicalType> &sql_types) {
D_ASSERT(names.size() == sql_types.size());
for (idx_t col_idx = 0; col_idx < names.size(); col_idx++) {
const auto &col_name = names[col_idx];
auto inserted = field_ids.ids->insert(make_pair(col_name, FieldID(UnsafeNumericCast<int32_t>(field_id++))));
D_ASSERT(inserted.second);
const auto &col_type = sql_types[col_idx];
if (col_type.id() != LogicalTypeId::LIST && col_type.id() != LogicalTypeId::MAP &&
col_type.id() != LogicalTypeId::STRUCT) {
continue;
}
// Cannot use GetChildNameToTypeMap here because we lose order, and we want to generate depth-first
vector<string> child_names;
vector<LogicalType> child_types;
GetChildNamesAndTypes(col_type, child_names, child_types);
GenerateFieldIDs(inserted.first->second.child_field_ids, field_id, child_names, child_types);
}
}
void FieldID::GetFieldIDs(const Value &field_ids_value, ChildFieldIDs &field_ids,
unordered_set<uint32_t> &unique_field_ids,
const case_insensitive_map_t<LogicalType> &name_to_type_map) {
const auto &struct_type = field_ids_value.type();
if (struct_type.id() != LogicalTypeId::STRUCT) {
throw BinderException(
"Expected FIELD_IDS to be a STRUCT, e.g., {col1: 42, col2: {%s: 43, nested_col: 44}, col3: 44}",
FieldID::DUCKDB_FIELD_ID);
}
const auto &struct_children = StructValue::GetChildren(field_ids_value);
D_ASSERT(StructType::GetChildTypes(struct_type).size() == struct_children.size());
for (idx_t i = 0; i < struct_children.size(); i++) {
const auto &col_name = StringUtil::Lower(StructType::GetChildName(struct_type, i));
if (col_name == FieldID::DUCKDB_FIELD_ID) {
continue;
}
auto it = name_to_type_map.find(col_name);
if (it == name_to_type_map.end()) {
string names;
for (const auto &name : name_to_type_map) {
if (!names.empty()) {
names += ", ";
}
names += name.first;
}
throw BinderException(
"Column name \"%s\" specified in FIELD_IDS not found. Consider using WRITE_PARTITION_COLUMNS if this "
"column is a partition column. Available column names: [%s]",
col_name, names);
}
D_ASSERT(field_ids.ids->find(col_name) == field_ids.ids->end()); // Caught by STRUCT - deduplicates keys
const auto &child_value = struct_children[i];
const auto &child_type = child_value.type();
optional_ptr<const Value> field_id_value;
optional_ptr<const Value> child_field_ids_value;
if (child_type.id() == LogicalTypeId::STRUCT) {
const auto &nested_children = StructValue::GetChildren(child_value);
D_ASSERT(StructType::GetChildTypes(child_type).size() == nested_children.size());
for (idx_t nested_i = 0; nested_i < nested_children.size(); nested_i++) {
const auto &field_id_or_nested_col = StructType::GetChildName(child_type, nested_i);
if (field_id_or_nested_col == FieldID::DUCKDB_FIELD_ID) {
field_id_value = &nested_children[nested_i];
} else {
child_field_ids_value = &child_value;
}
}
} else {
field_id_value = &child_value;
}
FieldID field_id;
if (field_id_value) {
Value field_id_integer_value = field_id_value->DefaultCastAs(LogicalType::INTEGER);
const uint32_t field_id_int = IntegerValue::Get(field_id_integer_value);
if (!unique_field_ids.insert(field_id_int).second) {
throw BinderException("Duplicate field_id %s found in FIELD_IDS", field_id_integer_value.ToString());
}
field_id = FieldID(UnsafeNumericCast<int32_t>(field_id_int));
}
auto inserted = field_ids.ids->insert(make_pair(col_name, std::move(field_id)));
D_ASSERT(inserted.second);
if (child_field_ids_value) {
const auto &col_type = it->second;
if (col_type.id() != LogicalTypeId::LIST && col_type.id() != LogicalTypeId::MAP &&
col_type.id() != LogicalTypeId::STRUCT) {
throw BinderException("Column \"%s\" with type \"%s\" cannot have a nested FIELD_IDS specification",
col_name, LogicalTypeIdToString(col_type.id()));
}
GetFieldIDs(*child_field_ids_value, inserted.first->second.child_field_ids, unique_field_ids,
GetChildNameToTypeMap(col_type));
}
}
}
} // namespace duckdb
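The struct shapes accepted by GetFieldIDs/GenerateFieldIDs above correspond to the FIELD_IDS option of Parquet COPY. Below is a hedged usage sketch through the C++ API; it assumes the reserved per-struct key is the value of FieldID::DUCKDB_FIELD_ID (spelled here as __duckdb_field_id, defined in a header not shown in this diff) and that FIELD_IDS 'auto' routes to GenerateFieldIDs.

#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);

	// Explicit field IDs, mirroring the struct shape from the binder error message above:
	// top-level columns map to an ID, nested structs carry their own ID under the reserved key.
	con.Query("COPY (SELECT 1 AS col1, {'nested_col': 2} AS col2) TO 'ids.parquet' "
	          "(FIELD_IDS {col1: 42, col2: {__duckdb_field_id: 43, nested_col: 44}})");

	// Automatic depth-first assignment (handled by FieldID::GenerateFieldIDs).
	con.Query("COPY (SELECT 1 AS col1, {'nested_col': 2} AS col2) TO 'auto_ids.parquet' "
	          "(FIELD_IDS 'auto')");
	return 0;
}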

View File

@@ -0,0 +1,55 @@
#include "parquet_file_metadata_cache.hpp"
#include "duckdb/storage/external_file_cache.hpp"
#include "duckdb/storage/caching_file_system.hpp"
namespace duckdb {
ParquetFileMetadataCache::ParquetFileMetadataCache(unique_ptr<duckdb_parquet::FileMetaData> file_metadata,
CachingFileHandle &handle,
unique_ptr<GeoParquetFileMetadata> geo_metadata, idx_t footer_size)
: metadata(std::move(file_metadata)), geo_metadata(std::move(geo_metadata)), footer_size(footer_size),
validate(handle.Validate()), last_modified(handle.GetLastModifiedTime()), version_tag(handle.GetVersionTag()) {
}
string ParquetFileMetadataCache::ObjectType() {
return "parquet_metadata";
}
string ParquetFileMetadataCache::GetObjectType() {
return ObjectType();
}
bool ParquetFileMetadataCache::IsValid(CachingFileHandle &new_handle) const {
return ExternalFileCache::IsValid(validate, version_tag, last_modified, new_handle.GetVersionTag(),
new_handle.GetLastModifiedTime());
}
ParquetCacheValidity ParquetFileMetadataCache::IsValid(const OpenFileInfo &info) const {
if (!info.extended_info) {
return ParquetCacheValidity::UNKNOWN;
}
auto &open_options = info.extended_info->options;
const auto validate_entry = open_options.find("validate_external_file_cache");
if (validate_entry != open_options.end()) {
// check if always valid - if so just return valid
if (BooleanValue::Get(validate_entry->second)) {
return ParquetCacheValidity::VALID;
}
}
const auto lm_entry = open_options.find("last_modified");
if (lm_entry == open_options.end()) {
return ParquetCacheValidity::UNKNOWN;
}
auto new_last_modified = lm_entry->second.GetValue<timestamp_t>();
string new_etag;
const auto etag_entry = open_options.find("etag");
if (etag_entry != open_options.end()) {
new_etag = StringValue::Get(etag_entry->second);
}
if (ExternalFileCache::IsValid(false, version_tag, last_modified, new_etag, new_last_modified)) {
return ParquetCacheValidity::VALID;
}
return ParquetCacheValidity::INVALID;
}
} // namespace duckdb

View File

@@ -0,0 +1,44 @@
#include "parquet_float16.hpp"
#include "duckdb.hpp"
namespace duckdb {
float Float16ToFloat32(const uint16_t &float16_value) {
uint32_t sign = float16_value >> 15;
uint32_t exponent = (float16_value >> 10) & 0x1F;
uint32_t fraction = (float16_value & 0x3FF);
// Avoid strict aliasing issues and compiler warnings
uint32_t float32_value = 0;
if (exponent == 0) {
if (fraction == 0) {
// zero
float32_value = (sign << 31);
} else {
// can be represented as ordinary value in float32
// 2 ** -14 * 0.0101
// => 2 ** -16 * 1.0100
// int int_exponent = -14;
exponent = 127 - 14;
while ((fraction & (1 << 10)) == 0) {
// int_exponent--;
exponent--;
fraction <<= 1;
}
fraction &= 0x3FF;
// int_exponent += 127;
float32_value = (sign << 31) | (exponent << 23) | (fraction << 13);
}
} else if (exponent == 0x1F) {
/* Inf or NaN */
float32_value = (sign << 31) | (0xFF << 23) | (fraction << 13);
} else {
/* ordinary number */
float32_value = (sign << 31) | ((exponent + (127 - 15)) << 23) | (fraction << 13);
}
return Load<float>(const_data_ptr_cast(&float32_value));
}
} // namespace duckdb
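A quick worked check of the conversion above, assuming the header path used by this file ('parquet_float16.hpp'): 0x3C00 is half-precision 1.0 (sign 0, exponent 15, fraction 0) and 0xC000 is -2.0 (sign 1, exponent 16, fraction 0).

#include "parquet_float16.hpp"

#include <cstdint>
#include <cstdio>

int main() {
	const uint16_t one = 0x3C00;       // exponent 15 - bias 15 -> 2^0, so 1.0f
	const uint16_t minus_two = 0xC000; // sign bit set, exponent 16 - bias 15 -> 2^1, so -2.0f
	std::printf("%f %f\n", duckdb::Float16ToFloat32(one), duckdb::Float16ToFloat32(minus_two));
	return 0;
}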

View File

@@ -0,0 +1,969 @@
#include "parquet_metadata.hpp"
#include "parquet_statistics.hpp"
#include <sstream>
#include "duckdb/common/multi_file/multi_file_reader.hpp"
#include "duckdb/common/types/blob.hpp"
#include "duckdb/planner/filter/constant_filter.hpp"
#include "duckdb/main/config.hpp"
#include "duckdb/common/multi_file/multi_file_list.hpp"
#include "parquet_reader.hpp"
#include "duckdb/common/numeric_utils.hpp"
namespace duckdb {
struct ParquetMetadataFilePaths {
MultiFileListScanData scan_data;
shared_ptr<MultiFileList> file_list;
mutex file_lock;
bool NextFile(OpenFileInfo &result) {
D_ASSERT(file_list);
unique_lock<mutex> lock(file_lock);
return file_list->Scan(scan_data, result);
}
FileExpandResult GetExpandResult() {
D_ASSERT(file_list);
unique_lock<mutex> lock(file_lock);
return file_list->GetExpandResult();
}
};
struct ParquetMetaDataBindData : public TableFunctionData {
unique_ptr<ParquetMetadataFilePaths> file_paths;
};
struct ParquetBloomProbeBindData : public ParquetMetaDataBindData {
string probe_column_name;
Value probe_constant;
};
enum class ParquetMetadataOperatorType : uint8_t {
META_DATA,
SCHEMA,
KEY_VALUE_META_DATA,
FILE_META_DATA,
BLOOM_PROBE
};
class ParquetMetadataFileProcessor {
public:
ParquetMetadataFileProcessor() = default;
virtual ~ParquetMetadataFileProcessor() = default;
void Initialize(ClientContext &context, OpenFileInfo &file_info) {
ParquetOptions parquet_options(context);
reader = make_uniq<ParquetReader>(context, file_info, parquet_options);
}
virtual void InitializeInternal(ClientContext &context) {};
virtual idx_t TotalRowCount() = 0;
virtual void ReadRow(DataChunk &output, idx_t output_idx, idx_t row_idx) = 0;
protected:
unique_ptr<ParquetReader> reader;
};
struct ParquetMetaDataBindData;
class ParquetMetaDataOperator {
public:
template <ParquetMetadataOperatorType OP_TYPE>
static unique_ptr<FunctionData> Bind(ClientContext &context, TableFunctionBindInput &input,
vector<LogicalType> &return_types, vector<string> &names);
static unique_ptr<GlobalTableFunctionState> InitGlobal(ClientContext &context, TableFunctionInitInput &input);
template <ParquetMetadataOperatorType OP_TYPE>
static unique_ptr<LocalTableFunctionState> InitLocal(ExecutionContext &context, TableFunctionInitInput &input,
GlobalTableFunctionState *global_state);
template <ParquetMetadataOperatorType OP_TYPE>
static void Function(ClientContext &context, TableFunctionInput &data_p, DataChunk &output);
static double Progress(ClientContext &context, const FunctionData *bind_data_p,
const GlobalTableFunctionState *global_state);
template <ParquetMetadataOperatorType OP_TYPE>
static void BindSchema(vector<LogicalType> &return_types, vector<string> &names);
};
struct ParquetMetadataGlobalState : public GlobalTableFunctionState {
ParquetMetadataGlobalState(unique_ptr<ParquetMetadataFilePaths> file_paths_p, ClientContext &context)
: file_paths(std::move(file_paths_p)) {
auto expand_result = file_paths->GetExpandResult();
if (expand_result == FileExpandResult::MULTIPLE_FILES) {
max_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
} else {
max_threads = 1;
}
}
idx_t MaxThreads() const override {
return max_threads;
}
bool NextFile(ClientContext &context, OpenFileInfo &result) {
return file_paths->NextFile(result);
}
double GetProgress() const {
// Not the most accurate: treats every file as equal in size and counts a file as done as soon as it has been handed out
unique_lock<mutex> lock(file_paths->file_lock);
return static_cast<double>(file_paths->scan_data.current_file_idx) / file_paths->file_list->GetTotalFileCount();
}
unique_ptr<ParquetMetadataFilePaths> file_paths;
idx_t max_threads;
};
struct ParquetMetadataLocalState : public LocalTableFunctionState {
unique_ptr<ParquetMetadataFileProcessor> processor;
bool file_exhausted = true;
idx_t row_idx = 0;
idx_t total_rows = 0;
};
template <class T>
static string ConvertParquetElementToString(T &&entry) {
duckdb::stringstream ss;
ss << entry;
return ss.str();
}
template <class T>
static string PrintParquetElementToString(T &&entry) {
duckdb::stringstream ss;
entry.printTo(ss);
return ss.str();
}
template <class T>
static Value ParquetElementString(T &&value, bool is_set) {
if (!is_set) {
return Value();
}
return Value(ConvertParquetElementToString(value));
}
static Value ParquetElementStringVal(const string &value, bool is_set) {
if (!is_set) {
return Value();
}
return Value(value);
}
template <class T>
static Value ParquetElementInteger(T &&value, bool is_set) {
if (!is_set) {
return Value();
}
return Value::INTEGER(value);
}
template <class T>
static Value ParquetElementBigint(T &&value, bool is_set) {
if (!is_set) {
return Value();
}
return Value::BIGINT(value);
}
static Value ParquetElementBoolean(bool value, bool is_set) {
if (!is_set) {
return Value();
}
return Value::BOOLEAN(value);
}
//===--------------------------------------------------------------------===//
// Row Group Meta Data
//===--------------------------------------------------------------------===//
class ParquetRowGroupMetadataProcessor : public ParquetMetadataFileProcessor {
public:
void InitializeInternal(ClientContext &context) override;
idx_t TotalRowCount() override;
void ReadRow(DataChunk &output, idx_t output_idx, idx_t row_idx) override;
private:
vector<ParquetColumnSchema> column_schemas;
};
template <>
void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::META_DATA>(vector<LogicalType> &return_types,
vector<string> &names) {
names.emplace_back("file_name");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("row_group_id");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("row_group_num_rows");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("row_group_num_columns");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("row_group_bytes");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("column_id");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("file_offset");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("num_values");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("path_in_schema");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("type");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("stats_min");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("stats_max");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("stats_null_count");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("stats_distinct_count");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("stats_min_value");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("stats_max_value");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("compression");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("encodings");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("index_page_offset");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("dictionary_page_offset");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("data_page_offset");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("total_compressed_size");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("total_uncompressed_size");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("key_value_metadata");
return_types.emplace_back(LogicalType::MAP(LogicalType::BLOB, LogicalType::BLOB));
names.emplace_back("bloom_filter_offset");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("bloom_filter_length");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("min_is_exact");
return_types.emplace_back(LogicalType::BOOLEAN);
names.emplace_back("max_is_exact");
return_types.emplace_back(LogicalType::BOOLEAN);
names.emplace_back("row_group_compressed_bytes");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("geo_bbox");
return_types.emplace_back(LogicalType::STRUCT({
{"xmin", LogicalType::DOUBLE},
{"xmax", LogicalType::DOUBLE},
{"ymin", LogicalType::DOUBLE},
{"ymax", LogicalType::DOUBLE},
{"zmin", LogicalType::DOUBLE},
{"zmax", LogicalType::DOUBLE},
{"mmin", LogicalType::DOUBLE},
{"mmax", LogicalType::DOUBLE},
}));
names.emplace_back("geo_types");
return_types.emplace_back(LogicalType::LIST(LogicalType::VARCHAR));
}
static Value ConvertParquetStats(const LogicalType &type, const ParquetColumnSchema &schema_ele, bool stats_is_set,
const std::string &stats) {
if (!stats_is_set) {
return Value(LogicalType::VARCHAR);
}
return ParquetStatisticsUtils::ConvertValue(type, schema_ele, stats).DefaultCastAs(LogicalType::VARCHAR);
}
static Value ConvertParquetGeoStatsBBOX(const duckdb_parquet::GeospatialStatistics &stats) {
if (!stats.__isset.bbox) {
return Value(LogicalType::STRUCT({
{"xmin", LogicalType::DOUBLE},
{"xmax", LogicalType::DOUBLE},
{"ymin", LogicalType::DOUBLE},
{"ymax", LogicalType::DOUBLE},
{"zmin", LogicalType::DOUBLE},
{"zmax", LogicalType::DOUBLE},
{"mmin", LogicalType::DOUBLE},
{"mmax", LogicalType::DOUBLE},
}));
}
return Value::STRUCT({
{"xmin", Value::DOUBLE(stats.bbox.xmin)},
{"xmax", Value::DOUBLE(stats.bbox.xmax)},
{"ymin", Value::DOUBLE(stats.bbox.ymin)},
{"ymax", Value::DOUBLE(stats.bbox.ymax)},
{"zmin", stats.bbox.__isset.zmin ? Value::DOUBLE(stats.bbox.zmin) : Value(LogicalTypeId::DOUBLE)},
{"zmax", stats.bbox.__isset.zmax ? Value::DOUBLE(stats.bbox.zmax) : Value(LogicalTypeId::DOUBLE)},
{"mmin", stats.bbox.__isset.mmin ? Value::DOUBLE(stats.bbox.mmin) : Value(LogicalTypeId::DOUBLE)},
{"mmax", stats.bbox.__isset.mmax ? Value::DOUBLE(stats.bbox.mmax) : Value(LogicalTypeId::DOUBLE)},
});
}
static Value ConvertParquetGeoStatsTypes(const duckdb_parquet::GeospatialStatistics &stats) {
if (!stats.__isset.geospatial_types) {
return Value(LogicalType::LIST(LogicalType::VARCHAR));
}
vector<Value> types;
types.reserve(stats.geospatial_types.size());
GeometryTypeSet type_set;
for (auto &type : stats.geospatial_types) {
const auto geom_type = (type % 1000);
const auto vert_type = (type / 1000);
if (geom_type < 1 || geom_type > 7) {
throw InvalidInputException("Unsupported geometry type in Parquet geo metadata");
}
if (vert_type < 0 || vert_type > 3) {
throw InvalidInputException("Unsupported geometry vertex type in Parquet geo metadata");
}
type_set.Add(static_cast<GeometryType>(geom_type), static_cast<VertexType>(vert_type));
}
for (auto &type_name : type_set.ToString(true)) {
types.push_back(Value(type_name));
}
return Value::LIST(LogicalType::VARCHAR, types);
}
void ParquetRowGroupMetadataProcessor::InitializeInternal(ClientContext &context) {
auto meta_data = reader->GetFileMetadata();
column_schemas.clear();
for (idx_t schema_idx = 0; schema_idx < meta_data->schema.size(); schema_idx++) {
auto &schema_element = meta_data->schema[schema_idx];
if (schema_element.num_children > 0) {
continue;
}
ParquetColumnSchema column_schema;
column_schema.type = reader->DeriveLogicalType(schema_element, column_schema);
column_schemas.push_back(std::move(column_schema));
}
}
idx_t ParquetRowGroupMetadataProcessor::TotalRowCount() {
auto meta_data = reader->GetFileMetadata();
return meta_data->row_groups.size() * column_schemas.size();
}
void ParquetRowGroupMetadataProcessor::ReadRow(DataChunk &output, idx_t output_idx, idx_t row_idx) {
auto meta_data = reader->GetFileMetadata();
idx_t col_idx = row_idx % column_schemas.size();
idx_t row_group_idx = row_idx / column_schemas.size();
auto &row_group = meta_data->row_groups[row_group_idx];
auto &column = row_group.columns[col_idx];
auto &column_schema = column_schemas[col_idx];
auto &col_meta = column.meta_data;
auto &stats = col_meta.statistics;
auto &column_type = column_schema.type;
// file_name
output.SetValue(0, output_idx, reader->file.path);
// row_group_id
output.SetValue(1, output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(row_group_idx)));
// row_group_num_rows
output.SetValue(2, output_idx, Value::BIGINT(row_group.num_rows));
// row_group_num_columns
output.SetValue(3, output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(row_group.columns.size())));
// row_group_bytes
output.SetValue(4, output_idx, Value::BIGINT(row_group.total_byte_size));
// column_id
output.SetValue(5, output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(col_idx)));
// file_offset
output.SetValue(6, output_idx, ParquetElementBigint(column.file_offset, row_group.__isset.file_offset));
// num_values
output.SetValue(7, output_idx, Value::BIGINT(col_meta.num_values));
// path_in_schema
output.SetValue(8, output_idx, StringUtil::Join(col_meta.path_in_schema, ", "));
// type
output.SetValue(9, output_idx, ConvertParquetElementToString(col_meta.type));
// stats_min
output.SetValue(10, output_idx, ConvertParquetStats(column_type, column_schema, stats.__isset.min, stats.min));
// stats_max
output.SetValue(11, output_idx, ConvertParquetStats(column_type, column_schema, stats.__isset.max, stats.max));
// stats_null_count
output.SetValue(12, output_idx, ParquetElementBigint(stats.null_count, stats.__isset.null_count));
// stats_distinct_count
output.SetValue(13, output_idx, ParquetElementBigint(stats.distinct_count, stats.__isset.distinct_count));
// stats_min_value
output.SetValue(14, output_idx,
ConvertParquetStats(column_type, column_schema, stats.__isset.min_value, stats.min_value));
// stats_max_value
output.SetValue(15, output_idx,
ConvertParquetStats(column_type, column_schema, stats.__isset.max_value, stats.max_value));
// compression
output.SetValue(16, output_idx, ConvertParquetElementToString(col_meta.codec));
// encodings
vector<string> encoding_string;
encoding_string.reserve(col_meta.encodings.size());
for (auto &encoding : col_meta.encodings) {
encoding_string.push_back(ConvertParquetElementToString(encoding));
}
output.SetValue(17, output_idx, Value(StringUtil::Join(encoding_string, ", ")));
// index_page_offset
output.SetValue(18, output_idx,
ParquetElementBigint(col_meta.index_page_offset, col_meta.__isset.index_page_offset));
// dictionary_page_offset
output.SetValue(19, output_idx,
ParquetElementBigint(col_meta.dictionary_page_offset, col_meta.__isset.dictionary_page_offset));
// data_page_offset
output.SetValue(20, output_idx, Value::BIGINT(col_meta.data_page_offset));
// total_compressed_size
output.SetValue(21, output_idx, Value::BIGINT(col_meta.total_compressed_size));
// total_uncompressed_size
output.SetValue(22, output_idx, Value::BIGINT(col_meta.total_uncompressed_size));
// key_value_metadata
vector<Value> map_keys, map_values;
for (auto &entry : col_meta.key_value_metadata) {
map_keys.push_back(Value::BLOB_RAW(entry.key));
map_values.push_back(Value::BLOB_RAW(entry.value));
}
output.SetValue(23, output_idx,
Value::MAP(LogicalType::BLOB, LogicalType::BLOB, std::move(map_keys), std::move(map_values)));
// bloom_filter_offset
output.SetValue(24, output_idx,
ParquetElementBigint(col_meta.bloom_filter_offset, col_meta.__isset.bloom_filter_offset));
// bloom_filter_length
output.SetValue(25, output_idx,
ParquetElementBigint(col_meta.bloom_filter_length, col_meta.__isset.bloom_filter_length));
// min_is_exact
output.SetValue(26, output_idx, ParquetElementBoolean(stats.is_min_value_exact, stats.__isset.is_min_value_exact));
// max_is_exact
output.SetValue(27, output_idx, ParquetElementBoolean(stats.is_max_value_exact, stats.__isset.is_max_value_exact));
// row_group_compressed_bytes
output.SetValue(28, output_idx,
ParquetElementBigint(row_group.total_compressed_size, row_group.__isset.total_compressed_size));
// geo_stats_bbox, LogicalType::STRUCT(...)
output.SetValue(29, output_idx, ConvertParquetGeoStatsBBOX(col_meta.geospatial_statistics));
// geo_stats_types, LogicalType::LIST(LogicalType::VARCHAR)
output.SetValue(30, output_idx, ConvertParquetGeoStatsTypes(col_meta.geospatial_statistics));
}
//===--------------------------------------------------------------------===//
// Schema Data
//===--------------------------------------------------------------------===//
class ParquetSchemaProcessor : public ParquetMetadataFileProcessor {
public:
idx_t TotalRowCount() override;
void ReadRow(DataChunk &output, idx_t output_idx, idx_t row_idx) override;
};
template <>
void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::SCHEMA>(vector<LogicalType> &return_types,
vector<string> &names) {
names.emplace_back("file_name");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("name");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("type");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("type_length");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("repetition_type");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("num_children");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("converted_type");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("scale");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("precision");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("field_id");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("logical_type");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("duckdb_type");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("column_id");
return_types.emplace_back(LogicalType::BIGINT);
}
static Value ParquetLogicalTypeToString(const duckdb_parquet::LogicalType &type, bool is_set) {
if (!is_set) {
return Value();
}
if (type.__isset.STRING) {
return Value(PrintParquetElementToString(type.STRING));
}
if (type.__isset.MAP) {
return Value(PrintParquetElementToString(type.MAP));
}
if (type.__isset.LIST) {
return Value(PrintParquetElementToString(type.LIST));
}
if (type.__isset.ENUM) {
return Value(PrintParquetElementToString(type.ENUM));
}
if (type.__isset.DECIMAL) {
return Value(PrintParquetElementToString(type.DECIMAL));
}
if (type.__isset.DATE) {
return Value(PrintParquetElementToString(type.DATE));
}
if (type.__isset.TIME) {
return Value(PrintParquetElementToString(type.TIME));
}
if (type.__isset.TIMESTAMP) {
return Value(PrintParquetElementToString(type.TIMESTAMP));
}
if (type.__isset.INTEGER) {
return Value(PrintParquetElementToString(type.INTEGER));
}
if (type.__isset.UNKNOWN) {
return Value(PrintParquetElementToString(type.UNKNOWN));
}
if (type.__isset.JSON) {
return Value(PrintParquetElementToString(type.JSON));
}
if (type.__isset.BSON) {
return Value(PrintParquetElementToString(type.BSON));
}
if (type.__isset.UUID) {
return Value(PrintParquetElementToString(type.UUID));
}
if (type.__isset.FLOAT16) {
return Value(PrintParquetElementToString(type.FLOAT16));
}
if (type.__isset.GEOMETRY) {
return Value(PrintParquetElementToString(type.GEOMETRY));
}
if (type.__isset.GEOGRAPHY) {
return Value(PrintParquetElementToString(type.GEOGRAPHY));
}
return Value();
}
idx_t ParquetSchemaProcessor::TotalRowCount() {
return reader->GetFileMetadata()->schema.size();
}
void ParquetSchemaProcessor::ReadRow(DataChunk &output, idx_t output_idx, idx_t row_idx) {
auto meta_data = reader->GetFileMetadata();
const auto &column = meta_data->schema[row_idx];
// file_name
output.SetValue(0, output_idx, reader->file.path);
// name
output.SetValue(1, output_idx, column.name);
// type
output.SetValue(2, output_idx, ParquetElementString(column.type, column.__isset.type));
// type_length
output.SetValue(3, output_idx, ParquetElementInteger(column.type_length, column.__isset.type_length));
// repetition_type
output.SetValue(4, output_idx, ParquetElementString(column.repetition_type, column.__isset.repetition_type));
// num_children
output.SetValue(5, output_idx, ParquetElementBigint(column.num_children, column.__isset.num_children));
// converted_type
output.SetValue(6, output_idx, ParquetElementString(column.converted_type, column.__isset.converted_type));
// scale
output.SetValue(7, output_idx, ParquetElementBigint(column.scale, column.__isset.scale));
// precision
output.SetValue(8, output_idx, ParquetElementBigint(column.precision, column.__isset.precision));
// field_id
output.SetValue(9, output_idx, ParquetElementBigint(column.field_id, column.__isset.field_id));
// logical_type
output.SetValue(10, output_idx, ParquetLogicalTypeToString(column.logicalType, column.__isset.logicalType));
// duckdb_type
ParquetColumnSchema column_schema;
Value duckdb_type;
if (column.__isset.type) {
duckdb_type = reader->DeriveLogicalType(column, column_schema).ToString();
}
output.SetValue(11, output_idx, duckdb_type);
// column_id
output.SetValue(12, output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(row_idx)));
}
//===--------------------------------------------------------------------===//
// KV Meta Data
//===--------------------------------------------------------------------===//
class ParquetKeyValueMetadataProcessor : public ParquetMetadataFileProcessor {
public:
idx_t TotalRowCount() override;
void ReadRow(DataChunk &output, idx_t output_idx, idx_t row_idx) override;
};
template <>
void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::KEY_VALUE_META_DATA>(
vector<LogicalType> &return_types, vector<string> &names) {
names.emplace_back("file_name");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("key");
return_types.emplace_back(LogicalType::BLOB);
names.emplace_back("value");
return_types.emplace_back(LogicalType::BLOB);
}
idx_t ParquetKeyValueMetadataProcessor::TotalRowCount() {
return reader->GetFileMetadata()->key_value_metadata.size();
}
void ParquetKeyValueMetadataProcessor::ReadRow(DataChunk &output, idx_t output_idx, idx_t row_idx) {
auto meta_data = reader->GetFileMetadata();
auto &entry = meta_data->key_value_metadata[row_idx];
output.SetValue(0, output_idx, Value(reader->file.path));
output.SetValue(1, output_idx, Value::BLOB_RAW(entry.key));
output.SetValue(2, output_idx, Value::BLOB_RAW(entry.value));
}
//===--------------------------------------------------------------------===//
// File Meta Data
//===--------------------------------------------------------------------===//
class ParquetFileMetadataProcessor : public ParquetMetadataFileProcessor {
public:
idx_t TotalRowCount() override;
void ReadRow(DataChunk &output, idx_t output_idx, idx_t row_idx) override;
};
template <>
void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::FILE_META_DATA>(vector<LogicalType> &return_types,
vector<string> &names) {
names.emplace_back("file_name");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("created_by");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("num_rows");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("num_row_groups");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("format_version");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("encryption_algorithm");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("footer_signing_key_metadata");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("file_size_bytes");
return_types.emplace_back(LogicalType::UBIGINT);
names.emplace_back("footer_size");
return_types.emplace_back(LogicalType::UBIGINT);
}
idx_t ParquetFileMetadataProcessor::TotalRowCount() {
return 1;
}
void ParquetFileMetadataProcessor::ReadRow(DataChunk &output, idx_t output_idx, idx_t row_idx) {
auto meta_data = reader->GetFileMetadata();
// file_name
output.SetValue(0, output_idx, Value(reader->file.path));
// created_by
output.SetValue(1, output_idx, ParquetElementStringVal(meta_data->created_by, meta_data->__isset.created_by));
// num_rows
output.SetValue(2, output_idx, Value::BIGINT(meta_data->num_rows));
// num_row_groups
output.SetValue(3, output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(meta_data->row_groups.size())));
// format_version
output.SetValue(4, output_idx, Value::BIGINT(meta_data->version));
// encryption_algorithm
output.SetValue(5, output_idx,
ParquetElementString(meta_data->encryption_algorithm, meta_data->__isset.encryption_algorithm));
// footer_signing_key_metadata
output.SetValue(6, output_idx,
ParquetElementStringVal(meta_data->footer_signing_key_metadata,
meta_data->__isset.footer_signing_key_metadata));
// file_size_bytes
output.SetValue(7, output_idx, Value::UBIGINT(reader->GetHandle().GetFileSize()));
// footer_size
output.SetValue(8, output_idx, Value::UBIGINT(reader->metadata->footer_size));
}
//===--------------------------------------------------------------------===//
// Bloom Probe
//===--------------------------------------------------------------------===//
class ParquetBloomProbeProcessor : public ParquetMetadataFileProcessor {
public:
ParquetBloomProbeProcessor(const string &probe_column, const Value &probe_value);
void InitializeInternal(ClientContext &context) override;
idx_t TotalRowCount() override;
void ReadRow(DataChunk &output, idx_t output_idx, idx_t row_idx) override;
private:
string probe_column_name;
Value probe_constant;
optional_idx probe_column_idx;
unique_ptr<duckdb_apache::thrift::protocol::TCompactProtocolT<ThriftFileTransport>> protocol;
optional_ptr<Allocator> allocator;
unique_ptr<ConstantFilter> filter;
};
template <>
void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::BLOOM_PROBE>(vector<LogicalType> &return_types,
vector<string> &names) {
names.emplace_back("file_name");
return_types.emplace_back(LogicalType::VARCHAR);
names.emplace_back("row_group_id");
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("bloom_filter_excludes");
return_types.emplace_back(LogicalType::BOOLEAN);
}
ParquetBloomProbeProcessor::ParquetBloomProbeProcessor(const string &probe_column, const Value &probe_value)
: probe_column_name(probe_column), probe_constant(probe_value) {
}
void ParquetBloomProbeProcessor::InitializeInternal(ClientContext &context) {
probe_column_idx = optional_idx::Invalid();
for (idx_t column_idx = 0; column_idx < reader->columns.size(); column_idx++) {
if (reader->columns[column_idx].name == probe_column_name) {
probe_column_idx = column_idx;
break;
}
}
if (!probe_column_idx.IsValid()) {
throw InvalidInputException("Column %s not found in %s", probe_column_name, reader->file.path);
}
auto transport = duckdb_base_std::make_shared<ThriftFileTransport>(reader->GetHandle(), false);
protocol = make_uniq<duckdb_apache::thrift::protocol::TCompactProtocolT<ThriftFileTransport>>(std::move(transport));
allocator = &BufferAllocator::Get(context);
filter = make_uniq<ConstantFilter>(
ExpressionType::COMPARE_EQUAL,
probe_constant.CastAs(context, reader->GetColumns()[probe_column_idx.GetIndex()].type));
}
idx_t ParquetBloomProbeProcessor::TotalRowCount() {
return reader->GetFileMetadata()->row_groups.size();
}
void ParquetBloomProbeProcessor::ReadRow(DataChunk &output, idx_t output_idx, idx_t row_idx) {
auto meta_data = reader->GetFileMetadata();
auto &row_group = meta_data->row_groups[row_idx];
auto &column = row_group.columns[probe_column_idx.GetIndex()];
D_ASSERT(!probe_constant.IsNull());
auto bloom_excludes = ParquetStatisticsUtils::BloomFilterExcludes(*filter, column.meta_data, *protocol, *allocator);
output.SetValue(0, output_idx, Value(reader->file.path));
output.SetValue(1, output_idx, Value::BIGINT(NumericCast<int64_t>(row_idx)));
output.SetValue(2, output_idx, Value::BOOLEAN(bloom_excludes));
}
//===--------------------------------------------------------------------===//
// Template Function Implementation
//===--------------------------------------------------------------------===//
template <ParquetMetadataOperatorType OP_TYPE>
unique_ptr<FunctionData> ParquetMetaDataOperator::Bind(ClientContext &context, TableFunctionBindInput &input,
vector<LogicalType> &return_types, vector<string> &names) {
// Extract file paths from input using MultiFileReader (handles both single files and arrays)
auto multi_file_reader = MultiFileReader::CreateDefault("ParquetMetadata");
auto glob_input = FileGlobInput(FileGlobOptions::FALLBACK_GLOB, "parquet");
auto result = make_uniq<ParquetMetaDataBindData>();
// Bind schema based on operation type
if (OP_TYPE == ParquetMetadataOperatorType::BLOOM_PROBE) {
auto probe_bind_data = make_uniq<ParquetBloomProbeBindData>();
D_ASSERT(input.inputs.size() == 3);
if (input.inputs[1].IsNull() || input.inputs[2].IsNull()) {
throw InvalidInputException("Can't have NULL parameters for parquet_bloom_probe");
}
probe_bind_data->probe_column_name = input.inputs[1].CastAs(context, LogicalType::VARCHAR).GetValue<string>();
probe_bind_data->probe_constant = input.inputs[2];
result = std::move(probe_bind_data);
}
result->file_paths = make_uniq<ParquetMetadataFilePaths>();
result->file_paths->file_list = multi_file_reader->CreateFileList(context, input.inputs[0], glob_input);
D_ASSERT(!result->file_paths->file_list->IsEmpty());
result->file_paths->file_list->InitializeScan(result->file_paths->scan_data);
BindSchema<OP_TYPE>(return_types, names);
return std::move(result);
}
unique_ptr<GlobalTableFunctionState> ParquetMetaDataOperator::InitGlobal(ClientContext &context,
TableFunctionInitInput &input) {
auto &bind_data = input.bind_data->CastNoConst<ParquetMetaDataBindData>();
return make_uniq<ParquetMetadataGlobalState>(std::move(bind_data.file_paths), context);
}
template <ParquetMetadataOperatorType OP_TYPE>
unique_ptr<LocalTableFunctionState> ParquetMetaDataOperator::InitLocal(ExecutionContext &context,
TableFunctionInitInput &input,
GlobalTableFunctionState *global_state) {
auto &bind_data = input.bind_data->Cast<ParquetMetaDataBindData>();
auto res = make_uniq<ParquetMetadataLocalState>();
switch (OP_TYPE) {
case ParquetMetadataOperatorType::META_DATA:
res->processor = make_uniq<ParquetRowGroupMetadataProcessor>();
break;
case ParquetMetadataOperatorType::SCHEMA:
res->processor = make_uniq<ParquetSchemaProcessor>();
break;
case ParquetMetadataOperatorType::KEY_VALUE_META_DATA:
res->processor = make_uniq<ParquetKeyValueMetadataProcessor>();
break;
case ParquetMetadataOperatorType::FILE_META_DATA:
res->processor = make_uniq<ParquetFileMetadataProcessor>();
break;
case ParquetMetadataOperatorType::BLOOM_PROBE: {
const auto &probe_bind_data = static_cast<const ParquetBloomProbeBindData &>(bind_data);
res->processor =
make_uniq<ParquetBloomProbeProcessor>(probe_bind_data.probe_column_name, probe_bind_data.probe_constant);
break;
}
default:
throw InternalException("Unsupported ParquetMetadataOperatorType");
}
return unique_ptr_cast<LocalTableFunctionState, ParquetMetadataLocalState>(std::move(res));
}
template <ParquetMetadataOperatorType OP_TYPE>
void ParquetMetaDataOperator::Function(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
auto &global_state = data_p.global_state->Cast<ParquetMetadataGlobalState>();
auto &local_state = data_p.local_state->Cast<ParquetMetadataLocalState>();
idx_t output_count = 0;
while (output_count < STANDARD_VECTOR_SIZE) {
// Check if we need a new file
if (local_state.file_exhausted) {
OpenFileInfo next_file;
if (!global_state.file_paths->NextFile(next_file)) {
break; // No more files to process
}
local_state.processor->Initialize(context, next_file);
local_state.processor->InitializeInternal(context);
local_state.file_exhausted = false;
local_state.row_idx = 0;
local_state.total_rows = local_state.processor->TotalRowCount();
}
idx_t left_in_vector = STANDARD_VECTOR_SIZE - output_count;
idx_t left_in_file = local_state.total_rows - local_state.row_idx;
idx_t rows_to_output = 0;
if (left_in_file <= left_in_vector) {
local_state.file_exhausted = true;
rows_to_output = left_in_file;
} else {
rows_to_output = left_in_vector;
}
for (idx_t i = 0; i < rows_to_output; ++i) {
local_state.processor->ReadRow(output, output_count + i, local_state.row_idx + i);
}
output_count += rows_to_output;
local_state.row_idx += rows_to_output;
}
output.SetCardinality(output_count);
}
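// The loop above emits at most STANDARD_VECTOR_SIZE rows per call, pulling the next file from the shared
// global state whenever the current file's rows run out, so metadata for an arbitrary number of files is
// streamed one vector at a time rather than materialized up front.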
double ParquetMetaDataOperator::Progress(ClientContext &context, const FunctionData *bind_data_p,
const GlobalTableFunctionState *global_state) {
auto &global_data = global_state->Cast<ParquetMetadataGlobalState>();
return global_data.GetProgress() * 100.0;
}
ParquetMetaDataFunction::ParquetMetaDataFunction()
: TableFunction("parquet_metadata", {LogicalType::VARCHAR},
ParquetMetaDataOperator::Function<ParquetMetadataOperatorType::META_DATA>,
ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::META_DATA>,
ParquetMetaDataOperator::InitGlobal,
ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::META_DATA>) {
table_scan_progress = ParquetMetaDataOperator::Progress;
}
ParquetSchemaFunction::ParquetSchemaFunction()
: TableFunction("parquet_schema", {LogicalType::VARCHAR},
ParquetMetaDataOperator::Function<ParquetMetadataOperatorType::SCHEMA>,
ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::SCHEMA>,
ParquetMetaDataOperator::InitGlobal,
ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::SCHEMA>) {
table_scan_progress = ParquetMetaDataOperator::Progress;
}
ParquetKeyValueMetadataFunction::ParquetKeyValueMetadataFunction()
: TableFunction("parquet_kv_metadata", {LogicalType::VARCHAR},
ParquetMetaDataOperator::Function<ParquetMetadataOperatorType::KEY_VALUE_META_DATA>,
ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::KEY_VALUE_META_DATA>,
ParquetMetaDataOperator::InitGlobal,
ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::KEY_VALUE_META_DATA>) {
table_scan_progress = ParquetMetaDataOperator::Progress;
}
ParquetFileMetadataFunction::ParquetFileMetadataFunction()
: TableFunction("parquet_file_metadata", {LogicalType::VARCHAR},
ParquetMetaDataOperator::Function<ParquetMetadataOperatorType::FILE_META_DATA>,
ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::FILE_META_DATA>,
ParquetMetaDataOperator::InitGlobal,
ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::FILE_META_DATA>) {
table_scan_progress = ParquetMetaDataOperator::Progress;
}
ParquetBloomProbeFunction::ParquetBloomProbeFunction()
: TableFunction("parquet_bloom_probe", {LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::ANY},
ParquetMetaDataOperator::Function<ParquetMetadataOperatorType::BLOOM_PROBE>,
ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::BLOOM_PROBE>,
ParquetMetaDataOperator::InitGlobal,
ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::BLOOM_PROBE>) {
table_scan_progress = ParquetMetaDataOperator::Progress;
}
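// Illustrative SQL usage of the functions registered above (assuming the standard DuckDB invocation of
// table functions; the argument shapes follow the signatures declared in the constructors):
//   SELECT * FROM parquet_metadata('data.parquet');
//   SELECT * FROM parquet_schema('data/*.parquet');
//   SELECT * FROM parquet_kv_metadata('data.parquet');
//   SELECT * FROM parquet_file_metadata('data.parquet');
//   SELECT * FROM parquet_bloom_probe('data.parquet', 'some_column', 42);
// The first argument is expanded through MultiFileReader in Bind, so globs are accepted as well.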
} // namespace duckdb


@@ -0,0 +1,594 @@
#include "parquet_multi_file_info.hpp"
#include "duckdb/common/multi_file/multi_file_function.hpp"
#include "duckdb/parser/parsed_data/create_table_function_info.hpp"
#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/common/serializer/deserializer.hpp"
#include "parquet_crypto.hpp"
#include "duckdb/function/table_function.hpp"
namespace duckdb {
struct ParquetReadBindData : public TableFunctionData {
// These come from the initial_reader, but need to be stored in case the initial_reader is removed by a filter
idx_t initial_file_cardinality;
idx_t initial_file_row_groups;
idx_t explicit_cardinality = 0; // can be set to inject exterior cardinality knowledge (e.g. from a data lake)
unique_ptr<ParquetFileReaderOptions> options;
ParquetOptions &GetParquetOptions() {
return options->options;
}
const ParquetOptions &GetParquetOptions() const {
return options->options;
}
unique_ptr<FunctionData> Copy() const override {
auto result = make_uniq<ParquetReadBindData>();
result->initial_file_cardinality = initial_file_cardinality;
result->initial_file_row_groups = initial_file_row_groups;
result->explicit_cardinality = explicit_cardinality;
result->options = make_uniq<ParquetFileReaderOptions>(options->options);
return std::move(result);
}
};
struct ParquetReadGlobalState : public GlobalTableFunctionState {
explicit ParquetReadGlobalState(optional_ptr<const PhysicalOperator> op_p)
: row_group_index(0), batch_index(0), op(op_p) {
}
//! Index of row group within file currently up for scanning
idx_t row_group_index;
//! Batch index of the next row group to be scanned
idx_t batch_index;
//! (Optional) pointer to physical operator performing the scan
optional_ptr<const PhysicalOperator> op;
};
struct ParquetReadLocalState : public LocalTableFunctionState {
ParquetReaderScanState scan_state;
};
static void ParseFileRowNumberOption(MultiFileReaderBindData &bind_data, ParquetOptions &options,
vector<LogicalType> &return_types, vector<string> &names) {
if (options.file_row_number) {
if (StringUtil::CIFind(names, "file_row_number") != DConstants::INVALID_INDEX) {
throw BinderException(
"Using file_row_number option on file with column named file_row_number is not supported");
}
return_types.emplace_back(LogicalType::BIGINT);
names.emplace_back("file_row_number");
}
}
static void BindSchema(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
MultiFileBindData &bind_data) {
auto &parquet_bind = bind_data.bind_data->Cast<ParquetReadBindData>();
auto &options = parquet_bind.GetParquetOptions();
D_ASSERT(!options.schema.empty());
auto &file_options = bind_data.file_options;
if (file_options.union_by_name || file_options.hive_partitioning) {
throw BinderException("Parquet schema cannot be combined with union_by_name=true or hive_partitioning=true");
}
auto &reader_bind = bind_data.reader_bind;
vector<string> schema_col_names;
vector<LogicalType> schema_col_types;
schema_col_names.reserve(options.schema.size());
schema_col_types.reserve(options.schema.size());
bool match_by_field_id;
if (!options.schema.empty()) {
auto &column = options.schema[0];
if (column.identifier.type().id() == LogicalTypeId::INTEGER) {
match_by_field_id = true;
} else {
match_by_field_id = false;
}
} else {
match_by_field_id = false;
}
for (idx_t i = 0; i < options.schema.size(); i++) {
const auto &column = options.schema[i];
schema_col_names.push_back(column.name);
schema_col_types.push_back(column.type);
auto res = MultiFileColumnDefinition(column.name, column.type);
res.identifier = column.identifier;
#ifdef DEBUG
if (match_by_field_id) {
D_ASSERT(res.identifier.type().id() == LogicalTypeId::INTEGER);
} else {
D_ASSERT(res.identifier.type().id() == LogicalTypeId::VARCHAR);
}
#endif
res.default_expression = make_uniq<ConstantExpression>(column.default_value);
reader_bind.schema.emplace_back(res);
}
ParseFileRowNumberOption(reader_bind, options, return_types, names);
if (options.file_row_number) {
MultiFileColumnDefinition res("file_row_number", LogicalType::BIGINT);
res.identifier = Value::INTEGER(MultiFileReader::ORDINAL_FIELD_ID);
schema_col_names.push_back(res.name);
schema_col_types.push_back(res.type);
reader_bind.schema.emplace_back(res);
}
if (match_by_field_id) {
reader_bind.mapping = MultiFileColumnMappingMode::BY_FIELD_ID;
} else {
reader_bind.mapping = MultiFileColumnMappingMode::BY_NAME;
}
// perform the binding on the obtained set of names + types
bind_data.multi_file_reader->BindOptions(file_options, *bind_data.file_list, schema_col_types, schema_col_names,
reader_bind);
names = schema_col_names;
return_types = schema_col_types;
D_ASSERT(names.size() == return_types.size());
}
unique_ptr<MultiFileReaderInterface> ParquetMultiFileInfo::CreateInterface(ClientContext &context) {
return make_uniq<ParquetMultiFileInfo>();
}
void ParquetMultiFileInfo::BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
MultiFileBindData &bind_data) {
auto &parquet_bind = bind_data.bind_data->Cast<ParquetReadBindData>();
auto &options = parquet_bind.GetParquetOptions();
if (!options.schema.empty()) {
BindSchema(context, return_types, names, bind_data);
} else {
bind_data.reader_bind =
bind_data.multi_file_reader->BindReader(context, return_types, names, *bind_data.file_list, bind_data,
*parquet_bind.options, bind_data.file_options);
}
}
static bool GetBooleanArgument(const string &key, const vector<Value> &option_values) {
if (option_values.empty()) {
return true;
}
Value boolean_value;
string error_message;
if (!option_values[0].DefaultTryCastAs(LogicalType::BOOLEAN, boolean_value, &error_message)) {
throw InvalidInputException("Unable to cast \"%s\" to BOOLEAN for Parquet option \"%s\"",
option_values[0].ToString(), key);
}
return BooleanValue::Get(boolean_value);
}
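// An option supplied without a value (an empty value list) is treated as TRUE by the helper above, so
// e.g. "(file_row_number)" and "(file_row_number true)" in a COPY option list are expected to behave the
// same (assumption about the COPY syntax; the helper itself only sees the parsed value list).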
static bool ParquetScanPushdownExpression(ClientContext &context, const LogicalGet &get, Expression &expr) {
return true;
}
static void VerifyParquetSchemaParameter(const Value &schema) {
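// The MAP(...) expression below only spells out the expected shape of the 'schema' argument; its result
// is discarded and the explicit checks that follow do the actual validation.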
LogicalType::MAP(LogicalType::BLOB, LogicalType::STRUCT({{{"name", LogicalType::VARCHAR},
{"type", LogicalType::VARCHAR},
{"default_value", LogicalType::VARCHAR}}}));
auto &map_type = schema.type();
if (map_type.id() != LogicalTypeId::MAP) {
throw InvalidInputException("'schema' expects a value of type MAP, not %s",
LogicalTypeIdToString(map_type.id()));
}
auto &key_type = MapType::KeyType(map_type);
auto &value_type = MapType::ValueType(map_type);
if (value_type.id() != LogicalTypeId::STRUCT) {
throw InvalidInputException("'schema' expects a STRUCT as the value type of the map");
}
auto &children = StructType::GetChildTypes(value_type);
if (children.size() < 3) {
throw InvalidInputException(
"'schema' expects the STRUCT to have 3 children, 'name', 'type' and 'default_value");
}
if (!StringUtil::CIEquals(children[0].first, "name")) {
throw InvalidInputException("'schema' expects the first field of the struct to be called 'name'");
}
if (children[0].second.id() != LogicalTypeId::VARCHAR) {
throw InvalidInputException("'schema' expects the 'name' field to be of type VARCHAR, not %s",
LogicalTypeIdToString(children[0].second.id()));
}
if (!StringUtil::CIEquals(children[1].first, "type")) {
throw InvalidInputException("'schema' expects the second field of the struct to be called 'type'");
}
if (children[1].second.id() != LogicalTypeId::VARCHAR) {
throw InvalidInputException("'schema' expects the 'type' field to be of type VARCHAR, not %s",
LogicalTypeIdToString(children[1].second.id()));
}
if (!StringUtil::CIEquals(children[2].first, "default_value")) {
throw InvalidInputException("'schema' expects the third field of the struct to be called 'default_value'");
}
//! NOTE: default_value can be any type
if (key_type.id() != LogicalTypeId::INTEGER && key_type.id() != LogicalTypeId::VARCHAR) {
throw InvalidInputException(
"'schema' expects the value type of the map to be either INTEGER or VARCHAR, not %s",
LogicalTypeIdToString(key_type.id()));
}
}
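// A hypothetical example of a 'schema' value that passes the checks above (the exact literal syntax is
// whatever DuckDB's MAP/STRUCT constructors accept; per-column parsing happens later in
// ParquetColumnDefinition::FromSchemaValue):
//   schema = MAP {
//       1: {name: 'id',    type: 'INTEGER', default_value: NULL},
//       2: {name: 'label', type: 'VARCHAR', default_value: 'unknown'}
//   }
// Keys are either INTEGER field ids or VARCHAR column names; the struct must expose 'name', 'type' and
// 'default_value' in that order.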
static void ParquetScanSerialize(Serializer &serializer, const optional_ptr<FunctionData> bind_data_p,
const TableFunction &function) {
auto &bind_data = bind_data_p->Cast<MultiFileBindData>();
auto &parquet_data = bind_data.bind_data->Cast<ParquetReadBindData>();
vector<string> files;
for (auto &file : bind_data.file_list->GetAllFiles()) {
files.emplace_back(file.path);
}
serializer.WriteProperty(100, "files", files);
serializer.WriteProperty(101, "types", bind_data.types);
serializer.WriteProperty(102, "names", bind_data.names);
ParquetOptionsSerialization serialization(parquet_data.GetParquetOptions(), bind_data.file_options);
serializer.WriteProperty(103, "parquet_options", serialization);
if (serializer.ShouldSerialize(3)) {
serializer.WriteProperty(104, "table_columns", bind_data.table_columns);
}
}
static unique_ptr<FunctionData> ParquetScanDeserialize(Deserializer &deserializer, TableFunction &function) {
auto &context = deserializer.Get<ClientContext &>();
auto files = deserializer.ReadProperty<vector<string>>(100, "files");
auto types = deserializer.ReadProperty<vector<LogicalType>>(101, "types");
auto names = deserializer.ReadProperty<vector<string>>(102, "names");
auto serialization = deserializer.ReadProperty<ParquetOptionsSerialization>(103, "parquet_options");
auto table_columns =
deserializer.ReadPropertyWithExplicitDefault<vector<string>>(104, "table_columns", vector<string> {});
vector<Value> file_path;
for (auto &path : files) {
file_path.emplace_back(path);
}
FileGlobInput input(FileGlobOptions::FALLBACK_GLOB, "parquet");
auto multi_file_reader = MultiFileReader::Create(function);
auto file_list = multi_file_reader->CreateFileList(context, Value::LIST(LogicalType::VARCHAR, file_path), input);
auto parquet_options = make_uniq<ParquetFileReaderOptions>(std::move(serialization.parquet_options));
auto interface = make_uniq<ParquetMultiFileInfo>();
auto bind_data = MultiFileFunction<ParquetMultiFileInfo>::MultiFileBindInternal(
context, std::move(multi_file_reader), std::move(file_list), types, names,
std::move(serialization.file_options), std::move(parquet_options), std::move(interface));
bind_data->Cast<MultiFileBindData>().table_columns = std::move(table_columns);
return bind_data;
}
static vector<column_t> ParquetGetRowIdColumns(ClientContext &context, optional_ptr<FunctionData> bind_data) {
vector<column_t> result;
result.emplace_back(MultiFileReader::COLUMN_IDENTIFIER_FILE_INDEX);
result.emplace_back(MultiFileReader::COLUMN_IDENTIFIER_FILE_ROW_NUMBER);
return result;
}
static vector<PartitionStatistics> ParquetGetPartitionStats(ClientContext &context, GetPartitionStatsInput &input) {
auto &bind_data = input.bind_data->Cast<MultiFileBindData>();
vector<PartitionStatistics> result;
if (bind_data.file_list->GetExpandResult() == FileExpandResult::SINGLE_FILE && bind_data.initial_reader) {
// we have read the metadata - get the partitions for this reader
auto &reader = bind_data.initial_reader->Cast<ParquetReader>();
reader.GetPartitionStats(result);
return result;
}
// if we are reading multiple files - we check if we have caching enabled
if (!ParquetReader::MetadataCacheEnabled(context)) {
// no caching - bail
return result;
}
// caching is enabled - check if we have ALL of the metadata cached
vector<shared_ptr<ParquetFileMetadataCache>> caches;
for (auto &file : bind_data.file_list->Files()) {
auto metadata_entry = ParquetReader::GetMetadataCacheEntry(context, file);
if (!metadata_entry) {
// no cache entry found
return result;
}
// check if the file has any deletes
if (file.extended_info) {
auto entry = file.extended_info->options.find("has_deletes");
if (entry != file.extended_info->options.end()) {
if (BooleanValue::Get(entry->second)) {
// the file has deletes - skip emitting partition stats
// FIXME: we could emit partition stats but set count to `COUNT_APPROXIMATE` instead of
// `COUNT_EXACT`
return result;
}
}
}
// check if the cache is valid based ONLY on the OpenFileInfo (do not do any file system requests here)
auto is_valid = metadata_entry->IsValid(file);
if (is_valid != ParquetCacheValidity::VALID) {
return result;
}
caches.push_back(std::move(metadata_entry));
}
// all caches are valid! we can return the partition stats
for (auto &cache : caches) {
ParquetReader::GetPartitionStats(*cache->metadata, result);
}
return result;
}
TableFunctionSet ParquetScanFunction::GetFunctionSet() {
MultiFileFunction<ParquetMultiFileInfo> table_function("parquet_scan");
table_function.named_parameters["binary_as_string"] = LogicalType::BOOLEAN;
table_function.named_parameters["file_row_number"] = LogicalType::BOOLEAN;
table_function.named_parameters["debug_use_openssl"] = LogicalType::BOOLEAN;
table_function.named_parameters["compression"] = LogicalType::VARCHAR;
table_function.named_parameters["explicit_cardinality"] = LogicalType::UBIGINT;
table_function.named_parameters["schema"] = LogicalTypeId::ANY;
table_function.named_parameters["encryption_config"] = LogicalTypeId::ANY;
table_function.named_parameters["parquet_version"] = LogicalType::VARCHAR;
table_function.named_parameters["can_have_nan"] = LogicalType::BOOLEAN;
table_function.statistics = MultiFileFunction<ParquetMultiFileInfo>::MultiFileScanStats;
table_function.serialize = ParquetScanSerialize;
table_function.deserialize = ParquetScanDeserialize;
table_function.get_row_id_columns = ParquetGetRowIdColumns;
table_function.pushdown_expression = ParquetScanPushdownExpression;
table_function.get_partition_stats = ParquetGetPartitionStats;
table_function.filter_pushdown = true;
table_function.filter_prune = true;
table_function.late_materialization = true;
return MultiFileReader::CreateFunctionSet(static_cast<TableFunction>(table_function));
}
unique_ptr<BaseFileReaderOptions> ParquetMultiFileInfo::InitializeOptions(ClientContext &context,
optional_ptr<TableFunctionInfo> info) {
return make_uniq<ParquetFileReaderOptions>(context);
}
bool ParquetMultiFileInfo::ParseCopyOption(ClientContext &context, const string &key, const vector<Value> &values,
BaseFileReaderOptions &file_options, vector<string> &expected_names,
vector<LogicalType> &expected_types) {
auto &parquet_options = file_options.Cast<ParquetFileReaderOptions>();
auto &options = parquet_options.options;
if (key == "compression" || key == "codec" || key == "row_group_size") {
// CODEC/COMPRESSION and ROW_GROUP_SIZE options have no effect on parquet read.
// These options are determined from the file.
return true;
}
if (key == "binary_as_string") {
options.binary_as_string = GetBooleanArgument(key, values);
return true;
}
if (key == "file_row_number") {
options.file_row_number = GetBooleanArgument(key, values);
return true;
}
if (key == "debug_use_openssl") {
options.debug_use_openssl = GetBooleanArgument(key, values);
return true;
}
if (key == "encryption_config") {
if (values.size() != 1) {
throw BinderException("Parquet encryption_config cannot be empty!");
}
options.encryption_config = ParquetEncryptionConfig::Create(context, values[0]);
return true;
}
if (key == "can_have_nan") {
if (values.size() != 1) {
throw BinderException("Parquet can_have_nan cannot be empty!");
}
options.can_have_nan = GetBooleanArgument(key, values);
return true;
}
return false;
}
bool ParquetMultiFileInfo::ParseOption(ClientContext &context, const string &original_key, const Value &val,
MultiFileOptions &file_options, BaseFileReaderOptions &base_options) {
auto &parquet_options = base_options.Cast<ParquetFileReaderOptions>();
auto &options = parquet_options.options;
auto key = StringUtil::Lower(original_key);
if (val.IsNull()) {
throw BinderException("Cannot use NULL as argument to %s", original_key);
}
if (key == "compression") {
// COMPRESSION has no effect on parquet read.
// These options are determined from the file.
return true;
}
if (key == "binary_as_string") {
options.binary_as_string = BooleanValue::Get(val);
return true;
}
if (key == "variant_legacy_encoding") {
options.variant_legacy_encoding = BooleanValue::Get(val);
return true;
}
if (key == "file_row_number") {
options.file_row_number = BooleanValue::Get(val);
return true;
}
if (key == "debug_use_openssl") {
options.debug_use_openssl = BooleanValue::Get(val);
return true;
}
if (key == "can_have_nan") {
options.can_have_nan = BooleanValue::Get(val);
return true;
}
if (key == "schema") {
// Argument is a map that defines the schema
const auto &schema_value = val;
VerifyParquetSchemaParameter(schema_value);
const auto column_values = ListValue::GetChildren(schema_value);
if (column_values.empty()) {
throw BinderException("Parquet schema cannot be empty");
}
options.schema.reserve(column_values.size());
for (idx_t i = 0; i < column_values.size(); i++) {
options.schema.emplace_back(ParquetColumnDefinition::FromSchemaValue(context, column_values[i]));
}
file_options.auto_detect_hive_partitioning = false;
return true;
}
if (key == "explicit_cardinality") {
options.explicit_cardinality = UBigIntValue::Get(val);
return true;
}
if (key == "encryption_config") {
options.encryption_config = ParquetEncryptionConfig::Create(context, val);
return true;
}
return false;
}
unique_ptr<TableFunctionData> ParquetMultiFileInfo::InitializeBindData(MultiFileBindData &multi_file_data,
unique_ptr<BaseFileReaderOptions> options_p) {
auto result = make_uniq<ParquetReadBindData>();
// Set the explicit cardinality if requested
result->options = unique_ptr_cast<BaseFileReaderOptions, ParquetFileReaderOptions>(std::move(options_p));
auto &parquet_options = result->GetParquetOptions();
if (parquet_options.explicit_cardinality) {
auto file_count = multi_file_data.file_list->GetTotalFileCount();
result->explicit_cardinality = parquet_options.explicit_cardinality;
result->initial_file_cardinality = result->explicit_cardinality / (file_count ? file_count : 1);
}
return std::move(result);
}
void ParquetMultiFileInfo::GetBindInfo(const TableFunctionData &bind_data_p, BindInfo &info) {
auto &bind_data = bind_data_p.Cast<ParquetReadBindData>();
auto &parquet_options = bind_data.GetParquetOptions();
info.type = ScanType::PARQUET;
info.InsertOption("binary_as_string", Value::BOOLEAN(parquet_options.binary_as_string));
info.InsertOption("file_row_number", Value::BOOLEAN(parquet_options.file_row_number));
info.InsertOption("debug_use_openssl", Value::BOOLEAN(parquet_options.debug_use_openssl));
}
optional_idx ParquetMultiFileInfo::MaxThreads(const MultiFileBindData &bind_data_p,
const MultiFileGlobalState &global_state,
FileExpandResult expand_result) {
if (expand_result == FileExpandResult::MULTIPLE_FILES) {
// always launch max threads if we are reading multiple files
return optional_idx();
}
auto &bind_data = bind_data_p.bind_data->Cast<ParquetReadBindData>();
return MaxValue(bind_data.initial_file_row_groups, static_cast<idx_t>(1));
}
void ParquetMultiFileInfo::FinalizeBindData(MultiFileBindData &multi_file_data) {
auto &bind_data = multi_file_data.bind_data->Cast<ParquetReadBindData>();
if (multi_file_data.initial_reader) {
auto &initial_reader = multi_file_data.initial_reader->Cast<ParquetReader>();
bind_data.initial_file_cardinality = initial_reader.NumRows();
bind_data.initial_file_row_groups = initial_reader.NumRowGroups();
bind_data.options->options = initial_reader.parquet_options;
}
}
unique_ptr<NodeStatistics> ParquetMultiFileInfo::GetCardinality(const MultiFileBindData &bind_data_p,
idx_t file_count) {
auto &bind_data = bind_data_p.bind_data->Cast<ParquetReadBindData>();
if (bind_data.explicit_cardinality) {
return make_uniq<NodeStatistics>(bind_data.explicit_cardinality);
}
return make_uniq<NodeStatistics>(MaxValue(bind_data.initial_file_cardinality, (idx_t)1) * file_count);
}
unique_ptr<BaseStatistics> ParquetReader::GetStatistics(ClientContext &context, const string &name) {
return ReadStatistics(name);
}
double ParquetReader::GetProgressInFile(ClientContext &context) {
auto read_rows = rows_read.load();
return 100.0 * (static_cast<double>(read_rows) / static_cast<double>(NumRows()));
}
void ParquetMultiFileInfo::GetVirtualColumns(ClientContext &, MultiFileBindData &, virtual_column_map_t &result) {
result.insert(make_pair(MultiFileReader::COLUMN_IDENTIFIER_FILE_ROW_NUMBER,
TableColumn("file_row_number", LogicalType::BIGINT)));
}
shared_ptr<BaseFileReader> ParquetMultiFileInfo::CreateReader(ClientContext &context, GlobalTableFunctionState &,
BaseUnionData &union_data_p,
const MultiFileBindData &bind_data_p) {
auto &union_data = union_data_p.Cast<ParquetUnionData>();
return make_shared_ptr<ParquetReader>(context, union_data.file, union_data.options, union_data.metadata);
}
shared_ptr<BaseFileReader> ParquetMultiFileInfo::CreateReader(ClientContext &context, GlobalTableFunctionState &,
const OpenFileInfo &file, idx_t file_idx,
const MultiFileBindData &multi_bind_data) {
auto &bind_data = multi_bind_data.bind_data->Cast<ParquetReadBindData>();
return make_shared_ptr<ParquetReader>(context, file, bind_data.GetParquetOptions());
}
shared_ptr<BaseFileReader> ParquetMultiFileInfo::CreateReader(ClientContext &context, const OpenFileInfo &file,
BaseFileReaderOptions &options_p,
const MultiFileOptions &) {
auto &options = options_p.Cast<ParquetFileReaderOptions>();
return make_shared_ptr<ParquetReader>(context, file, options.options);
}
shared_ptr<BaseUnionData> ParquetReader::GetUnionData(idx_t file_idx) {
auto result = make_uniq<ParquetUnionData>(file);
for (auto &column : columns) {
result->names.push_back(column.name);
result->types.push_back(column.type);
}
if (file_idx == 0) {
result->options = parquet_options;
result->metadata = metadata;
result->reader = shared_from_this();
} else {
result->options = std::move(parquet_options);
result->metadata = std::move(metadata);
result->root_schema = std::move(root_schema);
}
return std::move(result);
}
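// For the first file (file_idx == 0) the union data keeps the reader itself alive via shared_from_this(),
// presumably so its already-parsed metadata can be reused by the subsequent scan; for the remaining files
// only options, metadata and root schema are moved out and the temporary reader can be dropped.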
unique_ptr<GlobalTableFunctionState> ParquetMultiFileInfo::InitializeGlobalState(ClientContext &, MultiFileBindData &,
MultiFileGlobalState &global_state) {
return make_uniq<ParquetReadGlobalState>(global_state.op);
}
unique_ptr<LocalTableFunctionState> ParquetMultiFileInfo::InitializeLocalState(ExecutionContext &,
GlobalTableFunctionState &) {
return make_uniq<ParquetReadLocalState>();
}
bool ParquetReader::TryInitializeScan(ClientContext &context, GlobalTableFunctionState &gstate_p,
LocalTableFunctionState &lstate_p) {
auto &gstate = gstate_p.Cast<ParquetReadGlobalState>();
auto &lstate = lstate_p.Cast<ParquetReadLocalState>();
if (gstate.row_group_index >= NumRowGroups()) {
// scanned all row groups in this file
return false;
}
// The current reader has rowgroups left to be scanned
vector<idx_t> group_indexes {gstate.row_group_index};
InitializeScan(context, lstate.scan_state, group_indexes);
gstate.row_group_index++;
return true;
}
void ParquetReader::FinishFile(ClientContext &context, GlobalTableFunctionState &gstate_p) {
auto &gstate = gstate_p.Cast<ParquetReadGlobalState>();
gstate.row_group_index = 0;
}
void ParquetReader::Scan(ClientContext &context, GlobalTableFunctionState &gstate_p,
LocalTableFunctionState &local_state_p, DataChunk &chunk) {
auto &gstate = gstate_p.Cast<ParquetReadGlobalState>();
auto &local_state = local_state_p.Cast<ParquetReadLocalState>();
local_state.scan_state.op = gstate.op;
Scan(context, local_state.scan_state, chunk);
}
unique_ptr<MultiFileReaderInterface> ParquetMultiFileInfo::Copy() {
return make_uniq<ParquetMultiFileInfo>();
}
FileGlobInput ParquetMultiFileInfo::GetGlobInput() {
return FileGlobInput(FileGlobOptions::FALLBACK_GLOB, "parquet");
}
} // namespace duckdb

File diff suppressed because it is too large


@@ -0,0 +1,81 @@
#include "parquet_shredding.hpp"
#include "duckdb/common/exception/binder_exception.hpp"
#include "duckdb/common/type_visitor.hpp"
namespace duckdb {
ChildShreddingTypes::ChildShreddingTypes() : types(make_uniq<case_insensitive_map_t<ShreddingType>>()) {
}
ChildShreddingTypes ChildShreddingTypes::Copy() const {
ChildShreddingTypes result;
for (const auto &type : *types) {
result.types->emplace(type.first, type.second.Copy());
}
return result;
}
ShreddingType::ShreddingType() : set(false) {
}
ShreddingType::ShreddingType(const LogicalType &type) : set(true), type(type) {
}
ShreddingType ShreddingType::Copy() const {
auto result = set ? ShreddingType(type) : ShreddingType();
result.children = children.Copy();
return result;
}
static ShreddingType ConvertShreddingTypeRecursive(const LogicalType &type) {
if (type.id() == LogicalTypeId::VARIANT) {
return ShreddingType(LogicalType(LogicalTypeId::ANY));
}
if (!type.IsNested()) {
return ShreddingType(type);
}
switch (type.id()) {
case LogicalTypeId::STRUCT: {
ShreddingType res(type);
auto &children = StructType::GetChildTypes(type);
for (auto &entry : children) {
res.AddChild(entry.first, ConvertShreddingTypeRecursive(entry.second));
}
return res;
}
case LogicalTypeId::LIST: {
ShreddingType res(type);
const auto &child = ListType::GetChildType(type);
res.AddChild("element", ConvertShreddingTypeRecursive(child));
return res;
}
default:
break;
}
throw BinderException("VARIANT can only be shredded on LIST/STRUCT/ANY/non-nested type, not %s", type.ToString());
}
void ShreddingType::AddChild(const string &name, ShreddingType &&child) {
children.types->emplace(name, std::move(child));
}
optional_ptr<const ShreddingType> ShreddingType::GetChild(const string &name) const {
auto it = children.types->find(name);
if (it == children.types->end()) {
return nullptr;
}
return it->second;
}
ShreddingType ShreddingType::GetShreddingTypes(const Value &val) {
if (val.type().id() != LogicalTypeId::VARCHAR) {
throw BinderException("SHREDDING value should be of type VARCHAR, a stringified type to use for the column");
}
auto type_str = val.GetValue<string>();
auto logical_type = TransformStringToLogicalType(type_str);
return ConvertShreddingTypeRecursive(logical_type);
}
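// Hypothetical example of the conversion above: GetShreddingTypes(Value("STRUCT(a INTEGER, b VARCHAR[])"))
// parses the string with TransformStringToLogicalType and expands it recursively into a ShreddingType for
// the struct with children "a" (INTEGER) and "b" (LIST whose "element" child is VARCHAR); a nested VARIANT
// would instead become ANY, and any other nested type raises the BinderException above.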
} // namespace duckdb


@@ -0,0 +1,640 @@
#include "parquet_statistics.hpp"
#include "duckdb.hpp"
#include "parquet_decimal_utils.hpp"
#include "parquet_timestamp.hpp"
#include "parquet_float16.hpp"
#include "parquet_reader.hpp"
#include "reader/string_column_reader.hpp"
#include "reader/struct_column_reader.hpp"
#include "zstd/common/xxhash.hpp"
#include "duckdb/common/types/blob.hpp"
#include "duckdb/common/types/time.hpp"
#include "duckdb/common/types/value.hpp"
#include "duckdb/storage/statistics/struct_stats.hpp"
#include "duckdb/planner/filter/constant_filter.hpp"
#include "reader/uuid_column_reader.hpp"
namespace duckdb {
using duckdb_parquet::ConvertedType;
using duckdb_parquet::Type;
unique_ptr<BaseStatistics> ParquetStatisticsUtils::CreateNumericStats(const LogicalType &type,
const ParquetColumnSchema &schema_ele,
const duckdb_parquet::Statistics &parquet_stats) {
auto stats = NumericStats::CreateUnknown(type);
// for reasons unknown to science, Parquet defines *both* `min` and `min_value` as well as `max` and
// `max_value`. All are optional. such elegance.
Value min;
Value max;
if (parquet_stats.__isset.min_value) {
min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min_value);
} else if (parquet_stats.__isset.min) {
min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min);
} else {
min = Value(type);
}
if (parquet_stats.__isset.max_value) {
max = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.max_value);
} else if (parquet_stats.__isset.max) {
max = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.max);
} else {
max = Value(type);
}
NumericStats::SetMin(stats, min);
NumericStats::SetMax(stats, max);
return stats.ToUnique();
}
static unique_ptr<BaseStatistics> CreateFloatingPointStats(const LogicalType &type,
const ParquetColumnSchema &schema_ele,
const duckdb_parquet::Statistics &parquet_stats) {
auto stats = NumericStats::CreateUnknown(type);
// floating point values can always have NaN values - hence we cannot use the max value from the file
Value min;
Value max;
if (parquet_stats.__isset.min_value) {
min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min_value);
} else if (parquet_stats.__isset.min) {
min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min);
} else {
min = Value(type);
}
max = Value("nan").DefaultCastAs(type);
NumericStats::SetMin(stats, min);
NumericStats::SetMax(stats, max);
return stats.ToUnique();
}
Value ParquetStatisticsUtils::ConvertValue(const LogicalType &type, const ParquetColumnSchema &schema_ele,
const std::string &stats) {
Value result;
string error;
auto stats_val = ConvertValueInternal(type, schema_ele, stats);
if (!stats_val.DefaultTryCastAs(type, result, &error)) {
return Value(type);
}
return result;
}
Value ParquetStatisticsUtils::ConvertValueInternal(const LogicalType &type, const ParquetColumnSchema &schema_ele,
const std::string &stats) {
auto stats_data = const_data_ptr_cast(stats.c_str());
switch (type.id()) {
case LogicalTypeId::BOOLEAN: {
if (stats.size() != sizeof(bool)) {
throw InvalidInputException("Incorrect stats size for type BOOLEAN");
}
return Value::BOOLEAN(Load<bool>(stats_data));
}
case LogicalTypeId::UTINYINT:
case LogicalTypeId::USMALLINT:
case LogicalTypeId::UINTEGER:
if (stats.size() != sizeof(uint32_t)) {
throw InvalidInputException("Incorrect stats size for type UINTEGER");
}
return Value::UINTEGER(Load<uint32_t>(stats_data));
case LogicalTypeId::UBIGINT:
if (stats.size() != sizeof(uint64_t)) {
throw InvalidInputException("Incorrect stats size for type UBIGINT");
}
return Value::UBIGINT(Load<uint64_t>(stats_data));
case LogicalTypeId::TINYINT:
case LogicalTypeId::SMALLINT:
case LogicalTypeId::INTEGER:
if (stats.size() != sizeof(int32_t)) {
throw InvalidInputException("Incorrect stats size for type INTEGER");
}
return Value::INTEGER(Load<int32_t>(stats_data));
case LogicalTypeId::BIGINT:
if (stats.size() != sizeof(int64_t)) {
throw InvalidInputException("Incorrect stats size for type BIGINT");
}
return Value::BIGINT(Load<int64_t>(stats_data));
case LogicalTypeId::FLOAT: {
float val;
if (schema_ele.type_info == ParquetExtraTypeInfo::FLOAT16) {
if (stats.size() != sizeof(uint16_t)) {
throw InvalidInputException("Incorrect stats size for type FLOAT16");
}
val = Float16ToFloat32(Load<uint16_t>(stats_data));
} else {
if (stats.size() != sizeof(float)) {
throw InvalidInputException("Incorrect stats size for type FLOAT");
}
val = Load<float>(stats_data);
}
if (!Value::FloatIsFinite(val)) {
return Value();
}
return Value::FLOAT(val);
}
case LogicalTypeId::DOUBLE: {
if (schema_ele.type_info == ParquetExtraTypeInfo::DECIMAL_BYTE_ARRAY) {
// decimals cast to double
return Value::DOUBLE(ParquetDecimalUtils::ReadDecimalValue<double>(stats_data, stats.size(), schema_ele));
}
if (stats.size() != sizeof(double)) {
throw InvalidInputException("Incorrect stats size for type DOUBLE");
}
auto val = Load<double>(stats_data);
if (!Value::DoubleIsFinite(val)) {
return Value();
}
return Value::DOUBLE(val);
}
case LogicalTypeId::DECIMAL: {
auto width = DecimalType::GetWidth(type);
auto scale = DecimalType::GetScale(type);
switch (schema_ele.type_info) {
case ParquetExtraTypeInfo::DECIMAL_INT32:
if (stats.size() != sizeof(int32_t)) {
throw InvalidInputException("Incorrect stats size for type %s", type.ToString());
}
return Value::DECIMAL(Load<int32_t>(stats_data), width, scale);
case ParquetExtraTypeInfo::DECIMAL_INT64:
if (stats.size() != sizeof(int64_t)) {
throw InvalidInputException("Incorrect stats size for type %s", type.ToString());
}
return Value::DECIMAL(Load<int64_t>(stats_data), width, scale);
case ParquetExtraTypeInfo::DECIMAL_BYTE_ARRAY:
switch (type.InternalType()) {
case PhysicalType::INT16:
return Value::DECIMAL(
ParquetDecimalUtils::ReadDecimalValue<int16_t>(stats_data, stats.size(), schema_ele), width, scale);
case PhysicalType::INT32:
return Value::DECIMAL(
ParquetDecimalUtils::ReadDecimalValue<int32_t>(stats_data, stats.size(), schema_ele), width, scale);
case PhysicalType::INT64:
return Value::DECIMAL(
ParquetDecimalUtils::ReadDecimalValue<int64_t>(stats_data, stats.size(), schema_ele), width, scale);
case PhysicalType::INT128:
return Value::DECIMAL(
ParquetDecimalUtils::ReadDecimalValue<hugeint_t>(stats_data, stats.size(), schema_ele), width,
scale);
default:
throw InvalidInputException("Unsupported internal type for decimal");
}
default:
throw NotImplementedException("Unrecognized Parquet type for Decimal");
}
}
case LogicalTypeId::VARCHAR:
case LogicalTypeId::BLOB:
if (type.id() == LogicalTypeId::BLOB || !Value::StringIsValid(stats)) {
return Value(Blob::ToString(string_t(stats)));
}
return Value(stats);
case LogicalTypeId::DATE:
if (stats.size() != sizeof(int32_t)) {
throw InvalidInputException("Incorrect stats size for type DATE");
}
return Value::DATE(date_t(Load<int32_t>(stats_data)));
case LogicalTypeId::TIME: {
int64_t val;
if (stats.size() == sizeof(int32_t)) {
val = Load<int32_t>(stats_data);
} else if (stats.size() == sizeof(int64_t)) {
val = Load<int64_t>(stats_data);
} else {
throw InvalidInputException("Incorrect stats size for type TIME");
}
switch (schema_ele.type_info) {
case ParquetExtraTypeInfo::UNIT_MS:
return Value::TIME(Time::FromTimeMs(val));
case ParquetExtraTypeInfo::UNIT_NS:
return Value::TIME(Time::FromTimeNs(val));
case ParquetExtraTypeInfo::UNIT_MICROS:
default:
return Value::TIME(dtime_t(val));
}
}
case LogicalTypeId::TIME_NS: {
int64_t val;
if (stats.size() == sizeof(int32_t)) {
val = Load<int32_t>(stats_data);
} else if (stats.size() == sizeof(int64_t)) {
val = Load<int64_t>(stats_data);
} else {
throw InvalidInputException("Incorrect stats size for type TIME");
}
switch (schema_ele.type_info) {
case ParquetExtraTypeInfo::UNIT_MS:
return Value::TIME_NS(ParquetMsIntToTimeNs(val));
case ParquetExtraTypeInfo::UNIT_NS:
return Value::TIME_NS(ParquetIntToTimeNs(val));
case ParquetExtraTypeInfo::UNIT_MICROS:
default:
return Value::TIME_NS(dtime_ns_t(val));
}
}
case LogicalTypeId::TIME_TZ: {
int64_t val;
if (stats.size() == sizeof(int32_t)) {
val = Load<int32_t>(stats_data);
} else if (stats.size() == sizeof(int64_t)) {
val = Load<int64_t>(stats_data);
} else {
throw InvalidInputException("Incorrect stats size for type TIMETZ");
}
switch (schema_ele.type_info) {
case ParquetExtraTypeInfo::UNIT_MS:
return Value::TIMETZ(ParquetIntToTimeMsTZ(NumericCast<int32_t>(val)));
case ParquetExtraTypeInfo::UNIT_NS:
return Value::TIMETZ(ParquetIntToTimeNsTZ(val));
case ParquetExtraTypeInfo::UNIT_MICROS:
default:
return Value::TIMETZ(ParquetIntToTimeTZ(val));
}
}
case LogicalTypeId::TIMESTAMP:
case LogicalTypeId::TIMESTAMP_TZ: {
timestamp_t timestamp_value;
if (schema_ele.type_info == ParquetExtraTypeInfo::IMPALA_TIMESTAMP) {
if (stats.size() != sizeof(Int96)) {
throw InvalidInputException("Incorrect stats size for type TIMESTAMP");
}
timestamp_value = ImpalaTimestampToTimestamp(Load<Int96>(stats_data));
} else {
if (stats.size() != sizeof(int64_t)) {
throw InvalidInputException("Incorrect stats size for type TIMESTAMP");
}
auto val = Load<int64_t>(stats_data);
switch (schema_ele.type_info) {
case ParquetExtraTypeInfo::UNIT_MS:
timestamp_value = Timestamp::FromEpochMs(val);
break;
case ParquetExtraTypeInfo::UNIT_NS:
timestamp_value = Timestamp::FromEpochNanoSeconds(val);
break;
case ParquetExtraTypeInfo::UNIT_MICROS:
default:
timestamp_value = timestamp_t(val);
break;
}
}
if (type.id() == LogicalTypeId::TIMESTAMP_TZ) {
return Value::TIMESTAMPTZ(timestamp_tz_t(timestamp_value));
}
return Value::TIMESTAMP(timestamp_value);
}
case LogicalTypeId::TIMESTAMP_NS: {
timestamp_ns_t timestamp_value;
if (schema_ele.type_info == ParquetExtraTypeInfo::IMPALA_TIMESTAMP) {
if (stats.size() != sizeof(Int96)) {
throw InvalidInputException("Incorrect stats size for type TIMESTAMP_NS");
}
timestamp_value = ImpalaTimestampToTimestampNS(Load<Int96>(stats_data));
} else {
if (stats.size() != sizeof(int64_t)) {
throw InvalidInputException("Incorrect stats size for type TIMESTAMP_NS");
}
auto val = Load<int64_t>(stats_data);
switch (schema_ele.type_info) {
case ParquetExtraTypeInfo::UNIT_MS:
timestamp_value = ParquetTimestampMsToTimestampNs(val);
break;
case ParquetExtraTypeInfo::UNIT_NS:
timestamp_value = ParquetTimestampNsToTimestampNs(val);
break;
case ParquetExtraTypeInfo::UNIT_MICROS:
default:
timestamp_value = ParquetTimestampUsToTimestampNs(val);
break;
}
}
return Value::TIMESTAMPNS(timestamp_value);
}
case LogicalTypeId::UUID: {
if (stats.size() != 16) {
throw InvalidInputException("Incorrect stats size for type UUID");
}
auto uuid_val = UUIDValueConversion::ReadParquetUUID(const_data_ptr_cast(stats.c_str()));
return Value::UUID(uuid_val);
}
default:
throw InternalException("Unsupported type for stats %s", type.ToString());
}
}
unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(const ParquetColumnSchema &schema,
const vector<ColumnChunk> &columns,
bool can_have_nan) {
// Not supported types
auto &type = schema.type;
if (type.id() == LogicalTypeId::ARRAY || type.id() == LogicalTypeId::MAP || type.id() == LogicalTypeId::LIST) {
return nullptr;
}
unique_ptr<BaseStatistics> row_group_stats;
// Structs are handled differently (they don't have stats)
if (type.id() == LogicalTypeId::STRUCT) {
auto struct_stats = StructStats::CreateUnknown(type);
// Recurse into child readers
for (idx_t i = 0; i < schema.children.size(); i++) {
auto &child_schema = schema.children[i];
auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns, can_have_nan);
StructStats::SetChildStats(struct_stats, i, std::move(child_stats));
}
row_group_stats = struct_stats.ToUnique();
// null count is generic
if (row_group_stats) {
row_group_stats->Set(StatsInfo::CAN_HAVE_NULL_AND_VALID_VALUES);
}
return row_group_stats;
} else if (schema.schema_type == ParquetColumnSchemaType::VARIANT) {
//! FIXME: there are situations where VARIANT columns can have stats
return nullptr;
}
// Otherwise, it's a standard column with stats
auto &column_chunk = columns[schema.column_index];
if (!column_chunk.__isset.meta_data || !column_chunk.meta_data.__isset.statistics) {
// no stats present for row group
return nullptr;
}
auto &parquet_stats = column_chunk.meta_data.statistics;
switch (type.id()) {
case LogicalTypeId::UTINYINT:
case LogicalTypeId::USMALLINT:
case LogicalTypeId::UINTEGER:
case LogicalTypeId::UBIGINT:
case LogicalTypeId::TINYINT:
case LogicalTypeId::SMALLINT:
case LogicalTypeId::INTEGER:
case LogicalTypeId::BIGINT:
case LogicalTypeId::DATE:
case LogicalTypeId::TIME:
case LogicalTypeId::TIME_TZ:
case LogicalTypeId::TIMESTAMP:
case LogicalTypeId::TIMESTAMP_TZ:
case LogicalTypeId::TIMESTAMP_SEC:
case LogicalTypeId::TIMESTAMP_MS:
case LogicalTypeId::TIMESTAMP_NS:
case LogicalTypeId::DECIMAL:
row_group_stats = CreateNumericStats(type, schema, parquet_stats);
break;
case LogicalTypeId::FLOAT:
case LogicalTypeId::DOUBLE:
if (can_have_nan) {
// Since parquet doesn't tell us if the column has NaN values, if the user has explicitly declared that it
// does, we create stats without an upper max value, as NaN compares larger than anything else.
row_group_stats = CreateFloatingPointStats(type, schema, parquet_stats);
} else {
// Otherwise we use the numeric stats as usual, which might lead to "wrong" pruning if the column contains
// NaN values. The parquet spec is not clear on how to handle NaN values in statistics, and so this is
// probably the best we can do for now.
row_group_stats = CreateNumericStats(type, schema, parquet_stats);
}
break;
case LogicalTypeId::VARCHAR: {
auto string_stats = StringStats::CreateUnknown(type);
if (parquet_stats.__isset.min_value) {
StringColumnReader::VerifyString(parquet_stats.min_value.c_str(), parquet_stats.min_value.size(), true);
StringStats::SetMin(string_stats, parquet_stats.min_value);
} else if (parquet_stats.__isset.min) {
StringColumnReader::VerifyString(parquet_stats.min.c_str(), parquet_stats.min.size(), true);
StringStats::SetMin(string_stats, parquet_stats.min);
}
if (parquet_stats.__isset.max_value) {
StringColumnReader::VerifyString(parquet_stats.max_value.c_str(), parquet_stats.max_value.size(), true);
StringStats::SetMax(string_stats, parquet_stats.max_value);
} else if (parquet_stats.__isset.max) {
StringColumnReader::VerifyString(parquet_stats.max.c_str(), parquet_stats.max.size(), true);
StringStats::SetMax(string_stats, parquet_stats.max);
}
row_group_stats = string_stats.ToUnique();
break;
}
default:
// no stats for you
break;
} // end of type switch
// null count is generic
if (row_group_stats) {
row_group_stats->Set(StatsInfo::CAN_HAVE_NULL_AND_VALID_VALUES);
if (parquet_stats.__isset.null_count && parquet_stats.null_count == 0) {
row_group_stats->Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
}
if (parquet_stats.__isset.null_count && parquet_stats.null_count == column_chunk.meta_data.num_values) {
row_group_stats->Set(StatsInfo::CANNOT_HAVE_VALID_VALUES);
}
}
return row_group_stats;
}
static bool HasFilterConstants(const TableFilter &duckdb_filter) {
switch (duckdb_filter.filter_type) {
case TableFilterType::CONSTANT_COMPARISON: {
auto &constant_filter = duckdb_filter.Cast<ConstantFilter>();
return (constant_filter.comparison_type == ExpressionType::COMPARE_EQUAL && !constant_filter.constant.IsNull());
}
case TableFilterType::CONJUNCTION_AND: {
auto &conjunction_and_filter = duckdb_filter.Cast<ConjunctionAndFilter>();
bool child_has_constant = false;
for (auto &child_filter : conjunction_and_filter.child_filters) {
child_has_constant |= HasFilterConstants(*child_filter);
}
return child_has_constant;
}
case TableFilterType::CONJUNCTION_OR: {
auto &conjunction_or_filter = duckdb_filter.Cast<ConjunctionOrFilter>();
bool child_has_constant = false;
for (auto &child_filter : conjunction_or_filter.child_filters) {
child_has_constant |= HasFilterConstants(*child_filter);
}
return child_has_constant;
}
default:
return false;
}
}
template <class T>
static uint64_t ValueXH64FixedWidth(const Value &constant) {
T val = constant.GetValue<T>();
return duckdb_zstd::XXH64(&val, sizeof(val), 0);
}
// TODO we can only do this if the parquet representation of the type exactly matches the duckdb rep!
// TODO TEST THIS!
// TODO perhaps we can re-use some writer infra here
static uint64_t ValueXXH64(const Value &constant) {
switch (constant.type().InternalType()) {
case PhysicalType::UINT8:
return ValueXH64FixedWidth<int32_t>(constant);
case PhysicalType::INT8:
return ValueXH64FixedWidth<int32_t>(constant);
case PhysicalType::UINT16:
return ValueXH64FixedWidth<int32_t>(constant);
case PhysicalType::INT16:
return ValueXH64FixedWidth<int32_t>(constant);
case PhysicalType::UINT32:
return ValueXH64FixedWidth<uint32_t>(constant);
case PhysicalType::INT32:
return ValueXH64FixedWidth<int32_t>(constant);
case PhysicalType::UINT64:
return ValueXH64FixedWidth<uint64_t>(constant);
case PhysicalType::INT64:
return ValueXH64FixedWidth<int64_t>(constant);
case PhysicalType::FLOAT:
return ValueXH64FixedWidth<float>(constant);
case PhysicalType::DOUBLE:
return ValueXH64FixedWidth<double>(constant);
case PhysicalType::VARCHAR: {
auto val = constant.GetValue<string>();
return duckdb_zstd::XXH64(val.c_str(), val.length(), 0);
}
default:
return 0;
}
}
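// The narrow integer cases above hash the value widened to 32 bits: Parquet stores (U)INT8/(U)INT16
// columns with a 32-bit physical type, so hashing the widened representation is presumably what makes the
// probe hash line up with what the writer hashed (see also the TODO about matching representations).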
static bool ApplyBloomFilter(const TableFilter &duckdb_filter, ParquetBloomFilter &bloom_filter) {
switch (duckdb_filter.filter_type) {
case TableFilterType::CONSTANT_COMPARISON: {
auto &constant_filter = duckdb_filter.Cast<ConstantFilter>();
auto is_compare_equal = constant_filter.comparison_type == ExpressionType::COMPARE_EQUAL;
D_ASSERT(!constant_filter.constant.IsNull());
auto hash = ValueXXH64(constant_filter.constant);
return hash > 0 && !bloom_filter.FilterCheck(hash) && is_compare_equal;
}
case TableFilterType::CONJUNCTION_AND: {
auto &conjunction_and_filter = duckdb_filter.Cast<ConjunctionAndFilter>();
bool any_children_true = false;
for (auto &child_filter : conjunction_and_filter.child_filters) {
any_children_true |= ApplyBloomFilter(*child_filter, bloom_filter);
}
return any_children_true;
}
case TableFilterType::CONJUNCTION_OR: {
auto &conjunction_or_filter = duckdb_filter.Cast<ConjunctionOrFilter>();
bool all_children_true = true;
for (auto &child_filter : conjunction_or_filter.child_filters) {
all_children_true &= ApplyBloomFilter(*child_filter, bloom_filter);
}
return all_children_true;
}
default:
return false;
}
}
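// Return-value semantics of ApplyBloomFilter: true means the bloom filter proves the predicate cannot
// match anything in this row group. For a CONJUNCTION_AND it suffices that one child is excluded; for a
// CONJUNCTION_OR every child has to be excluded before the whole row group can be skipped.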
bool ParquetStatisticsUtils::BloomFilterSupported(const LogicalTypeId &type_id) {
switch (type_id) {
case LogicalTypeId::TINYINT:
case LogicalTypeId::UTINYINT:
case LogicalTypeId::SMALLINT:
case LogicalTypeId::USMALLINT:
case LogicalTypeId::INTEGER:
case LogicalTypeId::UINTEGER:
case LogicalTypeId::BIGINT:
case LogicalTypeId::UBIGINT:
case LogicalTypeId::FLOAT:
case LogicalTypeId::DOUBLE:
case LogicalTypeId::VARCHAR:
case LogicalTypeId::BLOB:
return true;
default:
return false;
}
}
bool ParquetStatisticsUtils::BloomFilterExcludes(const TableFilter &duckdb_filter,
const duckdb_parquet::ColumnMetaData &column_meta_data,
TProtocol &file_proto, Allocator &allocator) {
if (!HasFilterConstants(duckdb_filter) || !column_meta_data.__isset.bloom_filter_offset ||
column_meta_data.bloom_filter_offset <= 0) {
return false;
}
// TODO check length against file length!
auto &transport = reinterpret_cast<ThriftFileTransport &>(*file_proto.getTransport());
transport.SetLocation(column_meta_data.bloom_filter_offset);
if (column_meta_data.__isset.bloom_filter_length && column_meta_data.bloom_filter_length > 0) {
transport.Prefetch(column_meta_data.bloom_filter_offset, column_meta_data.bloom_filter_length);
}
duckdb_parquet::BloomFilterHeader filter_header;
// TODO the bloom filter could be encrypted, too, so need to double check that this is NOT the case
filter_header.read(&file_proto);
if (!filter_header.algorithm.__isset.BLOCK || !filter_header.compression.__isset.UNCOMPRESSED ||
!filter_header.hash.__isset.XXHASH) {
return false;
}
auto new_buffer = make_uniq<ResizeableBuffer>(allocator, filter_header.numBytes);
transport.read(new_buffer->ptr, filter_header.numBytes);
ParquetBloomFilter bloom_filter(std::move(new_buffer));
return ApplyBloomFilter(duckdb_filter, bloom_filter);
}
ParquetBloomFilter::ParquetBloomFilter(idx_t num_entries, double bloom_filter_false_positive_ratio) {
// aim for the requested false-positive ratio (bloom_filter_false_positive_ratio)
// see http://tfk.mit.edu/pdf/bloom.pdf
double f = bloom_filter_false_positive_ratio;
double k = 8.0;
double n = LossyNumericCast<double>(num_entries);
double m = -k * n / std::log(1 - std::pow(f, 1 / k));
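// rough illustration (not from the original source): with f = 0.01 and k = 8,
// pow(f, 1 / k) is roughly 0.56, so m comes out at about 9.7 * n bits, i.e. on
// the order of 10 bits per distinct value before rounding up to a power-of-two
// number of blocks below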
auto b = MaxValue<idx_t>(NextPowerOfTwo(LossyNumericCast<idx_t>(m / k)) / 32, 1);
D_ASSERT(b > 0 && IsPowerOfTwo(b));
data = make_uniq<ResizeableBuffer>(Allocator::DefaultAllocator(), sizeof(ParquetBloomBlock) * b);
data->zero();
block_count = data->len / sizeof(ParquetBloomBlock);
D_ASSERT(data->len % sizeof(ParquetBloomBlock) == 0);
}
ParquetBloomFilter::ParquetBloomFilter(unique_ptr<ResizeableBuffer> data_p) {
D_ASSERT(data_p->len % sizeof(ParquetBloomBlock) == 0);
data = std::move(data_p);
block_count = data->len / sizeof(ParquetBloomBlock);
D_ASSERT(data->len % sizeof(ParquetBloomBlock) == 0);
}
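// FilterInsert/FilterCheck pick a block by mapping the upper 32 bits of the hash
// onto [0, block_count) with a multiply-and-shift rather than a modulo.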
void ParquetBloomFilter::FilterInsert(uint64_t x) {
auto blocks = reinterpret_cast<ParquetBloomBlock *>(data->ptr);
uint64_t i = ((x >> 32) * block_count) >> 32;
auto &b = blocks[i];
ParquetBloomBlock::BlockInsert(b, x);
}
bool ParquetBloomFilter::FilterCheck(uint64_t x) {
auto blocks = reinterpret_cast<ParquetBloomBlock *>(data->ptr);
auto i = ((x >> 32) * block_count) >> 32;
return ParquetBloomBlock::BlockCheck(blocks[i], x);
}
// compiler optimizes this into a single instruction (popcnt)
static uint8_t PopCnt64(uint64_t n) {
uint8_t c = 0;
for (; n; ++c) {
n &= n - 1;
}
return c;
}
double ParquetBloomFilter::OneRatio() {
auto bloom_ptr = reinterpret_cast<uint64_t *>(data->ptr);
idx_t one_count = 0;
for (idx_t b_idx = 0; b_idx < data->len / sizeof(uint64_t); ++b_idx) {
one_count += PopCnt64(bloom_ptr[b_idx]);
}
return LossyNumericCast<double>(one_count) / (LossyNumericCast<double>(data->len) * 8.0);
}
ResizeableBuffer *ParquetBloomFilter::Get() {
return data.get();
}
} // namespace duckdb

View File

@@ -0,0 +1,156 @@
#include "parquet_timestamp.hpp"
#include "duckdb.hpp"
#include "duckdb/common/types/date.hpp"
#include "duckdb/common/types/time.hpp"
#include "duckdb/common/types/timestamp.hpp"
namespace duckdb {
// surely they are joking
static constexpr int64_t JULIAN_TO_UNIX_EPOCH_DAYS = 2440588LL;
static constexpr int64_t MILLISECONDS_PER_DAY = 86400000LL;
static constexpr int64_t MICROSECONDS_PER_DAY = MILLISECONDS_PER_DAY * 1000LL;
static constexpr int64_t NANOSECONDS_PER_MICRO = 1000LL;
static constexpr int64_t NANOSECONDS_PER_DAY = MICROSECONDS_PER_DAY * 1000LL;
static inline int64_t ImpalaTimestampToDays(const Int96 &impala_timestamp) {
return impala_timestamp.value[2] - JULIAN_TO_UNIX_EPOCH_DAYS;
}
static int64_t ImpalaTimestampToMicroseconds(const Int96 &impala_timestamp) {
int64_t days_since_epoch = ImpalaTimestampToDays(impala_timestamp);
auto nanoseconds = Load<int64_t>(const_data_ptr_cast(impala_timestamp.value));
auto microseconds = nanoseconds / NANOSECONDS_PER_MICRO;
return days_since_epoch * MICROSECONDS_PER_DAY + microseconds;
}
static int64_t ImpalaTimestampToNanoseconds(const Int96 &impala_timestamp) {
int64_t days_since_epoch = ImpalaTimestampToDays(impala_timestamp);
auto nanoseconds = Load<int64_t>(const_data_ptr_cast(impala_timestamp.value));
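// illustration: value[2] = 2440589 with a stored nanosecond field of 0 decodes
// to exactly one day past the Unix epoch, i.e. 86,400,000,000,000 ns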
return days_since_epoch * NANOSECONDS_PER_DAY + nanoseconds;
}
timestamp_ns_t ImpalaTimestampToTimestampNS(const Int96 &raw_ts) {
timestamp_ns_t result;
result.value = ImpalaTimestampToNanoseconds(raw_ts);
return result;
}
timestamp_t ImpalaTimestampToTimestamp(const Int96 &raw_ts) {
auto impala_us = ImpalaTimestampToMicroseconds(raw_ts);
return Timestamp::FromEpochMicroSeconds(impala_us);
}
Int96 TimestampToImpalaTimestamp(timestamp_t &ts) {
int32_t hour, min, sec, msec;
Time::Convert(Timestamp::GetTime(ts), hour, min, sec, msec);
uint64_t ms_since_midnight = hour * 60 * 60 * 1000 + min * 60 * 1000 + sec * 1000 + msec;
auto days_since_epoch = Date::Epoch(Timestamp::GetDate(ts)) / int64_t(24 * 60 * 60);
// first two uint32 in Int96 are nanoseconds since midnight
// last uint32 is the number of days since year 4713 BC ("Julian date")
Int96 impala_ts;
Store<uint64_t>(ms_since_midnight * 1000000, data_ptr_cast(impala_ts.value));
impala_ts.value[2] = days_since_epoch + JULIAN_TO_UNIX_EPOCH_DAYS;
return impala_ts;
}
timestamp_t ParquetTimestampMicrosToTimestamp(const int64_t &raw_ts) {
return Timestamp::FromEpochMicroSeconds(raw_ts);
}
timestamp_t ParquetTimestampMsToTimestamp(const int64_t &raw_ts) {
timestamp_t input(raw_ts);
if (!Timestamp::IsFinite(input)) {
return input;
}
return Timestamp::FromEpochMs(raw_ts);
}
timestamp_ns_t ParquetTimestampMsToTimestampNs(const int64_t &raw_ms) {
timestamp_ns_t input;
input.value = raw_ms;
if (!Timestamp::IsFinite(input)) {
return input;
}
return Timestamp::TimestampNsFromEpochMillis(raw_ms);
}
timestamp_ns_t ParquetTimestampUsToTimestampNs(const int64_t &raw_us) {
timestamp_ns_t input;
input.value = raw_us;
if (!Timestamp::IsFinite(input)) {
return input;
}
return Timestamp::TimestampNsFromEpochMicros(raw_us);
}
timestamp_ns_t ParquetTimestampNsToTimestampNs(const int64_t &raw_ns) {
timestamp_ns_t result;
result.value = raw_ns;
return result;
}
timestamp_t ParquetTimestampNsToTimestamp(const int64_t &raw_ts) {
timestamp_t input(raw_ts);
if (!Timestamp::IsFinite(input)) {
return input;
}
return Timestamp::FromEpochNanoSeconds(raw_ts);
}
date_t ParquetIntToDate(const int32_t &raw_date) {
return date_t(raw_date);
}
template <typename T>
static T ParquetWrapTime(const T &raw, const T day) {
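// Wraps a possibly negative or >= one-day value into [0, day]; e.g. a raw value
// of -1 microsecond with day = MICROS_PER_DAY becomes 23:59:59.999999.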
// Special case 24:00:00
if (raw == day) {
return raw;
}
const auto modulus = raw % day;
return modulus + (modulus < 0) * day;
}
dtime_t ParquetMsIntToTime(const int32_t &raw_millis) {
return Time::FromTimeMs(raw_millis);
}
dtime_t ParquetIntToTime(const int64_t &raw_micros) {
return dtime_t(raw_micros);
}
dtime_t ParquetNsIntToTime(const int64_t &raw_nanos) {
return Time::FromTimeNs(raw_nanos);
}
dtime_ns_t ParquetMsIntToTimeNs(const int32_t &raw_millis) {
return dtime_ns_t(Interval::NANOS_PER_MSEC * raw_millis);
}
dtime_ns_t ParquetUsIntToTimeNs(const int64_t &raw_micros) {
return dtime_ns_t(raw_micros * Interval::NANOS_PER_MICRO);
}
dtime_ns_t ParquetIntToTimeNs(const int64_t &raw_nanos) {
return dtime_ns_t(raw_nanos);
}
dtime_tz_t ParquetIntToTimeMsTZ(const int32_t &raw_millis) {
const int32_t MSECS_PER_DAY = Interval::MSECS_PER_SEC * Interval::SECS_PER_DAY;
const auto millis = ParquetWrapTime(raw_millis, MSECS_PER_DAY);
return dtime_tz_t(Time::FromTimeMs(millis), 0);
}
dtime_tz_t ParquetIntToTimeTZ(const int64_t &raw_micros) {
const auto micros = ParquetWrapTime(raw_micros, Interval::MICROS_PER_DAY);
return dtime_tz_t(dtime_t(micros), 0);
}
dtime_tz_t ParquetIntToTimeNsTZ(const int64_t &raw_nanos) {
const auto nanos = ParquetWrapTime(raw_nanos, Interval::NANOS_PER_DAY);
return dtime_tz_t(Time::FromTimeNs(nanos), 0);
}
} // namespace duckdb

File diff suppressed because it is too large

View File

@@ -0,0 +1,16 @@
add_library_unity(
duckdb_parquet_readers
OBJECT
decimal_column_reader.cpp
expression_column_reader.cpp
list_column_reader.cpp
row_number_column_reader.cpp
string_column_reader.cpp
struct_column_reader.cpp
variant_column_reader.cpp)
add_subdirectory(variant)
set(PARQUET_EXTENSION_FILES
${PARQUET_EXTENSION_FILES} $<TARGET_OBJECTS:duckdb_parquet_readers>
PARENT_SCOPE)

View File

@@ -0,0 +1,56 @@
#include "reader/decimal_column_reader.hpp"
namespace duckdb {
template <bool FIXED>
static unique_ptr<ColumnReader> CreateDecimalReaderInternal(ParquetReader &reader, const ParquetColumnSchema &schema) {
switch (schema.type.InternalType()) {
case PhysicalType::INT16:
return make_uniq<DecimalColumnReader<int16_t, FIXED>>(reader, schema);
case PhysicalType::INT32:
return make_uniq<DecimalColumnReader<int32_t, FIXED>>(reader, schema);
case PhysicalType::INT64:
return make_uniq<DecimalColumnReader<int64_t, FIXED>>(reader, schema);
case PhysicalType::INT128:
return make_uniq<DecimalColumnReader<hugeint_t, FIXED>>(reader, schema);
case PhysicalType::DOUBLE:
return make_uniq<DecimalColumnReader<double, FIXED>>(reader, schema);
default:
throw InternalException("Unrecognized type for Decimal");
}
}
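// Reads a big-endian two's-complement decimal of arbitrary byte length into a
// double: bytes are folded in 64-bit chunks (negative values are bit-flipped
// first and the +1 of the two's complement added at the end), after which the
// stored scale is divided out.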
template <>
double ParquetDecimalUtils::ReadDecimalValue(const_data_ptr_t pointer, idx_t size,
const ParquetColumnSchema &schema_ele) {
double res = 0;
bool positive = (*pointer & 0x80) == 0;
for (idx_t i = 0; i < size; i += 8) {
auto byte_size = MinValue<idx_t>(sizeof(uint64_t), size - i);
uint64_t input = 0;
auto res_ptr = reinterpret_cast<uint8_t *>(&input);
for (idx_t k = 0; k < byte_size; k++) {
auto byte = pointer[i + k];
res_ptr[sizeof(uint64_t) - k - 1] = positive ? byte : byte ^ 0xFF;
}
res *= double(NumericLimits<uint64_t>::Maximum()) + 1;
res += static_cast<double>(input);
}
if (!positive) {
res += 1;
res /= pow(10, schema_ele.type_scale);
return -res;
}
res /= pow(10, schema_ele.type_scale);
return res;
}
unique_ptr<ColumnReader> ParquetDecimalUtils::CreateReader(ParquetReader &reader, const ParquetColumnSchema &schema) {
if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) {
return CreateDecimalReaderInternal<true>(reader, schema);
} else {
return CreateDecimalReaderInternal<false>(reader, schema);
}
}
} // namespace duckdb

View File

@@ -0,0 +1,50 @@
#include "reader/expression_column_reader.hpp"
#include "parquet_reader.hpp"
namespace duckdb {
//===--------------------------------------------------------------------===//
// Expression Column Reader
//===--------------------------------------------------------------------===//
ExpressionColumnReader::ExpressionColumnReader(ClientContext &context, unique_ptr<ColumnReader> child_reader_p,
unique_ptr<Expression> expr_p, const ParquetColumnSchema &schema_p)
: ColumnReader(child_reader_p->Reader(), schema_p), child_reader(std::move(child_reader_p)),
expr(std::move(expr_p)), executor(context, expr.get()) {
vector<LogicalType> intermediate_types {child_reader->Type()};
intermediate_chunk.Initialize(reader.allocator, intermediate_types);
}
ExpressionColumnReader::ExpressionColumnReader(ClientContext &context, unique_ptr<ColumnReader> child_reader_p,
unique_ptr<Expression> expr_p,
unique_ptr<ParquetColumnSchema> owned_schema_p)
: ColumnReader(child_reader_p->Reader(), *owned_schema_p), child_reader(std::move(child_reader_p)),
expr(std::move(expr_p)), executor(context, expr.get()), owned_schema(std::move(owned_schema_p)) {
vector<LogicalType> intermediate_types {child_reader->Type()};
intermediate_chunk.Initialize(reader.allocator, intermediate_types);
}
void ExpressionColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns,
TProtocol &protocol_p) {
child_reader->InitializeRead(row_group_idx_p, columns, protocol_p);
}
idx_t ExpressionColumnReader::Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) {
intermediate_chunk.Reset();
auto &intermediate_vector = intermediate_chunk.data[0];
auto amount = child_reader->Read(num_values, define_out, repeat_out, intermediate_vector);
// Execute the expression
intermediate_chunk.SetCardinality(amount);
executor.ExecuteExpression(intermediate_chunk, result);
return amount;
}
void ExpressionColumnReader::Skip(idx_t num_values) {
child_reader->Skip(num_values);
}
idx_t ExpressionColumnReader::GroupRowsAvailable() {
return child_reader->GroupRowsAvailable();
}
} // namespace duckdb

View File

@@ -0,0 +1,190 @@
#include "reader/list_column_reader.hpp"
#include "parquet_reader.hpp"
namespace duckdb {
struct ListReaderData {
ListReaderData(list_entry_t *result_ptr, ValidityMask &result_mask)
: result_ptr(result_ptr), result_mask(result_mask) {
}
list_entry_t *result_ptr;
ValidityMask &result_mask;
};
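// ReadInternal is parameterized over one of two policies: TemplatedListReader
// materializes list entries and child values into the result vector, while
// TemplatedListSkipper walks the same repetition/definition stream and discards
// everything (used to apply pending skips).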
struct TemplatedListReader {
using DATA = ListReaderData;
static DATA Initialize(optional_ptr<Vector> result_out) {
D_ASSERT(ListVector::GetListSize(*result_out) == 0);
auto result_ptr = FlatVector::GetData<list_entry_t>(*result_out);
auto &result_mask = FlatVector::Validity(*result_out);
return ListReaderData(result_ptr, result_mask);
}
static idx_t GetOffset(optional_ptr<Vector> result_out) {
return ListVector::GetListSize(*result_out);
}
static void HandleRepeat(DATA &data, idx_t offset) {
data.result_ptr[offset].length++;
}
static void HandleListStart(DATA &data, idx_t offset, idx_t offset_in_child, idx_t length) {
data.result_ptr[offset].offset = offset_in_child;
data.result_ptr[offset].length = length;
}
static void HandleNull(DATA &data, idx_t offset) {
data.result_mask.SetInvalid(offset);
data.result_ptr[offset].offset = 0;
data.result_ptr[offset].length = 0;
}
static void AppendVector(optional_ptr<Vector> result_out, Vector &read_vector, idx_t child_idx) {
ListVector::Append(*result_out, read_vector, child_idx);
}
};
struct TemplatedListSkipper {
using DATA = bool;
static DATA Initialize(optional_ptr<Vector>) {
return false;
}
static idx_t GetOffset(optional_ptr<Vector>) {
return 0;
}
static void HandleRepeat(DATA &, idx_t) {
}
static void HandleListStart(DATA &, idx_t, idx_t, idx_t) {
}
static void HandleNull(DATA &, idx_t) {
}
static void AppendVector(optional_ptr<Vector>, Vector &, idx_t) {
}
};
template <class OP>
idx_t ListColumnReader::ReadInternal(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out,
optional_ptr<Vector> result_out) {
idx_t result_offset = 0;
auto data = OP::Initialize(result_out);
// if an individual list is longer than STANDARD_VECTOR_SIZE we actually have to loop the child read to fill it
bool finished = false;
while (!finished) {
idx_t child_actual_num_values = 0;
// check if we have any overflow from a previous read
if (overflow_child_count == 0) {
// we don't: read elements from the child reader
child_defines.zero();
child_repeats.zero();
// we don't know in advance how many values to read because of the beautiful repetition/definition setup
// we just read (up to) a vector from the child column, and see if we have read enough
// if we have not read enough, we read another vector
// if we have read enough, we leave any unhandled elements in the overflow vector for a subsequent read
auto child_req_num_values =
MinValue<idx_t>(STANDARD_VECTOR_SIZE, child_column_reader->GroupRowsAvailable());
read_vector.ResetFromCache(read_cache);
child_actual_num_values =
child_column_reader->Read(child_req_num_values, child_defines_ptr, child_repeats_ptr, read_vector);
} else {
// we do: use the overflow values
child_actual_num_values = overflow_child_count;
overflow_child_count = 0;
}
if (child_actual_num_values == 0) {
// no more elements available: we are done
break;
}
read_vector.Verify(child_actual_num_values);
idx_t current_chunk_offset = OP::GetOffset(result_out);
// hard-won piece of code this, modify at your own risk
// the intuition is that we have to only collapse values into lists that are repeated *on this level*
// the rest is pretty much handed up as-is as a single-valued list or NULL
idx_t child_idx;
for (child_idx = 0; child_idx < child_actual_num_values; child_idx++) {
if (child_repeats_ptr[child_idx] == MaxRepeat()) {
// value repeats on this level, append
D_ASSERT(result_offset > 0);
OP::HandleRepeat(data, result_offset - 1);
continue;
}
if (result_offset >= num_values) {
// we ran out of output space
finished = true;
break;
}
if (child_defines_ptr[child_idx] >= MaxDefine()) {
// value has been defined down the stack, hence it is NOT NULL
OP::HandleListStart(data, result_offset, child_idx + current_chunk_offset, 1);
} else if (child_defines_ptr[child_idx] == MaxDefine() - 1) {
// empty list
OP::HandleListStart(data, result_offset, child_idx + current_chunk_offset, 0);
} else {
// value is NULL somewhere up the stack
OP::HandleNull(data, result_offset);
}
if (repeat_out) {
repeat_out[result_offset] = child_repeats_ptr[child_idx];
}
if (define_out) {
define_out[result_offset] = child_defines_ptr[child_idx];
}
result_offset++;
}
// actually append the required elements to the child list
OP::AppendVector(result_out, read_vector, child_idx);
// we have read more values from the child reader than we can fit into the result for this read
// we have to pass everything from child_idx to child_actual_num_values into the next call
if (child_idx < child_actual_num_values && result_offset == num_values) {
read_vector.Slice(read_vector, child_idx, child_actual_num_values);
overflow_child_count = child_actual_num_values - child_idx;
read_vector.Verify(overflow_child_count);
// move values in the child repeats and defines *backward* by child_idx
for (idx_t repdef_idx = 0; repdef_idx < overflow_child_count; repdef_idx++) {
child_defines_ptr[repdef_idx] = child_defines_ptr[child_idx + repdef_idx];
child_repeats_ptr[repdef_idx] = child_repeats_ptr[child_idx + repdef_idx];
}
}
}
return result_offset;
}
idx_t ListColumnReader::Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out) {
ApplyPendingSkips(define_out, repeat_out);
return ReadInternal<TemplatedListReader>(num_values, define_out, repeat_out, result_out);
}
ListColumnReader::ListColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
unique_ptr<ColumnReader> child_column_reader_p)
: ColumnReader(reader, schema), child_column_reader(std::move(child_column_reader_p)),
read_cache(reader.allocator, ListType::GetChildType(Type())), read_vector(read_cache), overflow_child_count(0) {
child_defines.resize(reader.allocator, STANDARD_VECTOR_SIZE);
child_repeats.resize(reader.allocator, STANDARD_VECTOR_SIZE);
child_defines_ptr = (uint8_t *)child_defines.ptr;
child_repeats_ptr = (uint8_t *)child_repeats.ptr;
}
void ListColumnReader::ApplyPendingSkips(data_ptr_t define_out, data_ptr_t repeat_out) {
ReadInternal<TemplatedListSkipper>(pending_skips, nullptr, nullptr, nullptr);
pending_skips = 0;
}
} // namespace duckdb

View File

@@ -0,0 +1,46 @@
#include "reader/row_number_column_reader.hpp"
#include "parquet_reader.hpp"
#include "duckdb/storage/table/row_group.hpp"
namespace duckdb {
//===--------------------------------------------------------------------===//
// Row Number Column Reader
//===--------------------------------------------------------------------===//
RowNumberColumnReader::RowNumberColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
: ColumnReader(reader, schema) {
}
void RowNumberColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns,
TProtocol &protocol_p) {
row_group_offset = 0;
auto &row_groups = reader.GetFileMetadata()->row_groups;
for (idx_t i = 0; i < row_group_idx_p; i++) {
row_group_offset += row_groups[i].num_rows;
}
}
void RowNumberColumnReader::Filter(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out,
Vector &result_out, const TableFilter &filter, TableFilterState &filter_state,
SelectionVector &sel, idx_t &approved_tuple_count, bool is_first_filter) {
// check the row id stats if this filter has any chance of passing
auto prune_result = RowGroup::CheckRowIdFilter(filter, row_group_offset, row_group_offset + num_values);
if (prune_result == FilterPropagateResult::FILTER_ALWAYS_FALSE) {
// filter is always false - don't read anything
approved_tuple_count = 0;
Skip(num_values);
return;
}
ColumnReader::Filter(num_values, define_out, repeat_out, result_out, filter, filter_state, sel,
approved_tuple_count, is_first_filter);
}
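// Emits consecutive file-level row numbers, continuing from the cumulative row
// count of the preceding row groups computed in InitializeRead.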
idx_t RowNumberColumnReader::Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) {
auto data_ptr = FlatVector::GetData<int64_t>(result);
for (idx_t i = 0; i < num_values; i++) {
data_ptr[i] = UnsafeNumericCast<int64_t>(row_group_offset++);
}
return num_values;
}
} // namespace duckdb

View File

@@ -0,0 +1,81 @@
#include "reader/string_column_reader.hpp"
#include "utf8proc_wrapper.hpp"
#include "parquet_reader.hpp"
#include "duckdb/common/types/blob.hpp"
namespace duckdb {
//===--------------------------------------------------------------------===//
// String Column Reader
//===--------------------------------------------------------------------===//
StringColumnReader::StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
: ColumnReader(reader, schema), string_column_type(GetStringColumnType(Type())) {
fixed_width_string_length = 0;
if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) {
fixed_width_string_length = schema.type_length;
}
}
void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len, const bool is_varchar) {
if (!is_varchar) {
return;
}
// verify if a string is actually UTF8, and if there are no null bytes in the middle of the string
// technically Parquet should guarantee this, but reality is often disappointing
UnicodeInvalidReason reason;
size_t pos;
auto utf_type = Utf8Proc::Analyze(str_data, str_len, &reason, &pos);
if (utf_type == UnicodeType::INVALID) {
throw InvalidInputException("Invalid string encoding found in Parquet file: value \"%s\" is not valid UTF8!",
Blob::ToString(string_t(str_data, str_len)));
}
}
void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len) {
switch (string_column_type) {
case StringColumnType::VARCHAR:
VerifyString(str_data, str_len, true);
break;
case StringColumnType::JSON: {
const auto error = StringUtil::ValidateJSON(str_data, str_len);
if (!error.empty()) {
throw InvalidInputException("Invalid JSON found in Parquet file: %s", error);
}
break;
}
default:
break;
}
}
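// Keeps the decompressed page buffer alive for as long as the result vector
// references it, so string_t values can point straight into the Parquet page
// without copying.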
class ParquetStringVectorBuffer : public VectorBuffer {
public:
explicit ParquetStringVectorBuffer(shared_ptr<ResizeableBuffer> buffer_p)
: VectorBuffer(VectorBufferType::OPAQUE_BUFFER), buffer(std::move(buffer_p)) {
}
private:
shared_ptr<ResizeableBuffer> buffer;
};
void StringColumnReader::ReferenceBlock(Vector &result, shared_ptr<ResizeableBuffer> &block) {
StringVector::AddBuffer(result, make_buffer<ParquetStringVectorBuffer>(block));
}
void StringColumnReader::Plain(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values,
idx_t result_offset, Vector &result) {
ReferenceBlock(result, plain_data);
PlainTemplated<string_t, StringParquetValueConversion>(*plain_data, defines, num_values, result_offset, result);
}
void StringColumnReader::PlainSkip(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values) {
PlainSkipTemplated<StringParquetValueConversion>(plain_data, defines, num_values);
}
void StringColumnReader::PlainSelect(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values,
Vector &result, const SelectionVector &sel, idx_t count) {
ReferenceBlock(result, plain_data);
PlainSelectTemplated<string_t, StringParquetValueConversion>(*plain_data, defines, num_values, result, sel, count);
}
} // namespace duckdb

View File

@@ -0,0 +1,138 @@
#include "reader/struct_column_reader.hpp"
namespace duckdb {
//===--------------------------------------------------------------------===//
// Struct Column Reader
//===--------------------------------------------------------------------===//
StructColumnReader::StructColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
vector<unique_ptr<ColumnReader>> child_readers_p)
: ColumnReader(reader, schema), child_readers(std::move(child_readers_p)) {
D_ASSERT(Type().InternalType() == PhysicalType::STRUCT);
}
ColumnReader &StructColumnReader::GetChildReader(idx_t child_idx) {
if (!child_readers[child_idx]) {
throw InternalException("StructColumnReader::GetChildReader(%d) - but this child reader is not set", child_idx);
}
return *child_readers[child_idx].get();
}
void StructColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns,
TProtocol &protocol_p) {
for (auto &child : child_readers) {
if (!child) {
continue;
}
child->InitializeRead(row_group_idx_p, columns, protocol_p);
}
}
idx_t StructColumnReader::Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) {
auto &struct_entries = StructVector::GetEntries(result);
D_ASSERT(StructType::GetChildTypes(Type()).size() == struct_entries.size());
if (pending_skips > 0) {
throw InternalException("StructColumnReader cannot have pending skips");
}
// If the child reader values are all valid, "define_out" may not be initialized at all
// So, we just initialize them to all be valid beforehand
std::fill_n(define_out, num_values, MaxDefine());
optional_idx read_count;
for (idx_t i = 0; i < child_readers.size(); i++) {
auto &child = child_readers[i];
auto &target_vector = *struct_entries[i];
if (!child) {
// if we are not scanning this vector - set it to NULL
target_vector.SetVectorType(VectorType::CONSTANT_VECTOR);
ConstantVector::SetNull(target_vector, true);
continue;
}
auto child_num_values = child->Read(num_values, define_out, repeat_out, target_vector);
if (!read_count.IsValid()) {
read_count = child_num_values;
} else if (read_count.GetIndex() != child_num_values) {
throw std::runtime_error("Struct child row count mismatch");
}
}
if (!read_count.IsValid()) {
read_count = num_values;
}
// set the validity mask for this level
auto &validity = FlatVector::Validity(result);
for (idx_t i = 0; i < read_count.GetIndex(); i++) {
if (define_out[i] < MaxDefine()) {
validity.SetInvalid(i);
}
}
return read_count.GetIndex();
}
void StructColumnReader::Skip(idx_t num_values) {
for (auto &child : child_readers) {
if (!child) {
continue;
}
child->Skip(num_values);
}
}
void StructColumnReader::RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) {
for (auto &child : child_readers) {
if (!child) {
continue;
}
child->RegisterPrefetch(transport, allow_merge);
}
}
uint64_t StructColumnReader::TotalCompressedSize() {
uint64_t size = 0;
for (auto &child : child_readers) {
if (!child) {
continue;
}
size += child->TotalCompressedSize();
}
return size;
}
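// Prefer a child whose value count maps 1:1 onto the row count: list and map
// children report counts in terms of their (repeated) elements, so they cannot
// answer GroupRowsAvailable exactly.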
static bool TypeHasExactRowCount(const LogicalType &type) {
switch (type.id()) {
case LogicalTypeId::LIST:
case LogicalTypeId::MAP:
return false;
case LogicalTypeId::STRUCT:
for (auto &kv : StructType::GetChildTypes(type)) {
if (TypeHasExactRowCount(kv.second)) {
return true;
}
}
return false;
default:
return true;
}
}
idx_t StructColumnReader::GroupRowsAvailable() {
for (auto &child : child_readers) {
if (!child) {
continue;
}
if (TypeHasExactRowCount(child->Type())) {
return child->GroupRowsAvailable();
}
}
for (auto &child : child_readers) {
if (!child) {
continue;
}
return child->GroupRowsAvailable();
}
throw InternalException("No projected columns in struct?");
}
} // namespace duckdb

View File

@@ -0,0 +1,7 @@
add_library_unity(
duckdb_parquet_reader_variant OBJECT variant_binary_decoder.cpp
variant_value.cpp variant_shredded_conversion.cpp)
set(PARQUET_EXTENSION_FILES
${PARQUET_EXTENSION_FILES} $<TARGET_OBJECTS:duckdb_parquet_reader_variant>
PARENT_SCOPE)

View File

@@ -0,0 +1,365 @@
#include "reader/variant/variant_binary_decoder.hpp"
#include "duckdb/common/printer.hpp"
#include "utf8proc_wrapper.hpp"
#include "reader/uuid_column_reader.hpp"
#include "duckdb/common/types/timestamp.hpp"
#include "duckdb/common/types/decimal.hpp"
#include "duckdb/common/types/uuid.hpp"
#include "duckdb/common/types/time.hpp"
#include "duckdb/common/types/date.hpp"
#include "duckdb/common/types/blob.hpp"
static constexpr uint8_t VERSION_MASK = 0xF;
static constexpr uint8_t SORTED_STRINGS_MASK = 0x1;
static constexpr uint8_t SORTED_STRINGS_SHIFT = 4;
static constexpr uint8_t OFFSET_SIZE_MINUS_ONE_MASK = 0x3;
static constexpr uint8_t OFFSET_SIZE_MINUS_ONE_SHIFT = 6;
static constexpr uint8_t BASIC_TYPE_MASK = 0x3;
static constexpr uint8_t VALUE_HEADER_SHIFT = 2;
//! Object and Array header
static constexpr uint8_t FIELD_OFFSET_SIZE_MINUS_ONE_MASK = 0x3;
//! Object header
static constexpr uint8_t FIELD_ID_SIZE_MINUS_ONE_MASK = 0x3;
static constexpr uint8_t FIELD_ID_SIZE_MINUS_ONE_SHIFT = 2;
static constexpr uint8_t OBJECT_IS_LARGE_MASK = 0x1;
static constexpr uint8_t OBJECT_IS_LARGE_SHIFT = 4;
//! Array header
static constexpr uint8_t ARRAY_IS_LARGE_MASK = 0x1;
static constexpr uint8_t ARRAY_IS_LARGE_SHIFT = 2;
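// Bit layout implied by the masks above: the metadata header byte stores the
// version in bits 0-3, the sorted_strings flag in bit 4 and (offset_size - 1)
// in bits 6-7; a value header byte stores the basic type in bits 0-1, with the
// remaining six bits interpreted per basic type (sizes and is_large flags).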
using namespace duckdb_yyjson;
namespace duckdb {
namespace {
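// Reads an unsigned little-endian integer of 1 to sizeof(idx_t) bytes and
// advances the pointer past it; used for the variable-width offsets and field
// ids of the Variant encoding.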
static idx_t ReadVariableLengthLittleEndian(idx_t length_in_bytes, const_data_ptr_t &ptr) {
if (length_in_bytes > sizeof(idx_t)) {
throw NotImplementedException("Can't read little-endian value of %d bytes", length_in_bytes);
}
idx_t result = 0;
memcpy(reinterpret_cast<uint8_t *>(&result), ptr, length_in_bytes);
ptr += length_in_bytes;
return result;
}
} // namespace
VariantMetadataHeader VariantMetadataHeader::FromHeaderByte(uint8_t byte) {
VariantMetadataHeader header;
header.version = byte & VERSION_MASK;
header.sorted_strings = (byte >> SORTED_STRINGS_SHIFT) & SORTED_STRINGS_MASK;
header.offset_size = ((byte >> OFFSET_SIZE_MINUS_ONE_SHIFT) & OFFSET_SIZE_MINUS_ONE_MASK) + 1;
if (header.version != 1) {
throw NotImplementedException("Only version 1 of the Variant encoding scheme is supported, found version: %d",
header.version);
}
return header;
}
VariantMetadata::VariantMetadata(const string_t &metadata) : metadata(metadata) {
auto metadata_data = metadata.GetData();
header = VariantMetadataHeader::FromHeaderByte(metadata_data[0]);
const_data_ptr_t ptr = reinterpret_cast<const_data_ptr_t>(metadata_data + sizeof(uint8_t));
idx_t dictionary_size = ReadVariableLengthLittleEndian(header.offset_size, ptr);
auto offsets = ptr;
auto bytes = offsets + ((dictionary_size + 1) * header.offset_size);
idx_t last_offset = ReadVariableLengthLittleEndian(header.offset_size, ptr);
for (idx_t i = 0; i < dictionary_size; i++) {
auto next_offset = ReadVariableLengthLittleEndian(header.offset_size, ptr);
strings.emplace_back(reinterpret_cast<const char *>(bytes + last_offset), next_offset - last_offset);
last_offset = next_offset;
}
}
VariantValueMetadata VariantValueMetadata::FromHeaderByte(uint8_t byte) {
VariantValueMetadata result;
result.basic_type = VariantBasicTypeFromByte(byte & BASIC_TYPE_MASK);
uint8_t value_header = byte >> VALUE_HEADER_SHIFT;
switch (result.basic_type) {
case VariantBasicType::PRIMITIVE: {
result.primitive_type = VariantPrimitiveTypeFromByte(value_header);
break;
}
case VariantBasicType::SHORT_STRING: {
result.string_size = value_header;
break;
}
case VariantBasicType::OBJECT: {
result.field_offset_size = (value_header & FIELD_OFFSET_SIZE_MINUS_ONE_MASK) + 1;
result.field_id_size = ((value_header >> FIELD_ID_SIZE_MINUS_ONE_SHIFT) & FIELD_ID_SIZE_MINUS_ONE_MASK) + 1;
result.is_large = (value_header >> OBJECT_IS_LARGE_SHIFT) & OBJECT_IS_LARGE_MASK;
break;
}
case VariantBasicType::ARRAY: {
result.field_offset_size = (value_header & FIELD_OFFSET_SIZE_MINUS_ONE_MASK) + 1;
result.is_large = (value_header >> ARRAY_IS_LARGE_SHIFT) & ARRAY_IS_LARGE_MASK;
break;
}
default:
throw InternalException("VariantBasicType (%d) not handled", static_cast<uint8_t>(result.basic_type));
}
return result;
}
template <class T>
static T DecodeDecimal(const_data_ptr_t data, uint8_t &scale, uint8_t &width) {
scale = Load<uint8_t>(data);
data++;
auto result = Load<T>(data);
//! FIXME: The spec says:
//! The implied precision of a decimal value is `floor(log_10(val)) + 1`
width = DecimalWidth<T>::max;
return result;
}
template <>
hugeint_t DecodeDecimal(const_data_ptr_t data, uint8_t &scale, uint8_t &width) {
scale = Load<uint8_t>(data);
data++;
hugeint_t result;
result.lower = Load<uint64_t>(data);
result.upper = Load<int64_t>(data + sizeof(uint64_t));
//! FIXME: The spec says:
//! The implied precision of a decimal value is `floor(log_10(val)) + 1`
width = DecimalWidth<hugeint_t>::max;
return result;
}
VariantValue VariantBinaryDecoder::PrimitiveTypeDecode(const VariantValueMetadata &value_metadata,
const_data_ptr_t data) {
switch (value_metadata.primitive_type) {
case VariantPrimitiveType::NULL_TYPE: {
return VariantValue(Value());
}
case VariantPrimitiveType::BOOLEAN_TRUE: {
return VariantValue(Value::BOOLEAN(true));
}
case VariantPrimitiveType::BOOLEAN_FALSE: {
return VariantValue(Value::BOOLEAN(false));
}
case VariantPrimitiveType::INT8: {
auto value = Load<int8_t>(data);
return VariantValue(Value::TINYINT(value));
}
case VariantPrimitiveType::INT16: {
auto value = Load<int16_t>(data);
return VariantValue(Value::SMALLINT(value));
}
case VariantPrimitiveType::INT32: {
auto value = Load<int32_t>(data);
return VariantValue(Value::INTEGER(value));
}
case VariantPrimitiveType::INT64: {
auto value = Load<int64_t>(data);
return VariantValue(Value::BIGINT(value));
}
case VariantPrimitiveType::DOUBLE: {
double value = Load<double>(data);
return VariantValue(Value::DOUBLE(value));
}
case VariantPrimitiveType::FLOAT: {
float value = Load<float>(data);
return VariantValue(Value::FLOAT(value));
}
case VariantPrimitiveType::DECIMAL4: {
uint8_t scale;
uint8_t width;
auto value = DecodeDecimal<int32_t>(data, scale, width);
auto value_str = Decimal::ToString(value, width, scale);
return VariantValue(Value(value_str));
}
case VariantPrimitiveType::DECIMAL8: {
uint8_t scale;
uint8_t width;
auto value = DecodeDecimal<int64_t>(data, scale, width);
auto value_str = Decimal::ToString(value, width, scale);
return VariantValue(Value(value_str));
}
case VariantPrimitiveType::DECIMAL16: {
uint8_t scale;
uint8_t width;
auto value = DecodeDecimal<hugeint_t>(data, scale, width);
auto value_str = Decimal::ToString(value, width, scale);
return VariantValue(Value(value_str));
}
case VariantPrimitiveType::DATE: {
date_t value;
value.days = Load<int32_t>(data);
return VariantValue(Value::DATE(value));
}
case VariantPrimitiveType::TIMESTAMP_MICROS: {
timestamp_tz_t micros_ts_tz;
micros_ts_tz.value = Load<int64_t>(data);
return VariantValue(Value::TIMESTAMPTZ(micros_ts_tz));
}
case VariantPrimitiveType::TIMESTAMP_NTZ_MICROS: {
timestamp_t micros_ts;
micros_ts.value = Load<int64_t>(data);
auto value = Value::TIMESTAMP(micros_ts);
auto value_str = value.ToString();
return VariantValue(Value(value_str));
}
case VariantPrimitiveType::BINARY: {
//! Follow the JSON serialization guide by converting BINARY to Base64:
//! For example: `"dmFyaWFudAo="`
auto size = Load<uint32_t>(data);
auto string_data = reinterpret_cast<const char *>(data + sizeof(uint32_t));
auto base64_string = Blob::ToBase64(string_t(string_data, size));
return VariantValue(Value(base64_string));
}
case VariantPrimitiveType::STRING: {
auto size = Load<uint32_t>(data);
auto string_data = reinterpret_cast<const char *>(data + sizeof(uint32_t));
if (!Utf8Proc::IsValid(string_data, size)) {
throw InternalException("Can't decode Variant short-string, string isn't valid UTF8");
}
return VariantValue(Value(string(string_data, size)));
}
case VariantPrimitiveType::TIME_NTZ_MICROS: {
dtime_t micros_time;
micros_time.micros = Load<int64_t>(data);
return VariantValue(Value::TIME(micros_time));
}
case VariantPrimitiveType::TIMESTAMP_NANOS: {
timestamp_ns_t nanos_ts;
nanos_ts.value = Load<int64_t>(data);
//! Convert the nanos timestamp to a micros timestamp (not lossless)
auto micros_ts = Timestamp::FromEpochNanoSeconds(nanos_ts.value);
return VariantValue(Value::TIMESTAMPTZ(timestamp_tz_t(micros_ts)));
}
case VariantPrimitiveType::TIMESTAMP_NTZ_NANOS: {
timestamp_ns_t nanos_ts;
nanos_ts.value = Load<int64_t>(data);
auto value = Value::TIMESTAMPNS(nanos_ts);
auto value_str = value.ToString();
return VariantValue(Value(value_str));
}
case VariantPrimitiveType::UUID: {
auto uuid_value = UUIDValueConversion::ReadParquetUUID(data);
auto value_str = UUID::ToString(uuid_value);
return VariantValue(Value(value_str));
}
default:
throw NotImplementedException("Variant PrimitiveTypeDecode not implemented for type (%d)",
static_cast<uint8_t>(value_metadata.primitive_type));
}
}
VariantValue VariantBinaryDecoder::ShortStringDecode(const VariantValueMetadata &value_metadata,
const_data_ptr_t data) {
D_ASSERT(value_metadata.string_size < 64);
auto string_data = reinterpret_cast<const char *>(data);
if (!Utf8Proc::IsValid(string_data, value_metadata.string_size)) {
throw InternalException("Can't decode Variant short-string, string isn't valid UTF8");
}
return VariantValue(Value(string(string_data, value_metadata.string_size)));
}
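// Object layout as decoded below: [num_elements][field_ids ...][field_offsets
// ... (num_elements + 1 entries)][values], with ids and offsets using the
// variable widths announced in the value header.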
VariantValue VariantBinaryDecoder::ObjectDecode(const VariantMetadata &metadata,
const VariantValueMetadata &value_metadata, const_data_ptr_t data) {
VariantValue ret(VariantValueType::OBJECT);
auto field_offset_size = value_metadata.field_offset_size;
auto field_id_size = value_metadata.field_id_size;
auto is_large = value_metadata.is_large;
idx_t num_elements;
if (is_large) {
num_elements = Load<uint32_t>(data);
data += sizeof(uint32_t);
} else {
num_elements = Load<uint8_t>(data);
data += sizeof(uint8_t);
}
auto field_ids = data;
auto field_offsets = data + (num_elements * field_id_size);
auto values = field_offsets + ((num_elements + 1) * field_offset_size);
idx_t last_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets);
for (idx_t i = 0; i < num_elements; i++) {
auto field_id = ReadVariableLengthLittleEndian(field_id_size, field_ids);
auto next_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets);
auto value = Decode(metadata, values + last_offset);
auto &key = metadata.strings[field_id];
ret.AddChild(key, std::move(value));
last_offset = next_offset;
}
return ret;
}
VariantValue VariantBinaryDecoder::ArrayDecode(const VariantMetadata &metadata,
const VariantValueMetadata &value_metadata, const_data_ptr_t data) {
VariantValue ret(VariantValueType::ARRAY);
auto field_offset_size = value_metadata.field_offset_size;
auto is_large = value_metadata.is_large;
uint32_t num_elements;
if (is_large) {
num_elements = Load<uint32_t>(data);
data += sizeof(uint32_t);
} else {
num_elements = Load<uint8_t>(data);
data += sizeof(uint8_t);
}
auto field_offsets = data;
auto values = field_offsets + ((num_elements + 1) * field_offset_size);
idx_t last_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets);
for (idx_t i = 0; i < num_elements; i++) {
auto next_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets);
ret.AddItem(Decode(metadata, values + last_offset));
last_offset = next_offset;
}
return ret;
}
VariantValue VariantBinaryDecoder::Decode(const VariantMetadata &variant_metadata, const_data_ptr_t data) {
auto value_metadata = VariantValueMetadata::FromHeaderByte(data[0]);
data++;
switch (value_metadata.basic_type) {
case VariantBasicType::PRIMITIVE: {
return PrimitiveTypeDecode(value_metadata, data);
}
case VariantBasicType::SHORT_STRING: {
return ShortStringDecode(value_metadata, data);
}
case VariantBasicType::OBJECT: {
return ObjectDecode(variant_metadata, value_metadata, data);
}
case VariantBasicType::ARRAY: {
return ArrayDecode(variant_metadata, value_metadata, data);
}
default:
throw InternalException("Unexpected value for VariantBasicType");
}
}
} // namespace duckdb

View File

@@ -0,0 +1,577 @@
#include "reader/variant/variant_shredded_conversion.hpp"
#include "column_reader.hpp"
#include "utf8proc_wrapper.hpp"
#include "duckdb/common/types/timestamp.hpp"
#include "duckdb/common/types/decimal.hpp"
#include "duckdb/common/types/uuid.hpp"
#include "duckdb/common/types/time.hpp"
#include "duckdb/common/types/date.hpp"
#include "duckdb/common/types/blob.hpp"
namespace duckdb {
template <class T>
struct ConvertShreddedValue {
static VariantValue Convert(T val);
static VariantValue ConvertDecimal(T val, uint8_t width, uint8_t scale) {
throw InternalException("ConvertShreddedValue::ConvertDecimal not implemented for type");
}
static VariantValue ConvertBlob(T val) {
throw InternalException("ConvertShreddedValue::ConvertBlob not implemented for type");
}
};
//! boolean
template <>
VariantValue ConvertShreddedValue<bool>::Convert(bool val) {
return VariantValue(Value::BOOLEAN(val));
}
//! int8
template <>
VariantValue ConvertShreddedValue<int8_t>::Convert(int8_t val) {
return VariantValue(Value::TINYINT(val));
}
//! int16
template <>
VariantValue ConvertShreddedValue<int16_t>::Convert(int16_t val) {
return VariantValue(Value::SMALLINT(val));
}
//! int32
template <>
VariantValue ConvertShreddedValue<int32_t>::Convert(int32_t val) {
return VariantValue(Value::INTEGER(val));
}
//! int64
template <>
VariantValue ConvertShreddedValue<int64_t>::Convert(int64_t val) {
return VariantValue(Value::BIGINT(val));
}
//! float
template <>
VariantValue ConvertShreddedValue<float>::Convert(float val) {
return VariantValue(Value::FLOAT(val));
}
//! double
template <>
VariantValue ConvertShreddedValue<double>::Convert(double val) {
return VariantValue(Value::DOUBLE(val));
}
//! decimal4/decimal8/decimal16
template <>
VariantValue ConvertShreddedValue<int32_t>::ConvertDecimal(int32_t val, uint8_t width, uint8_t scale) {
auto value_str = Decimal::ToString(val, width, scale);
return VariantValue(Value(value_str));
}
template <>
VariantValue ConvertShreddedValue<int64_t>::ConvertDecimal(int64_t val, uint8_t width, uint8_t scale) {
auto value_str = Decimal::ToString(val, width, scale);
return VariantValue(Value(value_str));
}
template <>
VariantValue ConvertShreddedValue<hugeint_t>::ConvertDecimal(hugeint_t val, uint8_t width, uint8_t scale) {
auto value_str = Decimal::ToString(val, width, scale);
return VariantValue(Value(value_str));
}
//! date
template <>
VariantValue ConvertShreddedValue<date_t>::Convert(date_t val) {
return VariantValue(Value::DATE(val));
}
//! time
template <>
VariantValue ConvertShreddedValue<dtime_t>::Convert(dtime_t val) {
return VariantValue(Value::TIME(val));
}
//! timestamptz(6)
template <>
VariantValue ConvertShreddedValue<timestamp_tz_t>::Convert(timestamp_tz_t val) {
return VariantValue(Value::TIMESTAMPTZ(val));
}
////! timestamptz(9)
// template <>
// VariantValue ConvertShreddedValue<timestamp_ns_tz_t>::Convert(timestamp_ns_tz_t val) {
// return VariantValue(Value::TIMESTAMPNS_TZ(val));
//}
//! timestampntz(6)
template <>
VariantValue ConvertShreddedValue<timestamp_t>::Convert(timestamp_t val) {
return VariantValue(Value::TIMESTAMP(val));
}
//! timestampntz(9)
template <>
VariantValue ConvertShreddedValue<timestamp_ns_t>::Convert(timestamp_ns_t val) {
return VariantValue(Value::TIMESTAMPNS(val));
}
//! binary
template <>
VariantValue ConvertShreddedValue<string_t>::ConvertBlob(string_t val) {
return VariantValue(Value(Blob::ToBase64(val)));
}
//! string
template <>
VariantValue ConvertShreddedValue<string_t>::Convert(string_t val) {
if (!Utf8Proc::IsValid(val.GetData(), val.GetSize())) {
throw InternalException("Can't decode Variant string, it isn't valid UTF8");
}
return VariantValue(Value(val.GetString()));
}
//! uuid
template <>
VariantValue ConvertShreddedValue<hugeint_t>::Convert(hugeint_t val) {
return VariantValue(Value(UUID::ToString(val)));
}
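// Converts one shredded leaf column: rows where 'typed_value' is valid use the
// shredded value directly (with dedicated paths for decimals and blobs), rows
// where it is NULL fall back to binary-decoding 'value'; for object fields a
// row where both are NULL means the field is missing entirely.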
template <class T, class OP, LogicalTypeId TYPE_ID>
vector<VariantValue> ConvertTypedValues(Vector &vec, Vector &metadata, Vector &blob, idx_t offset, idx_t length,
idx_t total_size, const bool is_field) {
UnifiedVectorFormat metadata_format;
metadata.ToUnifiedFormat(length, metadata_format);
auto metadata_data = metadata_format.GetData<string_t>(metadata_format);
UnifiedVectorFormat typed_format;
vec.ToUnifiedFormat(total_size, typed_format);
auto data = typed_format.GetData<T>(typed_format);
UnifiedVectorFormat value_format;
blob.ToUnifiedFormat(total_size, value_format);
auto value_data = value_format.GetData<string_t>(value_format);
auto &validity = typed_format.validity;
auto &value_validity = value_format.validity;
auto &type = vec.GetType();
//! Values only used for Decimal conversion
uint8_t width;
uint8_t scale;
if (TYPE_ID == LogicalTypeId::DECIMAL) {
type.GetDecimalProperties(width, scale);
}
vector<VariantValue> ret(length);
if (validity.AllValid()) {
for (idx_t i = 0; i < length; i++) {
auto index = typed_format.sel->get_index(i + offset);
if (TYPE_ID == LogicalTypeId::DECIMAL) {
ret[i] = OP::ConvertDecimal(data[index], width, scale);
} else if (TYPE_ID == LogicalTypeId::BLOB) {
ret[i] = OP::ConvertBlob(data[index]);
} else {
ret[i] = OP::Convert(data[index]);
}
}
} else {
for (idx_t i = 0; i < length; i++) {
auto typed_index = typed_format.sel->get_index(i + offset);
auto value_index = value_format.sel->get_index(i + offset);
if (validity.RowIsValid(typed_index)) {
//! This is a leaf, partially shredded values aren't possible here
D_ASSERT(!value_validity.RowIsValid(value_index));
if (TYPE_ID == LogicalTypeId::DECIMAL) {
ret[i] = OP::ConvertDecimal(data[typed_index], width, scale);
} else if (TYPE_ID == LogicalTypeId::BLOB) {
ret[i] = OP::ConvertBlob(data[typed_index]);
} else {
ret[i] = OP::Convert(data[typed_index]);
}
} else {
if (is_field && !value_validity.RowIsValid(value_index)) {
//! Value is missing for this field
continue;
}
D_ASSERT(value_validity.RowIsValid(value_index));
auto metadata_value = metadata_data[metadata_format.sel->get_index(i)];
VariantMetadata variant_metadata(metadata_value);
ret[i] = VariantBinaryDecoder::Decode(variant_metadata,
const_data_ptr_cast(value_data[value_index].GetData()));
}
}
}
return ret;
}
vector<VariantValue> VariantShreddedConversion::ConvertShreddedLeaf(Vector &metadata, Vector &value,
Vector &typed_value, idx_t offset, idx_t length,
idx_t total_size, const bool is_field) {
D_ASSERT(!typed_value.GetType().IsNested());
vector<VariantValue> result;
auto &type = typed_value.GetType();
switch (type.id()) {
//! boolean
case LogicalTypeId::BOOLEAN: {
return ConvertTypedValues<bool, ConvertShreddedValue<bool>, LogicalTypeId::BOOLEAN>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! int8
case LogicalTypeId::TINYINT: {
return ConvertTypedValues<int8_t, ConvertShreddedValue<int8_t>, LogicalTypeId::TINYINT>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! int16
case LogicalTypeId::SMALLINT: {
return ConvertTypedValues<int16_t, ConvertShreddedValue<int16_t>, LogicalTypeId::SMALLINT>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! int32
case LogicalTypeId::INTEGER: {
return ConvertTypedValues<int32_t, ConvertShreddedValue<int32_t>, LogicalTypeId::INTEGER>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! int64
case LogicalTypeId::BIGINT: {
return ConvertTypedValues<int64_t, ConvertShreddedValue<int64_t>, LogicalTypeId::BIGINT>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! float
case LogicalTypeId::FLOAT: {
return ConvertTypedValues<float, ConvertShreddedValue<float>, LogicalTypeId::FLOAT>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! double
case LogicalTypeId::DOUBLE: {
return ConvertTypedValues<double, ConvertShreddedValue<double>, LogicalTypeId::DOUBLE>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! decimal4/decimal8/decimal16
case LogicalTypeId::DECIMAL: {
auto physical_type = type.InternalType();
switch (physical_type) {
case PhysicalType::INT32: {
return ConvertTypedValues<int32_t, ConvertShreddedValue<int32_t>, LogicalTypeId::DECIMAL>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
case PhysicalType::INT64: {
return ConvertTypedValues<int64_t, ConvertShreddedValue<int64_t>, LogicalTypeId::DECIMAL>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
case PhysicalType::INT128: {
return ConvertTypedValues<hugeint_t, ConvertShreddedValue<hugeint_t>, LogicalTypeId::DECIMAL>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
default:
throw NotImplementedException("Decimal with PhysicalType (%s) not implemented for shredded Variant",
EnumUtil::ToString(physical_type));
}
}
//! date
case LogicalTypeId::DATE: {
return ConvertTypedValues<date_t, ConvertShreddedValue<date_t>, LogicalTypeId::DATE>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! time
case LogicalTypeId::TIME: {
return ConvertTypedValues<dtime_t, ConvertShreddedValue<dtime_t>, LogicalTypeId::TIME>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! timestamptz(6) (timestamptz(9) not implemented in DuckDB)
case LogicalTypeId::TIMESTAMP_TZ: {
return ConvertTypedValues<timestamp_tz_t, ConvertShreddedValue<timestamp_tz_t>, LogicalTypeId::TIMESTAMP_TZ>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! timestampntz(6)
case LogicalTypeId::TIMESTAMP: {
return ConvertTypedValues<timestamp_t, ConvertShreddedValue<timestamp_t>, LogicalTypeId::TIMESTAMP>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! timestampntz(9)
case LogicalTypeId::TIMESTAMP_NS: {
return ConvertTypedValues<timestamp_ns_t, ConvertShreddedValue<timestamp_ns_t>, LogicalTypeId::TIMESTAMP_NS>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! binary
case LogicalTypeId::BLOB: {
return ConvertTypedValues<string_t, ConvertShreddedValue<string_t>, LogicalTypeId::BLOB>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! string
case LogicalTypeId::VARCHAR: {
return ConvertTypedValues<string_t, ConvertShreddedValue<string_t>, LogicalTypeId::VARCHAR>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
//! uuid
case LogicalTypeId::UUID: {
return ConvertTypedValues<hugeint_t, ConvertShreddedValue<hugeint_t>, LogicalTypeId::UUID>(
typed_value, metadata, value, offset, length, total_size, is_field);
}
default:
throw NotImplementedException("Variant shredding on type: '%s' is not implemented", type.ToString());
}
}
namespace {
struct ShreddedVariantField {
public:
explicit ShreddedVariantField(const string &field_name) : field_name(field_name) {
}
public:
string field_name;
//! Values for the field, for all rows
vector<VariantValue> values;
};
} // namespace
template <bool IS_REQUIRED>
static vector<VariantValue> ConvertBinaryEncoding(Vector &metadata, Vector &value, idx_t offset, idx_t length,
idx_t total_size) {
UnifiedVectorFormat value_format;
value.ToUnifiedFormat(total_size, value_format);
auto value_data = value_format.GetData<string_t>(value_format);
auto &validity = value_format.validity;
UnifiedVectorFormat metadata_format;
metadata.ToUnifiedFormat(length, metadata_format);
auto metadata_data = metadata_format.GetData<string_t>(metadata_format);
auto metadata_validity = metadata_format.validity;
vector<VariantValue> ret(length);
if (IS_REQUIRED) {
for (idx_t i = 0; i < length; i++) {
auto index = value_format.sel->get_index(i + offset);
// Variant itself is NULL
if (!validity.RowIsValid(index) && !metadata_validity.RowIsValid(metadata_format.sel->get_index(i))) {
ret[i] = VariantValue(Value());
continue;
}
D_ASSERT(validity.RowIsValid(index));
auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)];
VariantMetadata variant_metadata(metadata_value);
auto binary_value = value_data[index].GetData();
ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value));
}
} else {
//! Even though 'typed_value' is not present, 'value' is allowed to contain NULLs because we're scanning an
//! Object's shredded field.
//! When 'value' is null for a row, that means the Object does not contain this field
//! for that row.
for (idx_t i = 0; i < length; i++) {
auto index = value_format.sel->get_index(i + offset);
if (validity.RowIsValid(index)) {
auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)];
VariantMetadata variant_metadata(metadata_value);
auto binary_value = value_data[index].GetData();
ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value));
}
}
}
return ret;
}
static VariantValue ConvertPartiallyShreddedObject(vector<ShreddedVariantField> &shredded_fields,
const UnifiedVectorFormat &metadata_format,
const UnifiedVectorFormat &value_format, idx_t i, idx_t offset) {
auto ret = VariantValue(VariantValueType::OBJECT);
auto index = value_format.sel->get_index(i + offset);
auto value_data = value_format.GetData<string_t>(value_format);
auto metadata_data = metadata_format.GetData<string_t>(metadata_format);
auto &value_validity = value_format.validity;
for (idx_t field_index = 0; field_index < shredded_fields.size(); field_index++) {
auto &shredded_field = shredded_fields[field_index];
auto &field_value = shredded_field.values[i];
if (field_value.IsMissing()) {
//! This field is missing from the value, skip it
continue;
}
ret.AddChild(shredded_field.field_name, std::move(field_value));
}
if (value_validity.RowIsValid(index)) {
//! Object is partially shredded, decode the object and merge the values
auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)];
VariantMetadata variant_metadata(metadata_value);
auto binary_value = value_data[index].GetData();
auto unshredded = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value));
if (unshredded.value_type != VariantValueType::OBJECT) {
throw InvalidInputException("Partially shredded objects have to encode Object Variants in the 'value'");
}
for (auto &item : unshredded.object_children) {
ret.AddChild(item.first, std::move(item.second));
}
}
return ret;
}
vector<VariantValue> VariantShreddedConversion::ConvertShreddedObject(Vector &metadata, Vector &value,
Vector &typed_value, idx_t offset, idx_t length,
idx_t total_size, const bool is_field) {
auto &type = typed_value.GetType();
D_ASSERT(type.id() == LogicalTypeId::STRUCT);
auto &fields = StructType::GetChildTypes(type);
auto &entries = StructVector::GetEntries(typed_value);
D_ASSERT(entries.size() == fields.size());
//! 'value'
UnifiedVectorFormat value_format;
value.ToUnifiedFormat(total_size, value_format);
auto value_data = value_format.GetData<string_t>(value_format);
auto &validity = value_format.validity;
(void)validity;
//! 'metadata'
UnifiedVectorFormat metadata_format;
metadata.ToUnifiedFormat(length, metadata_format);
auto metadata_data = metadata_format.GetData<string_t>(metadata_format);
//! 'typed_value'
UnifiedVectorFormat typed_format;
typed_value.ToUnifiedFormat(total_size, typed_format);
auto &typed_validity = typed_format.validity;
//! Process all fields to get the shredded field values
vector<ShreddedVariantField> shredded_fields;
shredded_fields.reserve(fields.size());
for (idx_t i = 0; i < fields.size(); i++) {
auto &field = fields[i];
auto &field_name = field.first;
auto &field_vec = *entries[i];
shredded_fields.emplace_back(field_name);
auto &shredded_field = shredded_fields.back();
shredded_field.values = Convert(metadata, field_vec, offset, length, total_size, true);
}
vector<VariantValue> ret(length);
if (typed_validity.AllValid()) {
for (idx_t i = 0; i < length; i++) {
ret[i] = ConvertPartiallyShreddedObject(shredded_fields, metadata_format, value_format, i, offset);
}
} else {
//! For some of the rows, the value is not an object
for (idx_t i = 0; i < length; i++) {
auto typed_index = typed_format.sel->get_index(i + offset);
auto value_index = value_format.sel->get_index(i + offset);
if (typed_validity.RowIsValid(typed_index)) {
ret[i] = ConvertPartiallyShreddedObject(shredded_fields, metadata_format, value_format, i, offset);
} else {
if (is_field && !validity.RowIsValid(value_index)) {
//! This object is a field in the parent object, the value is missing, skip it
continue;
}
D_ASSERT(validity.RowIsValid(value_index));
auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)];
VariantMetadata variant_metadata(metadata_value);
auto binary_value = value_data[value_index].GetData();
ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value));
if (ret[i].value_type == VariantValueType::OBJECT) {
throw InvalidInputException(
"When 'typed_value' for a shredded Object is NULL, 'value' can not contain an Object value");
}
}
}
}
return ret;
}
vector<VariantValue> VariantShreddedConversion::ConvertShreddedArray(Vector &metadata, Vector &value,
Vector &typed_value, idx_t offset, idx_t length,
idx_t total_size, const bool is_field) {
auto &child = ListVector::GetEntry(typed_value);
auto list_size = ListVector::GetListSize(typed_value);
//! 'value'
UnifiedVectorFormat value_format;
value.ToUnifiedFormat(total_size, value_format);
auto value_data = value_format.GetData<string_t>(value_format);
//! 'metadata'
UnifiedVectorFormat metadata_format;
metadata.ToUnifiedFormat(length, metadata_format);
auto metadata_data = metadata_format.GetData<string_t>(metadata_format);
//! 'typed_value'
UnifiedVectorFormat list_format;
typed_value.ToUnifiedFormat(total_size, list_format);
auto list_data = list_format.GetData<list_entry_t>(list_format);
auto &validity = list_format.validity;
auto &value_validity = value_format.validity;
vector<VariantValue> ret(length);
if (validity.AllValid()) {
//! We can be sure that none of the values are binary encoded
for (idx_t i = 0; i < length; i++) {
auto typed_index = list_format.sel->get_index(i + offset);
auto entry = list_data[typed_index];
Vector child_metadata(metadata.GetValue(i));
ret[i] = VariantValue(VariantValueType::ARRAY);
ret[i].array_items = Convert(child_metadata, child, entry.offset, entry.length, list_size, false);
}
} else {
for (idx_t i = 0; i < length; i++) {
auto typed_index = list_format.sel->get_index(i + offset);
auto value_index = value_format.sel->get_index(i + offset);
if (validity.RowIsValid(typed_index)) {
auto entry = list_data[typed_index];
Vector child_metadata(metadata.GetValue(i));
ret[i] = VariantValue(VariantValueType::ARRAY);
ret[i].array_items = Convert(child_metadata, child, entry.offset, entry.length, list_size, false);
} else {
if (is_field && !value_validity.RowIsValid(value_index)) {
//! Value is missing for this field
continue;
}
D_ASSERT(value_validity.RowIsValid(value_index));
auto metadata_value = metadata_data[metadata_format.sel->get_index(i)];
VariantMetadata variant_metadata(metadata_value);
ret[i] = VariantBinaryDecoder::Decode(variant_metadata,
const_data_ptr_cast(value_data[value_index].GetData()));
}
}
}
return ret;
}
vector<VariantValue> VariantShreddedConversion::Convert(Vector &metadata, Vector &group, idx_t offset, idx_t length,
idx_t total_size, bool is_field) {
D_ASSERT(group.GetType().id() == LogicalTypeId::STRUCT);
auto &group_entries = StructVector::GetEntries(group);
auto &group_type_children = StructType::GetChildTypes(group.GetType());
D_ASSERT(group_type_children.size() == group_entries.size());
//! From the spec:
//! The Parquet columns used to store variant metadata and values must be accessed by name, not by position.
optional_ptr<Vector> value;
optional_ptr<Vector> typed_value;
for (idx_t i = 0; i < group_entries.size(); i++) {
auto &name = group_type_children[i].first;
auto &vec = group_entries[i];
if (name == "value") {
value = vec.get();
} else if (name == "typed_value") {
typed_value = vec.get();
} else {
throw InvalidInputException("Variant group can only contain 'value'/'typed_value', not: %s", name);
}
}
if (!value) {
throw InvalidInputException("Required column 'value' not found in Variant group");
}
if (typed_value) {
auto &type = typed_value->GetType();
vector<VariantValue> ret;
if (type.id() == LogicalTypeId::STRUCT) {
return ConvertShreddedObject(metadata, *value, *typed_value, offset, length, total_size, is_field);
} else if (type.id() == LogicalTypeId::LIST) {
return ConvertShreddedArray(metadata, *value, *typed_value, offset, length, total_size, is_field);
} else {
return ConvertShreddedLeaf(metadata, *value, *typed_value, offset, length, total_size, is_field);
}
} else {
if (is_field) {
return ConvertBinaryEncoding<false>(metadata, *value, offset, length, total_size);
} else {
//! Only 'value' is present, we can assume this to be 'required', so it can't contain NULLs
return ConvertBinaryEncoding<true>(metadata, *value, offset, length, total_size);
}
}
}
} // namespace duckdb
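
A brief note on the layout this conversion operates on (illustrative sketch, not part of the commit): every variant group is a STRUCT whose children are looked up by name, with 'value' always holding the binary-encoded fallback and an optional 'typed_value' holding the shredded representation. The helper name below is hypothetical; the types and calls mirror VariantShreddedConversion::Convert above.

#include "duckdb.hpp"

namespace duckdb {

// Hypothetical helper: builds the STRUCT type that Convert() expects for a variant group.
static LogicalType MakeVariantGroupType(optional_ptr<const LogicalType> typed_value_type) {
	child_list_t<LogicalType> children;
	// 'value' always carries the binary-encoded fallback representation
	children.emplace_back("value", LogicalType::BLOB);
	if (typed_value_type) {
		// 'typed_value' is only present for shredded columns; Convert dispatches on its type:
		//   STRUCT -> ConvertShreddedObject
		//   LIST   -> ConvertShreddedArray
		//   other  -> ConvertShreddedLeaf
		children.emplace_back("typed_value", *typed_value_type);
	}
	// Without 'typed_value', Convert falls back to pure binary decoding of 'value'.
	return LogicalType::STRUCT(std::move(children));
}

} // namespace duckdb

The 'is_field' flag passed to Convert distinguishes object fields (where a NULL 'value' alongside a NULL 'typed_value' simply means the field is absent for that row) from top-level values and array elements, for which 'value' is treated as required.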

View File

@@ -0,0 +1,85 @@
#include "reader/variant/variant_value.hpp"
namespace duckdb {
void VariantValue::AddChild(const string &key, VariantValue &&val) {
D_ASSERT(value_type == VariantValueType::OBJECT);
object_children.emplace(key, std::move(val));
}
void VariantValue::AddItem(VariantValue &&val) {
D_ASSERT(value_type == VariantValueType::ARRAY);
array_items.push_back(std::move(val));
}
yyjson_mut_val *VariantValue::ToJSON(ClientContext &context, yyjson_mut_doc *doc) const {
switch (value_type) {
case VariantValueType::PRIMITIVE: {
if (primitive_value.IsNull()) {
return yyjson_mut_null(doc);
}
switch (primitive_value.type().id()) {
case LogicalTypeId::BOOLEAN: {
if (primitive_value.GetValue<bool>()) {
return yyjson_mut_true(doc);
} else {
return yyjson_mut_false(doc);
}
}
case LogicalTypeId::TINYINT:
return yyjson_mut_int(doc, primitive_value.GetValue<int8_t>());
case LogicalTypeId::SMALLINT:
return yyjson_mut_int(doc, primitive_value.GetValue<int16_t>());
case LogicalTypeId::INTEGER:
return yyjson_mut_int(doc, primitive_value.GetValue<int32_t>());
case LogicalTypeId::BIGINT:
return yyjson_mut_int(doc, primitive_value.GetValue<int64_t>());
case LogicalTypeId::FLOAT:
return yyjson_mut_real(doc, primitive_value.GetValue<float>());
case LogicalTypeId::DOUBLE:
return yyjson_mut_real(doc, primitive_value.GetValue<double>());
case LogicalTypeId::DATE:
case LogicalTypeId::TIME:
case LogicalTypeId::VARCHAR: {
auto value_str = primitive_value.ToString();
return yyjson_mut_strncpy(doc, value_str.c_str(), value_str.size());
}
case LogicalTypeId::TIMESTAMP: {
auto value_str = primitive_value.ToString();
return yyjson_mut_strncpy(doc, value_str.c_str(), value_str.size());
}
case LogicalTypeId::TIMESTAMP_TZ: {
auto value_str = primitive_value.CastAs(context, LogicalType::VARCHAR).GetValue<string>();
return yyjson_mut_strncpy(doc, value_str.c_str(), value_str.size());
}
case LogicalTypeId::TIMESTAMP_NS: {
auto value_str = primitive_value.CastAs(context, LogicalType::VARCHAR).GetValue<string>();
return yyjson_mut_strncpy(doc, value_str.c_str(), value_str.size());
}
default:
throw InternalException("Unexpected primitive type: %s", primitive_value.type().ToString());
}
}
case VariantValueType::OBJECT: {
auto obj = yyjson_mut_obj(doc);
for (const auto &it : object_children) {
auto &key = it.first;
auto value = it.second.ToJSON(context, doc);
yyjson_mut_obj_add_val(doc, obj, key.c_str(), value);
}
return obj;
}
case VariantValueType::ARRAY: {
auto arr = yyjson_mut_arr(doc);
for (auto &item : array_items) {
auto value = item.ToJSON(context, doc);
yyjson_mut_arr_add_val(arr, value);
}
return arr;
}
default:
throw InternalException("Can't serialize this VariantValue type to JSON");
}
}
} // namespace duckdb
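
For reference, a minimal end-to-end sketch of rendering a VariantValue tree to JSON text, mirroring how the column reader in the next file serializes each row. Only the VariantValue(VariantValueType) constructor, AddChild, and ToJSON calls from the file above are used; the include paths, any namespace wrapping of the bundled yyjson symbols, and the use of an in-memory database purely to obtain a ClientContext are assumptions.

#include <cstdio>
#include <cstdlib>

#include "duckdb.hpp"
#include "reader/variant/variant_value.hpp" // assumed in-tree include path
#include "yyjson.hpp"                        // assumed; symbols may live in a bundled namespace

using namespace duckdb;

int main() {
	// An in-memory database, only to obtain a ClientContext for ToJSON
	DuckDB db(nullptr);
	Connection con(db);

	// Build {"tags": []}: an OBJECT with a single ARRAY child
	VariantValue obj(VariantValueType::OBJECT);
	VariantValue tags(VariantValueType::ARRAY);
	obj.AddChild("tags", std::move(tags));

	yyjson_mut_doc *doc = yyjson_mut_doc_new(nullptr);
	yyjson_mut_val *root = obj.ToJSON(*con.context, doc);

	size_t len = 0;
	char *json = yyjson_mut_val_write_opts(root, YYJSON_WRITE_ALLOW_INF_AND_NAN, nullptr, &len, nullptr);
	printf("%.*s\n", static_cast<int>(len), json);

	free(json); // the default yyjson allocator uses malloc/free
	yyjson_mut_doc_free(doc);
	return 0;
}

Primitive leaves follow the PRIMITIVE branch of ToJSON above: integers and floats are emitted as JSON numbers, while dates, times, timestamps and strings are emitted as JSON strings.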

View File

@@ -0,0 +1,161 @@
#include "reader/variant_column_reader.hpp"
#include "reader/variant/variant_binary_decoder.hpp"
#include "reader/variant/variant_shredded_conversion.hpp"
namespace duckdb {
//===--------------------------------------------------------------------===//
// Variant Column Reader
//===--------------------------------------------------------------------===//
VariantColumnReader::VariantColumnReader(ClientContext &context, ParquetReader &reader,
const ParquetColumnSchema &schema,
vector<unique_ptr<ColumnReader>> child_readers_p)
: ColumnReader(reader, schema), context(context), child_readers(std::move(child_readers_p)) {
D_ASSERT(Type().InternalType() == PhysicalType::VARCHAR);
if (child_readers[0]->Schema().name == "metadata" && child_readers[1]->Schema().name == "value") {
metadata_reader_idx = 0;
value_reader_idx = 1;
} else if (child_readers[1]->Schema().name == "metadata" && child_readers[0]->Schema().name == "value") {
metadata_reader_idx = 1;
value_reader_idx = 0;
} else {
throw InternalException("The Variant column must have 'metadata' and 'value' as the first two columns");
}
}
ColumnReader &VariantColumnReader::GetChildReader(idx_t child_idx) {
if (!child_readers[child_idx]) {
throw InternalException("VariantColumnReader::GetChildReader(%d) - but this child reader is not set",
child_idx);
}
return *child_readers[child_idx].get();
}
void VariantColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns,
TProtocol &protocol_p) {
for (auto &child : child_readers) {
if (!child) {
continue;
}
child->InitializeRead(row_group_idx_p, columns, protocol_p);
}
}
static LogicalType GetIntermediateGroupType(optional_ptr<ColumnReader> typed_value) {
child_list_t<LogicalType> children;
children.emplace_back("value", LogicalType::BLOB);
if (typed_value) {
children.emplace_back("typed_value", typed_value->Type());
}
return LogicalType::STRUCT(std::move(children));
}
idx_t VariantColumnReader::Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) {
if (pending_skips > 0) {
throw InternalException("VariantColumnReader cannot have pending skips");
}
optional_ptr<ColumnReader> typed_value_reader = child_readers.size() == 3 ? child_readers[2].get() : nullptr;
// If the child reader values are all valid, "define_out" may not be initialized at all
// So, we just initialize them to all be valid beforehand
std::fill_n(define_out, num_values, MaxDefine());
optional_idx read_count;
Vector metadata_intermediate(LogicalType::BLOB, num_values);
Vector intermediate_group(GetIntermediateGroupType(typed_value_reader), num_values);
auto &group_entries = StructVector::GetEntries(intermediate_group);
auto &value_intermediate = *group_entries[0];
auto metadata_values =
child_readers[metadata_reader_idx]->Read(num_values, define_out, repeat_out, metadata_intermediate);
auto value_values = child_readers[value_reader_idx]->Read(num_values, define_out, repeat_out, value_intermediate);
D_ASSERT(child_readers[metadata_reader_idx]->Schema().name == "metadata");
D_ASSERT(child_readers[value_reader_idx]->Schema().name == "value");
if (metadata_values != value_values) {
throw InvalidInputException(
"The Variant column did not contain the same amount of values for 'metadata' and 'value'");
}
auto result_data = FlatVector::GetData<string_t>(result);
auto &result_validity = FlatVector::Validity(result);
vector<VariantValue> conversion_result;
if (typed_value_reader) {
auto typed_values = typed_value_reader->Read(num_values, define_out, repeat_out, *group_entries[1]);
if (typed_values != value_values) {
throw InvalidInputException(
"The shredded Variant column did not contain the same amount of values for 'typed_value' and 'value'");
}
}
conversion_result =
VariantShreddedConversion::Convert(metadata_intermediate, intermediate_group, 0, num_values, num_values, false);
for (idx_t i = 0; i < conversion_result.size(); i++) {
auto &variant = conversion_result[i];
if (variant.IsNull()) {
result_validity.SetInvalid(i);
continue;
}
//! Write the result to a string
VariantDecodeResult decode_result;
decode_result.doc = yyjson_mut_doc_new(nullptr);
auto json_val = variant.ToJSON(context, decode_result.doc);
size_t len;
decode_result.data =
yyjson_mut_val_write_opts(json_val, YYJSON_WRITE_ALLOW_INF_AND_NAN, nullptr, &len, nullptr);
if (!decode_result.data) {
throw InvalidInputException("Could not serialize the JSON to string, yyjson failed");
}
result_data[i] = StringVector::AddString(result, decode_result.data, static_cast<idx_t>(len));
}
read_count = value_values;
return read_count.GetIndex();
}
void VariantColumnReader::Skip(idx_t num_values) {
for (auto &child : child_readers) {
if (!child) {
continue;
}
child->Skip(num_values);
}
}
void VariantColumnReader::RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) {
for (auto &child : child_readers) {
if (!child) {
continue;
}
child->RegisterPrefetch(transport, allow_merge);
}
}
uint64_t VariantColumnReader::TotalCompressedSize() {
uint64_t size = 0;
for (auto &child : child_readers) {
if (!child) {
continue;
}
size += child->TotalCompressedSize();
}
return size;
}
idx_t VariantColumnReader::GroupRowsAvailable() {
for (auto &child : child_readers) {
if (!child) {
continue;
}
return child->GroupRowsAvailable();
}
throw InternalException("No projected columns in struct?");
}
} // namespace duckdb
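
The constructor and Read() above resolve the reader's children by name rather than by position: the first two children must be 'metadata' and 'value' in either order, and a third child, when present, is treated as 'typed_value'. A standalone restatement of that rule (the struct and helper names are hypothetical, for illustration only):

#include <stdexcept>
#include <string>
#include <vector>

struct VariantChildLayout {
	size_t metadata_idx = 0;
	size_t value_idx = 1;
	bool has_typed_value = false;
};

inline VariantChildLayout ResolveVariantChildren(const std::vector<std::string> &names) {
	VariantChildLayout layout;
	if (names.size() >= 2 && names[0] == "metadata" && names[1] == "value") {
		layout.metadata_idx = 0;
		layout.value_idx = 1;
	} else if (names.size() >= 2 && names[0] == "value" && names[1] == "metadata") {
		layout.metadata_idx = 1;
		layout.value_idx = 0;
	} else {
		throw std::runtime_error("Variant column must have 'metadata' and 'value' as its first two children");
	}
	layout.has_typed_value = names.size() == 3;
	return layout;
}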

View File

@@ -0,0 +1,117 @@
//===----------------------------------------------------------------------===//
// This file is automatically generated by scripts/generate_serialization.py
// Do not edit this file manually, your changes will be overwritten
//===----------------------------------------------------------------------===//
#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/common/serializer/deserializer.hpp"
#include "parquet_reader.hpp"
#include "parquet_crypto.hpp"
#include "parquet_field_id.hpp"
#include "parquet_shredding.hpp"
namespace duckdb {
void ChildFieldIDs::Serialize(Serializer &serializer) const {
serializer.WritePropertyWithDefault<case_insensitive_map_t<FieldID>>(100, "ids", ids.operator*());
}
ChildFieldIDs ChildFieldIDs::Deserialize(Deserializer &deserializer) {
ChildFieldIDs result;
deserializer.ReadPropertyWithDefault<case_insensitive_map_t<FieldID>>(100, "ids", result.ids.operator*());
return result;
}
void ChildShreddingTypes::Serialize(Serializer &serializer) const {
serializer.WritePropertyWithDefault<case_insensitive_map_t<ShreddingType>>(100, "types", types.operator*());
}
ChildShreddingTypes ChildShreddingTypes::Deserialize(Deserializer &deserializer) {
ChildShreddingTypes result;
deserializer.ReadPropertyWithDefault<case_insensitive_map_t<ShreddingType>>(100, "types", result.types.operator*());
return result;
}
void FieldID::Serialize(Serializer &serializer) const {
serializer.WritePropertyWithDefault<bool>(100, "set", set);
serializer.WritePropertyWithDefault<int32_t>(101, "field_id", field_id);
serializer.WriteProperty<ChildFieldIDs>(102, "child_field_ids", child_field_ids);
}
FieldID FieldID::Deserialize(Deserializer &deserializer) {
FieldID result;
deserializer.ReadPropertyWithDefault<bool>(100, "set", result.set);
deserializer.ReadPropertyWithDefault<int32_t>(101, "field_id", result.field_id);
deserializer.ReadProperty<ChildFieldIDs>(102, "child_field_ids", result.child_field_ids);
return result;
}
void ParquetColumnDefinition::Serialize(Serializer &serializer) const {
serializer.WritePropertyWithDefault<int32_t>(100, "field_id", field_id);
serializer.WritePropertyWithDefault<string>(101, "name", name);
serializer.WriteProperty<LogicalType>(103, "type", type);
serializer.WriteProperty<Value>(104, "default_value", default_value);
serializer.WritePropertyWithDefault<Value>(105, "identifier", identifier, Value());
}
ParquetColumnDefinition ParquetColumnDefinition::Deserialize(Deserializer &deserializer) {
ParquetColumnDefinition result;
deserializer.ReadPropertyWithDefault<int32_t>(100, "field_id", result.field_id);
deserializer.ReadPropertyWithDefault<string>(101, "name", result.name);
deserializer.ReadProperty<LogicalType>(103, "type", result.type);
deserializer.ReadProperty<Value>(104, "default_value", result.default_value);
deserializer.ReadPropertyWithExplicitDefault<Value>(105, "identifier", result.identifier, Value());
return result;
}
void ParquetEncryptionConfig::Serialize(Serializer &serializer) const {
serializer.WritePropertyWithDefault<string>(100, "footer_key", footer_key);
serializer.WritePropertyWithDefault<unordered_map<string, string>>(101, "column_keys", column_keys);
}
shared_ptr<ParquetEncryptionConfig> ParquetEncryptionConfig::Deserialize(Deserializer &deserializer) {
auto result = duckdb::shared_ptr<ParquetEncryptionConfig>(new ParquetEncryptionConfig());
deserializer.ReadPropertyWithDefault<string>(100, "footer_key", result->footer_key);
deserializer.ReadPropertyWithDefault<unordered_map<string, string>>(101, "column_keys", result->column_keys);
return result;
}
void ParquetOptionsSerialization::Serialize(Serializer &serializer) const {
serializer.WritePropertyWithDefault<bool>(100, "binary_as_string", parquet_options.binary_as_string);
serializer.WritePropertyWithDefault<bool>(101, "file_row_number", parquet_options.file_row_number);
serializer.WriteProperty<MultiFileOptions>(102, "file_options", file_options);
serializer.WritePropertyWithDefault<vector<ParquetColumnDefinition>>(103, "schema", parquet_options.schema);
serializer.WritePropertyWithDefault<shared_ptr<ParquetEncryptionConfig>>(104, "encryption_config", parquet_options.encryption_config, nullptr);
serializer.WritePropertyWithDefault<bool>(105, "debug_use_openssl", parquet_options.debug_use_openssl, true);
serializer.WritePropertyWithDefault<idx_t>(106, "explicit_cardinality", parquet_options.explicit_cardinality, 0);
serializer.WritePropertyWithDefault<bool>(107, "can_have_nan", parquet_options.can_have_nan, false);
}
ParquetOptionsSerialization ParquetOptionsSerialization::Deserialize(Deserializer &deserializer) {
ParquetOptionsSerialization result;
deserializer.ReadPropertyWithDefault<bool>(100, "binary_as_string", result.parquet_options.binary_as_string);
deserializer.ReadPropertyWithDefault<bool>(101, "file_row_number", result.parquet_options.file_row_number);
deserializer.ReadProperty<MultiFileOptions>(102, "file_options", result.file_options);
deserializer.ReadPropertyWithDefault<vector<ParquetColumnDefinition>>(103, "schema", result.parquet_options.schema);
deserializer.ReadPropertyWithExplicitDefault<shared_ptr<ParquetEncryptionConfig>>(104, "encryption_config", result.parquet_options.encryption_config, nullptr);
deserializer.ReadPropertyWithExplicitDefault<bool>(105, "debug_use_openssl", result.parquet_options.debug_use_openssl, true);
deserializer.ReadPropertyWithExplicitDefault<idx_t>(106, "explicit_cardinality", result.parquet_options.explicit_cardinality, 0);
deserializer.ReadPropertyWithExplicitDefault<bool>(107, "can_have_nan", result.parquet_options.can_have_nan, false);
return result;
}
void ShreddingType::Serialize(Serializer &serializer) const {
serializer.WritePropertyWithDefault<bool>(100, "set", set);
serializer.WriteProperty<LogicalType>(101, "type", type);
serializer.WriteProperty<ChildShreddingTypes>(102, "children", children);
}
ShreddingType ShreddingType::Deserialize(Deserializer &deserializer) {
ShreddingType result;
deserializer.ReadPropertyWithDefault<bool>(100, "set", result.set);
deserializer.ReadProperty<LogicalType>(101, "type", result.type);
deserializer.ReadProperty<ChildShreddingTypes>(102, "children", result.children);
return result;
}
} // namespace duckdb
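
The generated serializers above follow a fixed convention: each field gets a stable numeric ID starting at 100, and optional fields are written with WritePropertyWithDefault and read back with ReadPropertyWithDefault / ReadPropertyWithExplicitDefault, so a reader can fall back to a default when the field was omitted. A hand-written sketch (hypothetical struct, not generated, not part of the commit) following the same convention and using only the Serializer/Deserializer calls that appear in the file above:

#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/common/serializer/deserializer.hpp"

namespace duckdb {

struct ExampleOptions { // hypothetical struct, for illustration only
	bool enabled = false;
	idx_t limit = 0;

	void Serialize(Serializer &serializer) const {
		serializer.WritePropertyWithDefault<bool>(100, "enabled", enabled);
		serializer.WritePropertyWithDefault<idx_t>(101, "limit", limit, 0);
	}
	static ExampleOptions Deserialize(Deserializer &deserializer) {
		ExampleOptions result;
		deserializer.ReadPropertyWithDefault<bool>(100, "enabled", result.enabled);
		deserializer.ReadPropertyWithExplicitDefault<idx_t>(101, "limit", result.limit, 0);
		return result;
	}
};

} // namespace duckdb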

Some files were not shown because too many files have changed in this diff.