should be it
external/duckdb/extension/parquet/decoder/CMakeLists.txt (vendored, new file)
@@ -0,0 +1,12 @@
add_library_unity(
  duckdb_parquet_decoders
  OBJECT
  byte_stream_split_decoder.cpp
  delta_binary_packed_decoder.cpp
  delta_byte_array_decoder.cpp
  delta_length_byte_array_decoder.cpp
  dictionary_decoder.cpp
  rle_decoder.cpp)
set(PARQUET_EXTENSION_FILES
    ${PARQUET_EXTENSION_FILES} $<TARGET_OBJECTS:duckdb_parquet_decoders>
    PARENT_SCOPE)
external/duckdb/extension/parquet/decoder/byte_stream_split_decoder.cpp (vendored, new file)
@@ -0,0 +1,54 @@
#include "decoder/byte_stream_split_decoder.hpp"
#include "column_reader.hpp"
#include "parquet_reader.hpp"

namespace duckdb {

ByteStreamSplitDecoder::ByteStreamSplitDecoder(ColumnReader &reader)
    : reader(reader), decoded_data_buffer(reader.encoding_buffers[0]) {
}

void ByteStreamSplitDecoder::InitializePage() {
	auto &block = reader.block;
	// Subtract 1 from length as the block is allocated with 1 extra byte,
	// but the byte stream split encoder needs to know the correct data size.
	bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
	block->inc(block->len);
}

void ByteStreamSplitDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
	idx_t valid_count = reader.GetValidCount(defines, read_count, result_offset);

	auto &allocator = reader.reader.allocator;
	decoded_data_buffer.reset();
	switch (reader.Schema().parquet_type) {
	case duckdb_parquet::Type::FLOAT:
		decoded_data_buffer.resize(allocator, sizeof(float) * valid_count);
		bss_decoder->GetBatch<float>(decoded_data_buffer.ptr, valid_count);
		break;
	case duckdb_parquet::Type::DOUBLE:
		decoded_data_buffer.resize(allocator, sizeof(double) * valid_count);
		bss_decoder->GetBatch<double>(decoded_data_buffer.ptr, valid_count);
		break;
	default:
		throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
	}

	reader.Plain(decoded_data_buffer, defines, read_count, result_offset, result);
}

void ByteStreamSplitDecoder::Skip(uint8_t *defines, idx_t skip_count) {
	idx_t valid_count = reader.GetValidCount(defines, skip_count);
	switch (reader.Schema().parquet_type) {
	case duckdb_parquet::Type::FLOAT:
		bss_decoder->Skip<float>(valid_count);
		break;
	case duckdb_parquet::Type::DOUBLE:
		bss_decoder->Skip<double>(valid_count);
		break;
	default:
		throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
	}
}

} // namespace duckdb
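For context: BYTE_STREAM_SPLIT stores the k-th byte of every value contiguously, so a page of N FLOAT values holds four streams of N bytes each, and decoding gathers byte k of value i from stream k. Below is a minimal standalone sketch of that gather step; the function name is illustrative and this is not the vendored BssDecoder.

#include <cstddef>
#include <cstdint>
#include <vector>

// Gather step: stream k (num_values bytes long) holds byte k of every value,
// so value i is reassembled from byte i of each of the sizeof(T) streams.
template <typename T>
std::vector<T> DecodeByteStreamSplit(const uint8_t *encoded, size_t num_values) {
	constexpr size_t width = sizeof(T);
	std::vector<T> values(num_values);
	auto out = reinterpret_cast<uint8_t *>(values.data());
	for (size_t k = 0; k < width; k++) {
		const uint8_t *stream = encoded + k * num_values;
		for (size_t i = 0; i < num_values; i++) {
			out[i * width + k] = stream[i];
		}
	}
	return values;
}

For example, DecodeByteStreamSplit<float>(page_data, 1024) reassembles 1024 floats from a 4096-byte page. The layout exists because grouping like-positioned bytes (exponents together, mantissa bytes together) compresses far better than interleaved floats.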
external/duckdb/extension/parquet/decoder/delta_binary_packed_decoder.cpp (vendored, new file)
@@ -0,0 +1,54 @@
#include "decoder/delta_binary_packed_decoder.hpp"
#include "column_reader.hpp"
#include "parquet_reader.hpp"

namespace duckdb {

DeltaBinaryPackedDecoder::DeltaBinaryPackedDecoder(ColumnReader &reader)
    : reader(reader), decoded_data_buffer(reader.encoding_buffers[0]) {
}

void DeltaBinaryPackedDecoder::InitializePage() {
	auto &block = reader.block;
	dbp_decoder = make_uniq<DbpDecoder>(block->ptr, block->len);
	block->inc(block->len);
}

void DeltaBinaryPackedDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
	idx_t valid_count = reader.GetValidCount(defines, read_count, result_offset);

	auto &allocator = reader.reader.allocator;
	decoded_data_buffer.reset();
	switch (reader.Schema().parquet_type) {
	case duckdb_parquet::Type::INT32:
		decoded_data_buffer.resize(allocator, sizeof(int32_t) * (valid_count));
		dbp_decoder->GetBatch<int32_t>(decoded_data_buffer.ptr, valid_count);
		break;
	case duckdb_parquet::Type::INT64:
		decoded_data_buffer.resize(allocator, sizeof(int64_t) * (valid_count));
		dbp_decoder->GetBatch<int64_t>(decoded_data_buffer.ptr, valid_count);
		break;
	default:
		throw std::runtime_error("DELTA_BINARY_PACKED should only be INT32 or INT64");
	}
	// Plain() will put NULLs in the right place
	reader.Plain(decoded_data_buffer, defines, read_count, result_offset, result);
}

void DeltaBinaryPackedDecoder::Skip(uint8_t *defines, idx_t skip_count) {
	idx_t valid_count = reader.GetValidCount(defines, skip_count);
	switch (reader.Schema().parquet_type) {
	case duckdb_parquet::Type::INT32:
		dbp_decoder->Skip<int32_t>(valid_count);
		break;
	case duckdb_parquet::Type::INT64:
		dbp_decoder->Skip<int64_t>(valid_count);
		break;
	default:
		throw std::runtime_error("DELTA_BINARY_PACKED should only be INT32 or INT64");
	}
}

} // namespace duckdb
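For context: DELTA_BINARY_PACKED stores a first value followed by blocks of bit-packed deltas, where each stored delta is an offset from the block's minimum delta. The sketch below shows only the reconstruction step that a GetBatch call performs after bit-unpacking; header parsing, zigzag decoding, and miniblock handling are omitted, and the names are illustrative rather than the vendored DbpDecoder API.

#include <cstdint>
#include <vector>

// Rebuild values by prefix-summing (min_delta + unpacked_delta) onto the
// running value, starting from the page's first value.
std::vector<int64_t> ReconstructDbpValues(int64_t first_value, int64_t min_delta,
                                          const std::vector<uint64_t> &unpacked_deltas) {
	std::vector<int64_t> values {first_value};
	int64_t current = first_value;
	for (auto unpacked : unpacked_deltas) {
		// each stored delta is relative to the block's minimum delta, which
		// keeps the bit-packed values non-negative and narrow
		current += min_delta + static_cast<int64_t>(unpacked);
		values.push_back(current);
	}
	return values;
}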
external/duckdb/extension/parquet/decoder/delta_byte_array_decoder.cpp (vendored, new file)
@@ -0,0 +1,103 @@
#include "decoder/delta_byte_array_decoder.hpp"
#include "column_reader.hpp"
#include "parquet_reader.hpp"
#include "reader/templated_column_reader.hpp"

namespace duckdb {

DeltaByteArrayDecoder::DeltaByteArrayDecoder(ColumnReader &reader) : reader(reader) {
}

void DeltaByteArrayDecoder::ReadDbpData(Allocator &allocator, ResizeableBuffer &buffer, ResizeableBuffer &result_buffer,
                                        idx_t &value_count) {
	auto decoder = make_uniq<DbpDecoder>(buffer.ptr, buffer.len);
	value_count = decoder->TotalValues();
	result_buffer.reset();
	result_buffer.resize(allocator, sizeof(uint32_t) * value_count);
	decoder->GetBatch<uint32_t>(result_buffer.ptr, value_count);
	decoder->Finalize();
	buffer.inc(buffer.len - decoder->BufferPtr().len);
}

void DeltaByteArrayDecoder::InitializePage() {
	if (reader.Type().InternalType() != PhysicalType::VARCHAR) {
		throw std::runtime_error("Delta Byte Array encoding is only supported for string/blob data");
	}
	auto &block = *reader.block;
	auto &allocator = reader.reader.allocator;
	idx_t prefix_count, suffix_count;
	auto &prefix_buffer = reader.encoding_buffers[0];
	auto &suffix_buffer = reader.encoding_buffers[1];
	ReadDbpData(allocator, block, prefix_buffer, prefix_count);
	ReadDbpData(allocator, block, suffix_buffer, suffix_count);
	if (prefix_count != suffix_count) {
		throw std::runtime_error("DELTA_BYTE_ARRAY - prefix and suffix counts are different - corrupt file?");
	}
	if (prefix_count == 0) {
		// no values
		byte_array_data = make_uniq<Vector>(LogicalType::VARCHAR, nullptr);
		return;
	}
	auto prefix_data = reinterpret_cast<uint32_t *>(prefix_buffer.ptr);
	auto suffix_data = reinterpret_cast<uint32_t *>(suffix_buffer.ptr);
	byte_array_data = make_uniq<Vector>(LogicalType::VARCHAR, prefix_count);
	byte_array_count = prefix_count;
	delta_offset = 0;
	auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
	for (idx_t i = 0; i < prefix_count; i++) {
		auto str_len = prefix_data[i] + suffix_data[i];
		block.available(suffix_data[i]);
		string_data[i] = StringVector::EmptyString(*byte_array_data, str_len);
		auto result_data = string_data[i].GetDataWriteable();
		if (prefix_data[i] > 0) {
			if (i == 0 || prefix_data[i] > string_data[i - 1].GetSize()) {
				throw std::runtime_error("DELTA_BYTE_ARRAY - prefix is out of range - corrupt file?");
			}
			memcpy(result_data, string_data[i - 1].GetData(), prefix_data[i]);
		}
		memcpy(result_data + prefix_data[i], block.ptr, suffix_data[i]);
		block.inc(suffix_data[i]);
		string_data[i].Finalize();
	}
}

void DeltaByteArrayDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
	if (!byte_array_data) {
		throw std::runtime_error("Internal error - DeltaByteArray called but there was no byte_array_data set");
	}
	auto result_ptr = FlatVector::GetData<string_t>(result);
	auto &result_mask = FlatVector::Validity(result);
	auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
	for (idx_t row_idx = 0; row_idx < read_count; row_idx++) {
		if (defines && defines[row_idx + result_offset] != reader.MaxDefine()) {
			result_mask.SetInvalid(row_idx + result_offset);
			continue;
		}
		if (delta_offset >= byte_array_count) {
			throw IOException("DELTA_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
			                  "read of %d from %d entries) - corrupt file?",
			                  delta_offset + 1, byte_array_count);
		}
		result_ptr[row_idx + result_offset] = string_data[delta_offset++];
	}
	StringVector::AddHeapReference(result, *byte_array_data);
}

void DeltaByteArrayDecoder::Skip(uint8_t *defines, idx_t skip_count) {
	if (!byte_array_data) {
		throw std::runtime_error("Internal error - DeltaByteArray called but there was no byte_array_data set");
	}
	for (idx_t row_idx = 0; row_idx < skip_count; row_idx++) {
		if (defines && defines[row_idx] != reader.MaxDefine()) {
			continue;
		}
		if (delta_offset >= byte_array_count) {
			throw IOException("DELTA_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
			                  "read of %d from %d entries) - corrupt file?",
			                  delta_offset + 1, byte_array_count);
		}
		delta_offset++;
	}
}

} // namespace duckdb
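For context: DELTA_BYTE_ARRAY encodes each string as a prefix length (how many leading bytes it shares with the previous string) plus fresh suffix bytes; both length arrays are themselves DELTA_BINARY_PACKED, which is what the two ReadDbpData calls above decode. A standalone sketch of the string reconstruction, over plain std:: containers rather than the DuckDB vector machinery:

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

// Rebuild each string as (prefix of the previous string) + (fresh suffix bytes).
std::vector<std::string> DecodeDeltaByteArray(const std::vector<uint32_t> &prefix_len,
                                              const std::vector<uint32_t> &suffix_len,
                                              const uint8_t *suffix_bytes) {
	std::vector<std::string> out;
	std::string previous;
	size_t offset = 0;
	for (size_t i = 0; i < prefix_len.size(); i++) {
		if (prefix_len[i] > previous.size()) {
			// mirrors the out-of-range check in the decoder above
			throw std::runtime_error("DELTA_BYTE_ARRAY - prefix is out of range");
		}
		std::string value = previous.substr(0, prefix_len[i]);
		value.append(reinterpret_cast<const char *>(suffix_bytes) + offset, suffix_len[i]);
		offset += suffix_len[i];
		out.push_back(value);
		previous = std::move(value);
	}
	return out;
}

Because each string can only borrow from its immediate predecessor, the whole page has to be materialized up front, which is exactly why the decoder builds byte_array_data in InitializePage and then hands out references in Read.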
external/duckdb/extension/parquet/decoder/delta_length_byte_array_decoder.cpp (vendored, new file)
@@ -0,0 +1,128 @@
#include "decoder/delta_length_byte_array_decoder.hpp"
#include "decoder/delta_byte_array_decoder.hpp"
#include "column_reader.hpp"
#include "parquet_reader.hpp"
#include "reader/string_column_reader.hpp"
#include "utf8proc_wrapper.hpp"

namespace duckdb {

DeltaLengthByteArrayDecoder::DeltaLengthByteArrayDecoder(ColumnReader &reader)
    : reader(reader), length_buffer(reader.encoding_buffers[0]), length_idx(0) {
}

void DeltaLengthByteArrayDecoder::InitializePage() {
	if (reader.Type().InternalType() != PhysicalType::VARCHAR) {
		throw std::runtime_error("Delta Length Byte Array encoding is only supported for string/blob data");
	}
	// read the binary packed lengths
	auto &block = *reader.block;
	auto &allocator = reader.reader.allocator;
	DeltaByteArrayDecoder::ReadDbpData(allocator, block, length_buffer, byte_array_count);

	// Verify that the sum of the DBP string lengths matches up with the available string data
	idx_t total_string_length = 0;
	const auto length_data = reinterpret_cast<uint32_t *>(length_buffer.ptr);
	for (idx_t i = 0; i < byte_array_count; i++) {
		total_string_length += length_data[i];
	}
	block.available(total_string_length);

	length_idx = 0;
}

void DeltaLengthByteArrayDecoder::Read(shared_ptr<ResizeableBuffer> &block_ref, uint8_t *defines, idx_t read_count,
                                       Vector &result, idx_t result_offset) {
	if (defines) {
		ReadInternal<true>(block_ref, defines, read_count, result, result_offset);
	} else {
		ReadInternal<false>(block_ref, defines, read_count, result, result_offset);
	}
}

template <bool HAS_DEFINES>
void DeltaLengthByteArrayDecoder::ReadInternal(shared_ptr<ResizeableBuffer> &block_ref, uint8_t *const defines,
                                               const idx_t read_count, Vector &result, const idx_t result_offset) {
	auto &block = *block_ref;
	const auto length_data = reinterpret_cast<uint32_t *>(length_buffer.ptr);
	auto result_data = FlatVector::GetData<string_t>(result);
	auto &result_mask = FlatVector::Validity(result);

	if (!HAS_DEFINES) {
		// Fast path: take this out of the loop below
		if (length_idx + read_count > byte_array_count) {
			throw IOException(
			    "DELTA_LENGTH_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
			    "read of %d from %d entries) - corrupt file?",
			    length_idx + read_count, byte_array_count);
		}
	}

	const auto start_ptr = block.ptr;
	for (idx_t row_idx = 0; row_idx < read_count; row_idx++) {
		const auto result_idx = result_offset + row_idx;
		if (HAS_DEFINES) {
			if (defines[result_idx] != reader.MaxDefine()) {
				result_mask.SetInvalid(result_idx);
				continue;
			}
			if (length_idx >= byte_array_count) {
				throw IOException(
				    "DELTA_LENGTH_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
				    "read of %d from %d entries) - corrupt file?",
				    length_idx, byte_array_count);
			}
		}
		const auto &str_len = length_data[length_idx++];
		result_data[result_idx] = string_t(char_ptr_cast(block.ptr), str_len);
		block.unsafe_inc(str_len);
	}

	// Verify that the strings we read are valid UTF-8
	reader.Cast<StringColumnReader>().VerifyString(char_ptr_cast(start_ptr), block.ptr - start_ptr);

	StringColumnReader::ReferenceBlock(result, block_ref);
}

void DeltaLengthByteArrayDecoder::Skip(uint8_t *defines, idx_t skip_count) {
	if (defines) {
		SkipInternal<true>(defines, skip_count);
	} else {
		SkipInternal<false>(defines, skip_count);
	}
}

template <bool HAS_DEFINES>
void DeltaLengthByteArrayDecoder::SkipInternal(uint8_t *defines, idx_t skip_count) {
	auto &block = *reader.block;
	const auto length_data = reinterpret_cast<uint32_t *>(length_buffer.ptr);

	if (!HAS_DEFINES) {
		// Fast path: take this out of the loop below
		if (length_idx + skip_count > byte_array_count) {
			throw IOException(
			    "DELTA_LENGTH_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
			    "read of %d from %d entries) - corrupt file?",
			    length_idx + skip_count, byte_array_count);
		}
	}

	idx_t skip_bytes = 0;
	for (idx_t row_idx = 0; row_idx < skip_count; row_idx++) {
		if (HAS_DEFINES) {
			if (defines[row_idx] != reader.MaxDefine()) {
				continue;
			}
			if (length_idx >= byte_array_count) {
				throw IOException(
				    "DELTA_LENGTH_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
				    "read of %d from %d entries) - corrupt file?",
				    length_idx, byte_array_count);
			}
		}
		skip_bytes += length_data[length_idx++];
	}
	block.inc(skip_bytes);
}

} // namespace duckdb
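For context: DELTA_LENGTH_BYTE_ARRAY stores all string lengths DBP-encoded up front, followed by the concatenated string bytes. The decoder above builds zero-copy string_t values pointing straight into the page buffer, which is why it calls ReferenceBlock to keep the block alive. A standalone sketch of the slicing, assuming the lengths are already decoded:

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <string_view>
#include <vector>

// Zero-copy slicing: each value is a view into the shared page buffer.
std::vector<std::string_view> SliceByLengths(const std::vector<uint32_t> &lengths,
                                             const char *data, size_t data_len) {
	std::vector<std::string_view> out;
	out.reserve(lengths.size());
	size_t offset = 0;
	for (auto len : lengths) {
		if (offset + len > data_len) {
			// mirrors the block.available() check in InitializePage above
			throw std::runtime_error("string lengths overrun the page data");
		}
		out.emplace_back(data + offset, len);
		offset += len;
	}
	return out;
}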
external/duckdb/extension/parquet/decoder/dictionary_decoder.cpp (vendored, new file)
@@ -0,0 +1,229 @@
#include "decoder/dictionary_decoder.hpp"
#include "column_reader.hpp"
#include "parquet_reader.hpp"
#include "duckdb/planner/filter/conjunction_filter.hpp"
#include "duckdb/planner/filter/expression_filter.hpp"
#include "duckdb/planner/table_filter_state.hpp"

namespace duckdb {

DictionaryDecoder::DictionaryDecoder(ColumnReader &reader)
    : reader(reader), offset_buffer(reader.encoding_buffers[0]), valid_sel(STANDARD_VECTOR_SIZE),
      dictionary_selection_vector(STANDARD_VECTOR_SIZE), dictionary_size(0) {
}

void DictionaryDecoder::InitializeDictionary(idx_t new_dictionary_size, optional_ptr<const TableFilter> filter,
                                             optional_ptr<TableFilterState> filter_state, bool has_defines) {
	dictionary_size = new_dictionary_size;
	filter_result.reset();
	filter_count = 0;
	can_have_nulls = has_defines;

	// we use the last entry as a NULL, dictionary vectors don't have a separate validity mask
	const auto duckdb_dictionary_size = dictionary_size + can_have_nulls;
	dictionary = DictionaryVector::CreateReusableDictionary(reader.Type(), duckdb_dictionary_size);
	auto &dict_validity = FlatVector::Validity(dictionary->data);
	dict_validity.Reset(duckdb_dictionary_size);
	if (can_have_nulls) {
		dict_validity.SetInvalid(dictionary_size);
	}

	// now read the non-NULL values from Parquet
	reader.Plain(reader.block, nullptr, dictionary_size, 0, dictionary->data);

	// immediately filter the dictionary, if applicable
	if (filter && CanFilter(*filter, *filter_state)) {
		// no filter result yet - apply filter to the dictionary
		// initialize the filter result - setting everything to false
		filter_result = make_unsafe_uniq_array<bool>(duckdb_dictionary_size);

		// apply the filter
		UnifiedVectorFormat vdata;
		dictionary->data.ToUnifiedFormat(duckdb_dictionary_size, vdata);
		SelectionVector dict_sel;
		filter_count = duckdb_dictionary_size;
		ColumnSegment::FilterSelection(dict_sel, dictionary->data, vdata, *filter, *filter_state,
		                               duckdb_dictionary_size, filter_count);

		// now set all matching tuples to true
		for (idx_t i = 0; i < filter_count; i++) {
			auto idx = dict_sel.get_index(i);
			filter_result[idx] = true;
		}
	}
}

void DictionaryDecoder::InitializePage() {
	// dictionary-encoded data pages prefix the RLE/bit-packed offsets with their bit width
	auto &block = reader.block;
	auto dict_width = block->read<uint8_t>();
	dict_decoder = make_uniq<RleBpDecoder>(block->ptr, block->len, dict_width);
	block->inc(block->len);
}

void DictionaryDecoder::ConvertDictToSelVec(uint32_t *offsets, const SelectionVector &rows, idx_t count) {
	D_ASSERT(count <= STANDARD_VECTOR_SIZE);
	for (idx_t idx = 0; idx < count; idx++) {
		auto row_idx = rows.get_index(idx);
		auto offset = offsets[idx];
		if (offset >= dictionary_size) {
			throw std::runtime_error("Parquet file is likely corrupted, dictionary offset out of range");
		}
		dictionary_selection_vector.set_index(row_idx, offset);
	}
}

idx_t DictionaryDecoder::GetValidValues(uint8_t *defines, idx_t read_count, idx_t result_offset) {
	idx_t valid_count = read_count;
	if (defines) {
		D_ASSERT(can_have_nulls);
		valid_count = 0;
		for (idx_t i = 0; i < read_count; i++) {
			valid_sel.set_index(valid_count, i);
			dictionary_selection_vector.set_index(i, dictionary_size);
			valid_count += defines[result_offset + i] == reader.MaxDefine();
		}
	}
	return valid_count;
}

idx_t DictionaryDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
	if (!dictionary || dictionary_size < 0) {
		throw std::runtime_error("Parquet file is likely corrupted, missing dictionary");
	}
	idx_t valid_count = GetValidValues(defines, read_count, result_offset);
	if (valid_count == read_count) {
		// all values are valid - we can directly decompress the offsets into the selection vector
		dict_decoder->GetBatch<uint32_t>(data_ptr_cast(dictionary_selection_vector.data()),
		                                 NumericCast<uint32_t>(valid_count));
		// we do still need to verify the offsets though
		uint32_t max_index = 0;
		for (idx_t idx = 0; idx < valid_count; idx++) {
			max_index = MaxValue(max_index, dictionary_selection_vector[idx]);
		}
		if (max_index >= dictionary_size) {
			throw std::runtime_error("Parquet file is likely corrupted, dictionary offset out of range");
		}
	} else if (valid_count > 0) {
		// for the valid entries - decode the offsets
		offset_buffer.resize(reader.reader.allocator, sizeof(uint32_t) * valid_count);
		dict_decoder->GetBatch<uint32_t>(offset_buffer.ptr, NumericCast<uint32_t>(valid_count));
		ConvertDictToSelVec(reinterpret_cast<uint32_t *>(offset_buffer.ptr), valid_sel, valid_count);
	}
#ifdef DEBUG
	dictionary_selection_vector.Verify(read_count, dictionary_size + can_have_nulls);
#endif
	if (result_offset == 0) {
		result.Dictionary(dictionary, dictionary_selection_vector);
		D_ASSERT(result.GetVectorType() == VectorType::DICTIONARY_VECTOR);
	} else {
		D_ASSERT(result.GetVectorType() == VectorType::FLAT_VECTOR);
		VectorOperations::Copy(dictionary->data, result, dictionary_selection_vector, read_count, 0, result_offset);
	}
	return valid_count;
}

void DictionaryDecoder::Skip(uint8_t *defines, idx_t skip_count) {
	if (!dictionary || dictionary_size < 0) {
		throw std::runtime_error("Parquet file is likely corrupted, missing dictionary");
	}
	idx_t valid_count = reader.GetValidCount(defines, skip_count);
	// skip past the valid offsets
	dict_decoder->Skip(NumericCast<uint32_t>(valid_count));
}

bool DictionaryDecoder::DictionarySupportsFilter(const TableFilter &filter, TableFilterState &filter_state) {
	switch (filter.filter_type) {
	case TableFilterType::CONJUNCTION_OR: {
		auto &conjunction = filter.Cast<ConjunctionOrFilter>();
		auto &state = filter_state.Cast<ConjunctionOrFilterState>();
		for (idx_t child_idx = 0; child_idx < conjunction.child_filters.size(); child_idx++) {
			auto &child_filter = *conjunction.child_filters[child_idx];
			auto &child_state = *state.child_states[child_idx];
			if (!DictionarySupportsFilter(child_filter, child_state)) {
				return false;
			}
		}
		return true;
	}
	case TableFilterType::CONJUNCTION_AND: {
		auto &conjunction = filter.Cast<ConjunctionAndFilter>();
		auto &state = filter_state.Cast<ConjunctionAndFilterState>();
		for (idx_t child_idx = 0; child_idx < conjunction.child_filters.size(); child_idx++) {
			auto &child_filter = *conjunction.child_filters[child_idx];
			auto &child_state = *state.child_states[child_idx];
			if (!DictionarySupportsFilter(child_filter, child_state)) {
				return false;
			}
		}
		return true;
	}
	case TableFilterType::CONSTANT_COMPARISON:
	case TableFilterType::IS_NOT_NULL:
		return true;
	case TableFilterType::EXPRESSION_FILTER: {
		// expression filters can only be pushed into the dictionary if they filter out NULL values
		auto &expr_filter = filter.Cast<ExpressionFilter>();
		auto &state = filter_state.Cast<ExpressionFilterState>();
		auto emits_nulls = expr_filter.EvaluateWithConstant(state.executor, Value(reader.Type()));
		return !emits_nulls;
	}
	case TableFilterType::IS_NULL:
	case TableFilterType::DYNAMIC_FILTER:
	case TableFilterType::OPTIONAL_FILTER:
	case TableFilterType::STRUCT_EXTRACT:
	default:
		return false;
	}
}

bool DictionaryDecoder::CanFilter(const TableFilter &filter, TableFilterState &filter_state) {
	if (dictionary_size == 0) {
		return false;
	}
	// We can only push the filter if the filter removes NULL values
	if (!DictionarySupportsFilter(filter, filter_state)) {
		return false;
	}
	return true;
}

void DictionaryDecoder::Filter(uint8_t *defines, const idx_t read_count, Vector &result, SelectionVector &sel,
                               idx_t &approved_tuple_count) {
	if (!dictionary || dictionary_size < 0) {
		throw std::runtime_error("Parquet file is likely corrupted, missing dictionary");
	}
	D_ASSERT(filter_count > 0);
	// read the dictionary values
	const auto valid_count = Read(defines, read_count, result, 0);
	if (valid_count == 0) {
		// all values are NULL
		approved_tuple_count = 0;
		return;
	}

	// apply the filter by checking the dictionary offsets directly
	uint32_t *offsets;
	if (valid_count == read_count) {
		offsets = dictionary_selection_vector.data();
	} else {
		offsets = reinterpret_cast<uint32_t *>(offset_buffer.ptr);
	}
	D_ASSERT(offsets);
	SelectionVector new_sel(valid_count);
	approved_tuple_count = 0;
	for (idx_t idx = 0; idx < valid_count; idx++) {
		auto row_idx = valid_count == read_count ? idx : valid_sel.get_index(idx);
		auto offset = offsets[idx];
		if (!filter_result[offset]) {
			// does not pass the filter
			continue;
		}
		new_sel.set_index(approved_tuple_count++, row_idx);
	}
	if (approved_tuple_count < read_count) {
		sel.Initialize(new_sel);
	}
}

} // namespace duckdb
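For context: the filter pushdown above rests on the observation that a predicate over a dictionary-encoded column only needs to be evaluated once per distinct dictionary entry; each row is then accepted or rejected by a single lookup on its dictionary code. A simplified sketch of that idea over plain std:: containers (illustrative names, not the DuckDB vector machinery):

#include <cstddef>
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// Evaluate the predicate once per distinct dictionary entry.
std::vector<bool> BuildDictionaryFilter(const std::vector<std::string> &dictionary,
                                        const std::function<bool(const std::string &)> &predicate) {
	std::vector<bool> passes(dictionary.size());
	for (size_t i = 0; i < dictionary.size(); i++) {
		passes[i] = predicate(dictionary[i]);
	}
	return passes;
}

// Answer each row with a single lookup on its dictionary code.
std::vector<uint32_t> SelectMatchingRows(const std::vector<uint32_t> &codes,
                                         const std::vector<bool> &passes) {
	std::vector<uint32_t> selection;
	for (size_t row = 0; row < codes.size(); row++) {
		if (passes[codes[row]]) {
			selection.push_back(static_cast<uint32_t>(row));
		}
	}
	return selection;
}

With a dictionary of a few thousand entries and millions of rows, this turns millions of predicate evaluations into thousands plus a cheap per-row table lookup, which is why the decoder filters the dictionary eagerly in InitializeDictionary.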
external/duckdb/extension/parquet/decoder/rle_decoder.cpp (vendored, new file)
@@ -0,0 +1,36 @@
#include "decoder/rle_decoder.hpp"
#include "column_reader.hpp"
#include "parquet_reader.hpp"
#include "reader/templated_column_reader.hpp"

namespace duckdb {

RLEDecoder::RLEDecoder(ColumnReader &reader) : reader(reader), decoded_data_buffer(reader.encoding_buffers[0]) {
}

void RLEDecoder::InitializePage() {
	if (reader.Type().id() != LogicalTypeId::BOOLEAN) {
		throw std::runtime_error("RLE encoding is only supported for boolean data");
	}
	auto &block = reader.block;
	block->inc(sizeof(uint32_t));
	rle_decoder = make_uniq<RleBpDecoder>(block->ptr, block->len, 1);
}

void RLEDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
	// RLE encoding for boolean
	D_ASSERT(reader.Type().id() == LogicalTypeId::BOOLEAN);
	idx_t valid_count = reader.GetValidCount(defines, read_count, result_offset);
	decoded_data_buffer.reset();
	decoded_data_buffer.resize(reader.reader.allocator, sizeof(bool) * valid_count);
	rle_decoder->GetBatch<uint8_t>(decoded_data_buffer.ptr, valid_count);
	reader.PlainTemplated<bool, TemplatedParquetValueConversion<bool>>(decoded_data_buffer, defines, read_count,
	                                                                   result_offset, result);
}

void RLEDecoder::Skip(uint8_t *defines, idx_t skip_count) {
	idx_t valid_count = reader.GetValidCount(defines, skip_count);
	rle_decoder->Skip(valid_count);
}

} // namespace duckdb
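For context: RLE for booleans uses Parquet's RLE/bit-packed hybrid at bit width 1, the same format the dictionary offsets use at wider widths. Each group starts with a ULEB128 header whose low bit selects bit-packed (header >> 1 groups of 8 values, one byte per group at width 1) or a run (header >> 1 repetitions of the next byte). A simplified standalone sketch assuming well-formed input; this is not the vendored RleBpDecoder:

#include <cstddef>
#include <cstdint>
#include <vector>

static uint64_t ReadUleb128(const uint8_t *&p, const uint8_t *end) {
	uint64_t result = 0;
	int shift = 0;
	while (p < end) {
		uint8_t byte = *p++;
		result |= uint64_t(byte & 0x7F) << shift;
		if (!(byte & 0x80)) {
			break;
		}
		shift += 7;
	}
	return result;
}

std::vector<bool> DecodeRleBoolean(const uint8_t *data, size_t len, size_t num_values) {
	std::vector<bool> out;
	const uint8_t *p = data;
	const uint8_t *end = data + len;
	while (out.size() < num_values && p < end) {
		uint64_t header = ReadUleb128(p, end);
		if (header & 1) {
			// bit-packed group: (header >> 1) bytes, 8 values per byte, LSB first
			uint64_t num_bytes = header >> 1;
			for (uint64_t b = 0; b < num_bytes && p < end; b++) {
				uint8_t byte = *p++;
				for (int bit = 0; bit < 8 && out.size() < num_values; bit++) {
					out.push_back((byte >> bit) & 1);
				}
			}
		} else {
			// run: (header >> 1) repetitions of the next value (one byte at width 1)
			uint64_t run_len = header >> 1;
			bool value = p < end && (*p++ & 1);
			for (uint64_t i = 0; i < run_len && out.size() < num_values; i++) {
				out.push_back(value);
			}
		}
	}
	return out;
}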