//===----------------------------------------------------------------------===// // DuckDB // // parquet_dbp_deccoder.hpp // // //===----------------------------------------------------------------------===// #pragma once #include "decode_utils.hpp" namespace duckdb { class DbpDecoder { public: DbpDecoder(const data_ptr_t buffer, const uint32_t buffer_len) : buffer_(buffer, buffer_len), // block_size_in_values(ParquetDecodeUtils::VarintDecode(buffer_)), number_of_miniblocks_per_block(DecodeNumberOfMiniblocksPerBlock(buffer_)), number_of_values_in_a_miniblock(block_size_in_values / number_of_miniblocks_per_block), total_value_count(ParquetDecodeUtils::VarintDecode(buffer_)), previous_value(ParquetDecodeUtils::ZigzagToInt(ParquetDecodeUtils::VarintDecode(buffer_))), // init state to something sane is_first_value(true), read_values(0), min_delta(NumericLimits::Maximum()), miniblock_index(number_of_miniblocks_per_block - 1), list_of_bitwidths_of_miniblocks(nullptr), miniblock_offset(number_of_values_in_a_miniblock), unpacked_data_offset(BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) { if (!(block_size_in_values % number_of_miniblocks_per_block == 0 && number_of_values_in_a_miniblock % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0)) { throw InvalidInputException("Parquet file has invalid block sizes for DELTA_BINARY_PACKED"); } } ByteBuffer BufferPtr() const { return buffer_; } uint64_t TotalValues() const { return total_value_count; } template void GetBatch(const data_ptr_t target_values_ptr, const idx_t batch_size) { if (read_values + batch_size > total_value_count) { throw std::runtime_error("DBP decode did not find enough values"); } read_values += batch_size; GetBatchInternal(target_values_ptr, batch_size); } template void Skip(idx_t skip_count) { if (read_values + skip_count > total_value_count) { throw std::runtime_error("DBP decode did not find enough values"); } read_values += skip_count; GetBatchInternal(nullptr, skip_count); } void Finalize() { if (miniblock_offset == number_of_values_in_a_miniblock) { return; } auto data = make_unsafe_uniq_array(number_of_values_in_a_miniblock); GetBatchInternal(data_ptr_cast(data.get()), number_of_values_in_a_miniblock - miniblock_offset); } private: static idx_t DecodeNumberOfMiniblocksPerBlock(ByteBuffer &buffer) { auto res = ParquetDecodeUtils::VarintDecode(buffer); if (res == 0) { throw InvalidInputException( "Parquet file has invalid number of miniblocks per block for DELTA_BINARY_PACKED"); } return res; } template void GetBatchInternal(const data_ptr_t target_values_ptr, const idx_t batch_size) { if (batch_size == 0) { return; } D_ASSERT(target_values_ptr || SKIP_READ); T *target_values = nullptr; if (!SKIP_READ) { target_values = reinterpret_cast(target_values_ptr); } idx_t target_values_offset = 0; if (is_first_value) { if (!SKIP_READ) { target_values[0] = static_cast(previous_value); } target_values_offset++; is_first_value = false; } while (target_values_offset < batch_size) { // Copy over any remaining data const idx_t next = MinValue(batch_size - target_values_offset, BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE - unpacked_data_offset); if (next != 0) { for (idx_t i = 0; i < next; i++) { const auto &unpacked_value = unpacked_data[unpacked_data_offset + i]; auto current_value = static_cast(static_cast(previous_value) + static_cast(min_delta) + unpacked_value); if (!SKIP_READ) { target_values[target_values_offset + i] = current_value; } previous_value = static_cast(current_value); } target_values_offset += next; unpacked_data_offset += next; continue; } // Move to next miniblock / block D_ASSERT(unpacked_data_offset == BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE); D_ASSERT(miniblock_index < number_of_miniblocks_per_block); D_ASSERT(miniblock_offset <= number_of_values_in_a_miniblock); if (miniblock_offset == number_of_values_in_a_miniblock) { miniblock_offset = 0; if (++miniblock_index == number_of_miniblocks_per_block) { // min_delta = ParquetDecodeUtils::ZigzagToInt(ParquetDecodeUtils::VarintDecode(buffer_)); buffer_.available(number_of_miniblocks_per_block); list_of_bitwidths_of_miniblocks = buffer_.ptr; buffer_.unsafe_inc(number_of_miniblocks_per_block); miniblock_index = 0; } } // Unpack from current miniblock ParquetDecodeUtils::BitUnpackAligned(buffer_, unpacked_data, BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE, list_of_bitwidths_of_miniblocks[miniblock_index]); unpacked_data_offset = 0; miniblock_offset += BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE; } } private: ByteBuffer buffer_; const idx_t block_size_in_values; const idx_t number_of_miniblocks_per_block; const idx_t number_of_values_in_a_miniblock; const idx_t total_value_count; int64_t previous_value; bool is_first_value; idx_t read_values; //! Block stuff int64_t min_delta; idx_t miniblock_index; bitpacking_width_t *list_of_bitwidths_of_miniblocks; idx_t miniblock_offset; uint64_t unpacked_data[BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE]; idx_t unpacked_data_offset; }; } // namespace duckdb