//===----------------------------------------------------------------------===// // DuckDB // // decode_utils.hpp // // //===----------------------------------------------------------------------===// #pragma once #include "duckdb/common/fast_mem.hpp" #include "duckdb/common/bitpacking.hpp" #include "resizable_buffer.hpp" namespace duckdb { class ParquetDecodeUtils { //===--------------------------------------------------------------------===// // Bitpacking //===--------------------------------------------------------------------===// private: static const uint64_t BITPACK_MASKS[]; static const uint64_t BITPACK_MASKS_SIZE; static const uint8_t BITPACK_DLEN; static void CheckWidth(const uint8_t width) { if (width >= BITPACK_MASKS_SIZE) { throw InvalidInputException("The width (%d) of the bitpacked data exceeds the supported max width (%d), " "the file might be corrupted.", width, BITPACK_MASKS_SIZE); } } public: template static void BitUnpack(ByteBuffer &src, bitpacking_width_t &bitpack_pos, T *dst, idx_t count, const bitpacking_width_t width) { CheckWidth(width); const auto mask = BITPACK_MASKS[width]; src.available(count * width / BITPACK_DLEN); // check if buffer has enough space available once if (bitpack_pos == 0 && count >= BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) { idx_t remainder = count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE; idx_t aligned_count = count - remainder; BitUnpackAlignedInternal(src, dst, aligned_count, width); dst += aligned_count; count = remainder; } for (idx_t i = 0; i < count; i++) { auto val = (src.unsafe_get() >> bitpack_pos) & mask; bitpack_pos += width; while (bitpack_pos > BITPACK_DLEN) { src.unsafe_inc(1); val |= (static_cast(src.unsafe_get()) << static_cast(BITPACK_DLEN - (bitpack_pos - width))) & mask; bitpack_pos -= BITPACK_DLEN; } dst[i] = val; } } static void Skip(ByteBuffer &src, bitpacking_width_t &bitpack_pos, idx_t count, const bitpacking_width_t width) { CheckWidth(width); src.available(count * width / BITPACK_DLEN); // check if buffer has enough space available once if (bitpack_pos == 0 && count >= BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) { idx_t remainder = count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE; idx_t aligned_count = count - remainder; SkipAligned(src, aligned_count, width); count = remainder; } // FIXME: we should be able to just do this in one go instead of having this loop for (idx_t i = 0; i < count; i++) { bitpack_pos += width; while (bitpack_pos > BITPACK_DLEN) { src.unsafe_inc(1); bitpack_pos -= BITPACK_DLEN; } } } template static void BitPackAligned(T *src, data_ptr_t dst, const idx_t count, const bitpacking_width_t width) { D_ASSERT(width < BITPACK_MASKS_SIZE); D_ASSERT(count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0); BitpackingPrimitives::PackBuffer(dst, src, count, width); } template static void BitUnpackAlignedInternal(ByteBuffer &src, T *dst, const idx_t count, const bitpacking_width_t width) { D_ASSERT(count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0); if (cast_pointer_to_uint64(src.ptr) % sizeof(T) == 0) { // Fast path: aligned BitpackingPrimitives::UnPackBuffer(data_ptr_cast(dst), src.ptr, count, width); src.unsafe_inc(count * width / BITPACK_DLEN); return; } for (idx_t i = 0; i < count; i += BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) { const auto next_read = BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE * width / BITPACK_DLEN; // Buffer for alignment T aligned_data[BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE]; // Copy over to aligned buffer FastMemcpy(aligned_data, src.ptr, next_read); // Unpack BitpackingPrimitives::UnPackBlock(data_ptr_cast(dst), data_ptr_cast(aligned_data), width, true); src.unsafe_inc(next_read); dst += BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE; } } template static void BitUnpackAligned(ByteBuffer &src, T *dst, const idx_t count, const bitpacking_width_t width) { CheckWidth(width); if (count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE != 0) { throw InvalidInputException("Aligned bitpacking count must be a multiple of %llu", BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE); } const auto read_size = count * width / BITPACK_DLEN; src.available(read_size); // check if buffer has enough space available once BitUnpackAlignedInternal(src, dst, count, width); } static void SkipAligned(ByteBuffer &src, const idx_t count, const bitpacking_width_t width) { CheckWidth(width); if (count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE != 0) { throw InvalidInputException("Aligned bitpacking count must be a multiple of %llu", BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE); } const auto read_size = count * width / BITPACK_DLEN; src.inc(read_size); } //===--------------------------------------------------------------------===// // Zigzag //===--------------------------------------------------------------------===// private: //! https://lemire.me/blog/2022/11/25/making-all-your-integers-positive-with-zigzag-encoding/ template static typename std::enable_if::value, typename std::make_signed::type>::type ZigzagToIntInternal(UNSIGNED x) { return (x >> 1) ^ (-(x & 1)); } template static typename std::enable_if::value, typename std::make_unsigned::type>::type IntToZigzagInternal(SIGNED x) { using UNSIGNED = typename std::make_unsigned::type; return (static_cast(x) << 1) ^ static_cast(x >> (sizeof(SIGNED) * 8 - 1)); } public: template static typename std::enable_if::value, typename std::make_signed::type>::type ZigzagToInt(UNSIGNED x) { auto integer = ZigzagToIntInternal(x); D_ASSERT(x == IntToZigzagInternal(integer)); // test roundtrip return integer; } template static typename std::enable_if::value, typename std::make_unsigned::type>::type IntToZigzag(SIGNED x) { auto zigzag = IntToZigzagInternal(x); D_ASSERT(x == ZigzagToIntInternal(zigzag)); // test roundtrip return zigzag; } //===--------------------------------------------------------------------===// // Varint //===--------------------------------------------------------------------===// public: template static uint8_t GetVarintSize(T val) { uint8_t res = 0; do { val >>= 7; res++; } while (val != 0); return res; } template static void VarintEncode(T val, WriteStream &ser) { do { uint8_t byte = val & 127; val >>= 7; if (val != 0) { byte |= 128; } ser.Write(byte); } while (val != 0); } template static T VarintDecode(ByteBuffer &buf) { T result = 0; uint8_t shift = 0; while (true) { uint8_t byte; if (CHECKED) { byte = buf.read(); } else { byte = buf.unsafe_read(); } result |= T(byte & 127) << shift; if ((byte & 128) == 0) { break; } shift += 7; if (shift > sizeof(T) * 8) { throw std::runtime_error("Varint-decoding found too large number"); } } return result; } }; } // namespace duckdb