should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions
--- a/external/duckdb/extension/parquet/include/decode_utils.hpp
+++ b/external/duckdb/extension/parquet/include/decode_utils.hpp
@@ -0,0 +1,221 @@
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// decode_utils.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "duckdb/common/fast_mem.hpp"
+#include "duckdb/common/bitpacking.hpp"
+#include "resizable_buffer.hpp"
+
+namespace duckdb {
+
+class ParquetDecodeUtils {
+	//===--------------------------------------------------------------------===//
+	// Bitpacking
+	//===--------------------------------------------------------------------===//
+private:
+	//! Table indexed by bit width: BitUnpack ANDs entry `width` onto the bits it extracts.
+	//! NOTE(review): presumably entry `w` has the lowest `w` bits set - the values are defined outside this header.
+	static const uint64_t BITPACK_MASKS[];
+	//! Exclusive upper bound for a valid width: CheckWidth rejects any width >= this value.
+	//! NOTE(review): presumably the number of entries in BITPACK_MASKS - confirm against the definition.
+	static const uint64_t BITPACK_MASKS_SIZE;
+	//! Divisor used to turn bit counts into byte counts, and the bit offset past which decoding moves on to the
+	//! next source byte. NOTE(review): presumably 8 (bits per byte) - confirm against the definition.
+	static const uint8_t BITPACK_DLEN;
+
+	//! Validates a bit width before it is used as an index into BITPACK_MASKS.
+	//! Throws InvalidInputException when `width` is not below BITPACK_MASKS_SIZE.
+	static void CheckWidth(const uint8_t width) {
+		if (width < BITPACK_MASKS_SIZE) {
+			// width is a valid index, nothing to report
+			return;
+		}
+		throw InvalidInputException("The width (%d) of the bitpacked data exceeds the supported max width (%d), "
+		                            "the file might be corrupted.",
+		                            width, BITPACK_MASKS_SIZE);
+	}
+
+public:
+	//! Unpacks `count` values of `width` bits each from `src` into `dst`.
+	//! `bitpack_pos` is the bit offset inside the byte `src` currently points at; it is updated so that a following
+	//! call continues where this one stopped. The source pointer is advanced lazily: `bitpack_pos` may be left equal
+	//! to BITPACK_DLEN, in which case `src` still points at the byte that was just fully consumed.
+	//! When decoding starts on a byte boundary, all whole groups of BITPACKING_ALGORITHM_GROUP_SIZE values are
+	//! handed to BitUnpackAlignedInternal and only the remainder is decoded value by value.
+	//! Throws InvalidInputException (from CheckWidth) if `width` is not a valid index into BITPACK_MASKS.
+	template <class T>
+	static void BitUnpack(ByteBuffer &src, bitpacking_width_t &bitpack_pos, T *dst, idx_t count,
+	                      const bitpacking_width_t width) {
+		CheckWidth(width);
+		const auto mask = BITPACK_MASKS[width];
+		// Check once, up front, that every byte touched by the unchecked reads below is present. The consumed bits
+		// start at bitpack_pos inside the current byte, so that offset has to be included and the byte count has to
+		// be rounded up: rounding "count * width" down would let the last value be read from past the verified range.
+		const idx_t required_bits = static_cast<idx_t>(bitpack_pos) + count * width;
+		src.available((required_bits + BITPACK_DLEN - 1) / BITPACK_DLEN);
+		if (bitpack_pos == 0 && count >= BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) {
+			// Byte-aligned start: decode all complete groups with the block-wise routine
+			idx_t remainder = count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE;
+			idx_t aligned_count = count - remainder;
+			BitUnpackAlignedInternal(src, dst, aligned_count, width);
+			dst += aligned_count;
+			count = remainder;
+		}
+		for (idx_t i = 0; i < count; i++) {
+			// Take whatever part of the value lives in the current byte, starting at bitpack_pos
+			auto val = (src.unsafe_get<uint8_t>() >> bitpack_pos) & mask;
+			bitpack_pos += width;
+			// The value continues in the following byte(s): the shift amount is the number of bits of this value
+			// that have already been collected, and it grows by BITPACK_DLEN with every extra byte
+			while (bitpack_pos > BITPACK_DLEN) {
+				src.unsafe_inc(1);
+				val |= (static_cast<T>(src.unsafe_get<uint8_t>())
+				        << static_cast<T>(BITPACK_DLEN - (bitpack_pos - width))) &
+				       mask;
+				bitpack_pos -= BITPACK_DLEN;
+			}
+			dst[i] = val;
+		}
+	}
+
+	//! Moves `src` and `bitpack_pos` past `count` values of `width` bits each without decoding them, using the same
+	//! position bookkeeping as BitUnpack: `bitpack_pos` is the bit offset inside the byte `src` points at and may be
+	//! left equal to BITPACK_DLEN, with `src` still pointing at the fully consumed byte.
+	//! Throws InvalidInputException (from CheckWidth) if `width` is not a valid index into BITPACK_MASKS.
+	static void Skip(ByteBuffer &src, bitpacking_width_t &bitpack_pos, idx_t count, const bitpacking_width_t width) {
+		CheckWidth(width);
+		// Check once that all skipped bits are present: include the starting bit offset and round up to whole
+		// bytes, otherwise the unchecked increment below can move the buffer further than what was verified
+		const idx_t required_bits = static_cast<idx_t>(bitpack_pos) + count * width;
+		src.available((required_bits + BITPACK_DLEN - 1) / BITPACK_DLEN);
+		if (bitpack_pos == 0 && count >= BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) {
+			// Byte-aligned start: skip all complete groups in one step
+			idx_t remainder = count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE;
+			idx_t aligned_count = count - remainder;
+			SkipAligned(src, aligned_count, width);
+			count = remainder;
+		}
+		if (count == 0) {
+			// Nothing left to skip: leave the position exactly as it is
+			return;
+		}
+		// Closed form of "add width, then step to the next byte while the offset exceeds BITPACK_DLEN", repeated
+		// count times: the final offset ends up in (0, BITPACK_DLEN] whenever a byte boundary was crossed
+		const idx_t end_bit = static_cast<idx_t>(bitpack_pos) + count * width;
+		if (end_bit > BITPACK_DLEN) {
+			const idx_t crossed_bytes = (end_bit - 1) / BITPACK_DLEN;
+			src.unsafe_inc(crossed_bytes);
+			bitpack_pos = static_cast<bitpacking_width_t>(end_bit - crossed_bytes * BITPACK_DLEN);
+		} else {
+			bitpack_pos = static_cast<bitpacking_width_t>(end_bit);
+		}
+	}
+
+	//! Packs `count` values from `src` into `dst` with `width` bits per value by forwarding to
+	//! BitpackingPrimitives::PackBuffer. The preconditions - `width` below BITPACK_MASKS_SIZE and `count` a multiple
+	//! of BITPACKING_ALGORITHM_GROUP_SIZE - are only asserted, not checked with an exception.
+	template <class T>
+	static void BitPackAligned(T *src, data_ptr_t dst, const idx_t count, const bitpacking_width_t width) {
+		D_ASSERT(BITPACK_MASKS_SIZE > width);
+		D_ASSERT((count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) == 0);
+		BitpackingPrimitives::PackBuffer<T, true>(dst, src, count, width);
+	}
+
+	//! Unpacks `count` values of `width` bits each from `src` into `dst` and advances `src` past the packed bytes.
+	//! `count` must be a multiple of BITPACKING_ALGORITHM_GROUP_SIZE (asserted only). This function performs no
+	//! bounds check on `src`: it reads and advances with unchecked accessors, so the caller has to verify up front
+	//! that enough bytes are available.
+	//! Throws InvalidInputException on the unaligned path when one packed group would not fit the staging buffer.
+	template <class T>
+	static void BitUnpackAlignedInternal(ByteBuffer &src, T *dst, const idx_t count, const bitpacking_width_t width) {
+		D_ASSERT(count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0);
+		if (cast_pointer_to_uint64(src.ptr) % sizeof(T) == 0) {
+			// Fast path: aligned
+			BitpackingPrimitives::UnPackBuffer<T>(data_ptr_cast(dst), src.ptr, count, width);
+			src.unsafe_inc(count * width / BITPACK_DLEN);
+			return;
+		}
+
+		// Slow path: the source pointer is not aligned for T, so every group is first copied into an aligned
+		// staging buffer and unpacked from there
+		T aligned_data[BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE];
+
+		// Number of packed bytes per group, identical for every iteration
+		const auto next_read = BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE * width / BITPACK_DLEN;
+		if (next_read > sizeof(aligned_data)) {
+			// A width accepted by CheckWidth can still be wider than T. Copying such a group would write past the
+			// end of the staging buffer, so reject it instead of corrupting the stack.
+			throw InvalidInputException("The width (%d) of the bitpacked data is too large for the %d-byte type "
+			                            "it is decoded into, the file might be corrupted.",
+			                            width, static_cast<uint64_t>(sizeof(T)));
+		}
+
+		for (idx_t i = 0; i < count; i += BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) {
+			// Copy over to aligned buffer
+			FastMemcpy(aligned_data, src.ptr, next_read);
+
+			// Unpack
+			BitpackingPrimitives::UnPackBlock<T>(data_ptr_cast(dst), data_ptr_cast(aligned_data), width, true);
+
+			src.unsafe_inc(next_read);
+			dst += BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE;
+		}
+	}
+
+	//! Checked entry point for unpacking whole groups: validates `width` and `count`, verifies once that `src` holds
+	//! the required number of bytes, and then delegates to BitUnpackAlignedInternal.
+	//! Throws InvalidInputException if `width` is rejected by CheckWidth or if `count` is not a multiple of
+	//! BITPACKING_ALGORITHM_GROUP_SIZE.
+	template <class T>
+	static void BitUnpackAligned(ByteBuffer &src, T *dst, const idx_t count, const bitpacking_width_t width) {
+		CheckWidth(width);
+		const bool whole_groups_only = (count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) == 0;
+		if (!whole_groups_only) {
+			throw InvalidInputException("Aligned bitpacking count must be a multiple of %llu",
+			                            BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE);
+		}
+		// A single availability check covers everything the unchecked decoder is going to read
+		src.available(count * width / BITPACK_DLEN);
+		BitUnpackAlignedInternal(src, dst, count, width);
+	}
+
+	//! Advances `src` past `count` packed values of `width` bits each without decoding them.
+	//! Throws InvalidInputException if `width` is rejected by CheckWidth or if `count` is not a multiple of
+	//! BITPACKING_ALGORITHM_GROUP_SIZE.
+	static void SkipAligned(ByteBuffer &src, const idx_t count, const bitpacking_width_t width) {
+		CheckWidth(width);
+		const bool whole_groups_only = (count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) == 0;
+		if (!whole_groups_only) {
+			throw InvalidInputException("Aligned bitpacking count must be a multiple of %llu",
+			                            BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE);
+		}
+		// Number of bytes occupied by the skipped groups
+		src.inc(count * width / BITPACK_DLEN);
+	}
+
+	//===--------------------------------------------------------------------===//
+	// Zigzag
+	//===--------------------------------------------------------------------===//
+private:
+	//! Decodes a zigzag-encoded unsigned value into the signed type of the same size.
+	//! The lowest bit selects the sign, the remaining bits carry the magnitude, see
+	//! https://lemire.me/blog/2022/11/25/making-all-your-integers-positive-with-zigzag-encoding/
+	template <class UNSIGNED>
+	static typename std::enable_if<std::is_unsigned<UNSIGNED>::value, typename std::make_signed<UNSIGNED>::type>::type
+	ZigzagToIntInternal(UNSIGNED x) {
+		// Everything above the lowest bit
+		const auto payload = x >> 1;
+		// All zero bits when the lowest bit is 0, all one bits when it is 1
+		const auto sign_mask = -(x & 1);
+		return payload ^ sign_mask;
+	}
+
+	//! Encodes a signed value as a zigzag value of the unsigned type of the same size: the value shifted left by
+	//! one, XORed with the sign of the value spread over all bits.
+	template <typename SIGNED>
+	static typename std::enable_if<std::is_signed<SIGNED>::value, typename std::make_unsigned<SIGNED>::type>::type
+	IntToZigzagInternal(SIGNED x) {
+		using UNSIGNED = typename std::make_unsigned<SIGNED>::type;
+		// Shifting right by this amount moves the sign bit all the way down
+		constexpr auto sign_shift = sizeof(SIGNED) * 8 - 1;
+		const auto doubled = static_cast<UNSIGNED>(x) << 1;
+		const auto sign_fill = static_cast<UNSIGNED>(x >> sign_shift);
+		return doubled ^ sign_fill;
+	}
+
+public:
+	//! Public zigzag decoder: converts an unsigned zigzag value into the signed type of the same size and asserts
+	//! that encoding the result again yields the input.
+	template <class UNSIGNED>
+	static typename std::enable_if<std::is_unsigned<UNSIGNED>::value, typename std::make_signed<UNSIGNED>::type>::type
+	ZigzagToInt(UNSIGNED x) {
+		const auto decoded = ZigzagToIntInternal(x);
+		D_ASSERT(IntToZigzagInternal(decoded) == x); // roundtrip sanity check
+		return decoded;
+	}
+
+	//! Public zigzag encoder: converts a signed value into the unsigned zigzag value of the same size and asserts
+	//! that decoding the result again yields the input.
+	template <typename SIGNED>
+	static typename std::enable_if<std::is_signed<SIGNED>::value, typename std::make_unsigned<SIGNED>::type>::type
+	IntToZigzag(SIGNED x) {
+		const auto encoded = IntToZigzagInternal(x);
+		D_ASSERT(ZigzagToIntInternal(encoded) == x); // roundtrip sanity check
+		return encoded;
+	}
+
+	//===--------------------------------------------------------------------===//
+	// Varint
+	//===--------------------------------------------------------------------===//
+public:
+	//! Returns the number of bytes VarintEncode emits for `val`: one byte per started group of 7 bits, and at least
+	//! one byte (a value of 0 takes a single byte).
+	//! Signed values are measured by their bit pattern in the unsigned type of the same size.
+	template <class T>
+	static uint8_t GetVarintSize(T val) {
+		// Shift an unsigned copy: right-shifting a negative signed value keeps filling in sign bits, so it would
+		// never become 0 and the loop would not terminate
+		auto remaining = static_cast<typename std::make_unsigned<T>::type>(val);
+		uint8_t res = 0;
+		do {
+			remaining >>= 7;
+			res++;
+		} while (remaining != 0);
+		return res;
+	}
+
+	//! Writes `val` to `ser` as a varint: groups of 7 bits, least significant group first, with the high bit of a
+	//! byte set whenever more bytes follow. At least one byte is always written.
+	//! Signed values are encoded by their bit pattern in the unsigned type of the same size.
+	template <class T>
+	static void VarintEncode(T val, WriteStream &ser) {
+		// Shift an unsigned copy: right-shifting a negative signed value keeps filling in sign bits, so it would
+		// never become 0 and bytes would be written without end
+		auto remaining = static_cast<typename std::make_unsigned<T>::type>(val);
+		do {
+			uint8_t byte = remaining & 127;
+			remaining >>= 7;
+			if (remaining != 0) {
+				// more groups follow: set the continuation bit
+				byte |= 128;
+			}
+			ser.Write<uint8_t>(byte);
+		} while (remaining != 0);
+	}
+
+	//! Reads a varint from `buf` and returns it as T. Each byte contributes its low 7 bits, least significant group
+	//! first; a byte with a cleared high bit ends the number.
+	//! With CHECKED the bytes are fetched with `read`, otherwise with `unsafe_read`.
+	//! Throws std::runtime_error when the continuation bits run past the bit size of T.
+	template <class T, bool CHECKED = true>
+	static T VarintDecode(ByteBuffer &buf) {
+		T decoded = 0;
+		for (uint8_t bit_offset = 0;;) {
+			const uint8_t next = CHECKED ? buf.read<uint8_t>() : buf.unsafe_read<uint8_t>();
+			decoded |= T(next & 127) << bit_offset;
+			const bool more_follows = (next & 128) != 0;
+			if (!more_follows) {
+				return decoded;
+			}
+			bit_offset += 7;
+			if (bit_offset > sizeof(T) * 8) {
+				throw std::runtime_error("Varint-decoding found too large number");
+			}
+		}
+	}
+};
+} // namespace duckdb