should be it
This commit is contained in:
111
external/duckdb/extension/parquet/include/parquet_statistics.hpp
vendored
Normal file
111
external/duckdb/extension/parquet/include/parquet_statistics.hpp
vendored
Normal file
@@ -0,0 +1,111 @@
|
||||
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_statistics.hpp
//
//
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "duckdb.hpp"
|
||||
#include "duckdb/storage/statistics/base_statistics.hpp"
|
||||
#include "parquet_types.h"
|
||||
#include "resizable_buffer.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
using duckdb_parquet::ColumnChunk;
|
||||
using duckdb_parquet::SchemaElement;
|
||||
|
||||
struct LogicalType;
|
||||
struct ParquetColumnSchema;
|
||||
class ResizeableBuffer;
|
||||
|
||||
struct ParquetStatisticsUtils {
|
||||
|
||||
static unique_ptr<BaseStatistics> TransformColumnStatistics(const ParquetColumnSchema &reader,
|
||||
const vector<ColumnChunk> &columns, bool can_have_nan);
|
||||
|
||||
static Value ConvertValue(const LogicalType &type, const ParquetColumnSchema &schema_ele, const std::string &stats);
|
||||
|
||||
static bool BloomFilterSupported(const LogicalTypeId &type_id);
|
||||
|
||||
static bool BloomFilterExcludes(const TableFilter &filter, const duckdb_parquet::ColumnMetaData &column_meta_data,
|
||||
duckdb_apache::thrift::protocol::TProtocol &file_proto, Allocator &allocator);
|
||||
|
||||
static unique_ptr<BaseStatistics> CreateNumericStats(const LogicalType &type, const ParquetColumnSchema &schema_ele,
|
||||
const duckdb_parquet::Statistics &parquet_stats);
|
||||
|
||||
private:
|
||||
static Value ConvertValueInternal(const LogicalType &type, const ParquetColumnSchema &schema_ele,
|
||||
const std::string &stats);
|
||||
};
|
||||
|
||||
class ParquetBloomFilter {
|
||||
static constexpr const idx_t DEFAULT_BLOCK_COUNT = 32; // 4k filter
|
||||
|
||||
public:
|
||||
ParquetBloomFilter(idx_t num_entries, double bloom_filter_false_positive_ratio);
|
||||
ParquetBloomFilter(unique_ptr<ResizeableBuffer> data_p);
|
||||
void FilterInsert(uint64_t x);
|
||||
bool FilterCheck(uint64_t x);
|
||||
void Shrink(idx_t new_block_count);
|
||||
double OneRatio();
|
||||
ResizeableBuffer *Get();
|
||||
|
||||
private:
|
||||
unique_ptr<ResizeableBuffer> data;
|
||||
idx_t block_count;
|
||||
};
|
||||
|
||||
// see https://github.com/apache/parquet-format/blob/master/BloomFilter.md
|
||||
|
||||
struct ParquetBloomBlock {
|
||||
struct ParquetBloomMaskResult {
|
||||
uint8_t bit_set[8] = {0};
|
||||
};
|
||||
|
||||
uint32_t block[8] = {0};
|
||||
|
||||
static bool check_bit(uint32_t &x, const uint8_t i) {
|
||||
D_ASSERT(i < 32);
|
||||
return (x >> i) & (uint32_t)1;
|
||||
}
|
||||
|
||||
static void set_bit(uint32_t &x, const uint8_t i) {
|
||||
D_ASSERT(i < 32);
|
||||
x |= (uint32_t)1 << i;
|
||||
D_ASSERT(check_bit(x, i));
|
||||
}
|
||||
|
||||
static ParquetBloomMaskResult Mask(uint32_t x) {
|
||||
static const uint32_t parquet_bloom_salt[8] = {0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
|
||||
0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};
|
||||
ParquetBloomMaskResult result;
|
||||
for (idx_t i = 0; i < 8; i++) {
|
||||
result.bit_set[i] = (x * parquet_bloom_salt[i]) >> 27;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static void BlockInsert(ParquetBloomBlock &b, uint32_t x) {
|
||||
auto masked = Mask(x);
|
||||
for (idx_t i = 0; i < 8; i++) {
|
||||
set_bit(b.block[i], masked.bit_set[i]);
|
||||
D_ASSERT(check_bit(b.block[i], masked.bit_set[i]));
|
||||
}
|
||||
}
|
||||
|
||||
static bool BlockCheck(ParquetBloomBlock &b, uint32_t x) {
|
||||
auto masked = Mask(x);
|
||||
for (idx_t i = 0; i < 8; i++) {
|
||||
if (!check_bit(b.block[i], masked.bit_set[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
Reference in New Issue
Block a user