should be it
34  external/duckdb/extension/parquet/include/writer/array_column_writer.hpp  vendored  Normal file
@@ -0,0 +1,34 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/array_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "writer/list_column_writer.hpp"

namespace duckdb {

class ArrayColumnWriter : public ListColumnWriter {
public:
    ArrayColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
                      unique_ptr<ColumnWriter> child_writer_p, bool can_have_nulls)
        : ListColumnWriter(writer, column_schema, std::move(schema_path_p), std::move(child_writer_p), can_have_nulls) {
    }
    ~ArrayColumnWriter() override = default;

public:
    void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) override;
    void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
                 bool vector_can_span_multiple_pages) override;
    void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;

protected:
    void WriteArrayState(ListColumnWriterState &state, idx_t array_size, uint16_t first_repeat_level,
                         idx_t define_value, const bool is_empty = false);
};

} // namespace duckdb

33  external/duckdb/extension/parquet/include/writer/boolean_column_writer.hpp  vendored  Normal file
@@ -0,0 +1,33 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/boolean_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "writer/primitive_column_writer.hpp"

namespace duckdb {

class BooleanColumnWriter : public PrimitiveColumnWriter {
public:
    BooleanColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
                        bool can_have_nulls);
    ~BooleanColumnWriter() override = default;

public:
    unique_ptr<ColumnWriterStatistics> InitializeStatsState() override;

    void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p, ColumnWriterPageState *state_p,
                     Vector &input_column, idx_t chunk_start, idx_t chunk_end) override;

    unique_ptr<ColumnWriterPageState> InitializePageState(PrimitiveColumnWriterState &state, idx_t page_idx) override;
    void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) override;

    idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state) const override;
};

} // namespace duckdb

30  external/duckdb/extension/parquet/include/writer/decimal_column_writer.hpp  vendored  Normal file
@@ -0,0 +1,30 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/decimal_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "writer/primitive_column_writer.hpp"

namespace duckdb {

class FixedDecimalColumnWriter : public PrimitiveColumnWriter {
public:
    FixedDecimalColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
                             vector<string> schema_path_p, bool can_have_nulls);
    ~FixedDecimalColumnWriter() override = default;

public:
    unique_ptr<ColumnWriterStatistics> InitializeStatsState() override;

    void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p, ColumnWriterPageState *page_state,
                     Vector &input_column, idx_t chunk_start, idx_t chunk_end) override;

    idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state) const override;
};

} // namespace duckdb

50  external/duckdb/extension/parquet/include/writer/enum_column_writer.hpp  vendored  Normal file
@@ -0,0 +1,50 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/enum_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "writer/primitive_column_writer.hpp"

namespace duckdb {
class EnumWriterPageState;

class EnumColumnWriter : public PrimitiveColumnWriter {
public:
    EnumColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
                     bool can_have_nulls);
    ~EnumColumnWriter() override = default;

    uint32_t bit_width;

public:
    unique_ptr<ColumnWriterStatistics> InitializeStatsState() override;

    void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p, ColumnWriterPageState *page_state_p,
                     Vector &input_column, idx_t chunk_start, idx_t chunk_end) override;

    unique_ptr<ColumnWriterPageState> InitializePageState(PrimitiveColumnWriterState &state, idx_t page_idx) override;

    void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) override;

    duckdb_parquet::Encoding::type GetEncoding(PrimitiveColumnWriterState &state) override;

    bool HasDictionary(PrimitiveColumnWriterState &state) override;

    idx_t DictionarySize(PrimitiveColumnWriterState &state_p) override;

    void FlushDictionary(PrimitiveColumnWriterState &state, ColumnWriterStatistics *stats_p) override;

    idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state) const override;

private:
    template <class T>
    void WriteEnumInternal(WriteStream &temp_writer, Vector &input_column, idx_t chunk_start, idx_t chunk_end,
                           EnumWriterPageState &page_state);
};

} // namespace duckdb

52  external/duckdb/extension/parquet/include/writer/list_column_writer.hpp  vendored  Normal file
@@ -0,0 +1,52 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/list_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "column_writer.hpp"

namespace duckdb {

class ListColumnWriterState : public ColumnWriterState {
public:
    ListColumnWriterState(duckdb_parquet::RowGroup &row_group, idx_t col_idx) : row_group(row_group), col_idx(col_idx) {
    }
    ~ListColumnWriterState() override = default;

    duckdb_parquet::RowGroup &row_group;
    idx_t col_idx;
    unique_ptr<ColumnWriterState> child_state;
    idx_t parent_index = 0;
};

class ListColumnWriter : public ColumnWriter {
public:
    ListColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
                     unique_ptr<ColumnWriter> child_writer_p, bool can_have_nulls)
        : ColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
        child_writers.push_back(std::move(child_writer_p));
    }
    ~ListColumnWriter() override = default;

public:
    unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override;
    bool HasAnalyze() override;
    void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) override;
    void FinalizeAnalyze(ColumnWriterState &state) override;
    void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
                 bool vector_can_span_multiple_pages) override;

    void BeginWrite(ColumnWriterState &state) override;
    void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
    void FinalizeWrite(ColumnWriterState &state) override;

protected:
    ColumnWriter &GetChildWriter();
};

} // namespace duckdb
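
For context on what ListColumnWriter has to produce: Parquet encodes nesting with Dremel-style repetition and definition levels rather than explicit offsets. The repetition level says at which nesting depth a value continues the previous row; the definition level says how much of the optional/repeated path is actually present. A minimal standalone sketch (not DuckDB code), assuming the standard three-level list encoding with an optional list of optional int32 elements:

    #include <cstdint>
    #include <cstdio>
    #include <optional>
    #include <vector>

    // def 0 = null list, 1 = empty list, 2 = null element, 3 = present element.
    // rep 0 = first value of a row, rep 1 = continuation within the same list.
    int main() {
        using Elem = std::optional<int32_t>;
        using Row = std::optional<std::vector<Elem>>;
        std::vector<Row> rows;
        rows.push_back(std::vector<Elem> {1, 2});
        rows.push_back(std::vector<Elem> {});
        rows.push_back(std::vector<Elem> {3});
        for (auto &row : rows) {
            if (!row) {
                std::printf("rep=0 def=0 (null list)\n");
                continue;
            }
            if (row->empty()) {
                std::printf("rep=0 def=1 (empty list)\n");
                continue;
            }
            int rep = 0;
            for (auto &elem : *row) {
                std::printf("rep=%d def=%d\n", rep, elem ? 3 : 2);
                rep = 1; // subsequent elements repeat within the same row
            }
        }
        return 0;
    }

For the rows [1, 2], [] and [3] this prints the (rep, def) pairs (0,3) (1,3) (0,1) (0,3) — the kind of level streams the Prepare/Write overrides above are responsible for producing for the child writer.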
326  external/duckdb/extension/parquet/include/writer/parquet_write_operators.hpp  vendored  Normal file
@@ -0,0 +1,326 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/parquet_write_operators.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "writer/parquet_write_stats.hpp"
#include "zstd/common/xxhash.hpp"
#include "duckdb/common/types/uhugeint.hpp"
#include "duckdb/common/types/uuid.hpp"

namespace duckdb {

struct BaseParquetOperator {
    template <class SRC, class TGT>
    static void WriteToStream(const TGT &input, WriteStream &ser) {
        ser.WriteData(const_data_ptr_cast(&input), sizeof(TGT));
    }

    template <class SRC, class TGT>
    static constexpr idx_t WriteSize(const TGT &input) {
        return sizeof(TGT);
    }

    template <class SRC, class TGT>
    static uint64_t XXHash64(const TGT &target_value) {
        return duckdb_zstd::XXH64(&target_value, sizeof(target_value), 0);
    }

    template <class SRC, class TGT>
    static unique_ptr<ColumnWriterStatistics> InitializeStats() {
        return nullptr;
    }

    template <class SRC, class TGT>
    static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
    }

    template <class SRC, class TGT>
    static idx_t GetRowSize(const Vector &, idx_t) {
        return sizeof(TGT);
    }
};

struct ParquetCastOperator : public BaseParquetOperator {
    template <class SRC, class TGT>
    static TGT Operation(SRC input) {
        return TGT(input);
    }
    template <class SRC, class TGT>
    static unique_ptr<ColumnWriterStatistics> InitializeStats() {
        return make_uniq<NumericStatisticsState<SRC, TGT, BaseParquetOperator>>();
    }

    template <class SRC, class TGT>
    static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
        auto &numeric_stats = stats->Cast<NumericStatisticsState<SRC, TGT, BaseParquetOperator>>();
        if (LessThan::Operation(target_value, numeric_stats.min)) {
            numeric_stats.min = target_value;
        }
        if (GreaterThan::Operation(target_value, numeric_stats.max)) {
            numeric_stats.max = target_value;
        }
    }
};

struct FloatingPointOperator : public BaseParquetOperator {
    template <class SRC, class TGT>
    static TGT Operation(SRC input) {
        return TGT(input);
    }

    template <class SRC, class TGT>
    static unique_ptr<ColumnWriterStatistics> InitializeStats() {
        return make_uniq<FloatingPointStatisticsState<SRC, TGT, BaseParquetOperator>>();
    }

    template <class SRC, class TGT>
    static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
        auto &numeric_stats = stats->Cast<FloatingPointStatisticsState<SRC, TGT, BaseParquetOperator>>();
        if (Value::IsNan(target_value)) {
            numeric_stats.has_nan = true;
        } else {
            if (LessThan::Operation(target_value, numeric_stats.min)) {
                numeric_stats.min = target_value;
            }
            if (GreaterThan::Operation(target_value, numeric_stats.max)) {
                numeric_stats.max = target_value;
            }
        }
    }
};

struct ParquetTimestampNSOperator : public ParquetCastOperator {
    template <class SRC, class TGT>
    static TGT Operation(SRC input) {
        return TGT(input);
    }
};

struct ParquetTimestampSOperator : public ParquetCastOperator {
    template <class SRC, class TGT>
    static TGT Operation(SRC input) {
        return Timestamp::FromEpochSecondsPossiblyInfinite(input).value;
    }
};

// We will need a different operator for GEOGRAPHY later, so we define a base geo operator
struct ParquetBaseGeoOperator : public BaseParquetOperator {
    template <class SRC, class TGT>
    static TGT Operation(SRC input) {
        return input;
    }

    template <class SRC, class TGT>
    static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
        auto &geo_stats = stats->Cast<GeoStatisticsState>();
        geo_stats.Update(target_value);
    }

    template <class SRC, class TGT>
    static void WriteToStream(const TGT &target_value, WriteStream &ser) {
        ser.Write<uint32_t>(target_value.GetSize());
        ser.WriteData(const_data_ptr_cast(target_value.GetData()), target_value.GetSize());
    }

    template <class SRC, class TGT>
    static idx_t WriteSize(const TGT &target_value) {
        return sizeof(uint32_t) + target_value.GetSize();
    }

    template <class SRC, class TGT>
    static uint64_t XXHash64(const TGT &target_value) {
        return duckdb_zstd::XXH64(target_value.GetData(), target_value.GetSize(), 0);
    }

    template <class SRC, class TGT>
    static idx_t GetRowSize(const Vector &vector, idx_t index) {
        // This needs to add the 4 bytes (just like WriteSize) otherwise we underestimate and we have to realloc
        // This seriously harms performance, mostly by making it very inconsistent (see internal issue #4990)
        return sizeof(uint32_t) + FlatVector::GetData<string_t>(vector)[index].GetSize();
    }
};

struct ParquetGeometryOperator : public ParquetBaseGeoOperator {
    template <class SRC, class TGT>
    static unique_ptr<ColumnWriterStatistics> InitializeStats() {
        return make_uniq<GeoStatisticsState>();
    }
};

struct ParquetBaseStringOperator : public BaseParquetOperator {
    template <class SRC, class TGT>
    static TGT Operation(SRC input) {
        return input;
    }

    template <class SRC, class TGT>
    static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
        auto &string_stats = stats->Cast<StringStatisticsState>();
        string_stats.Update(target_value);
    }

    template <class SRC, class TGT>
    static void WriteToStream(const TGT &target_value, WriteStream &ser) {
        ser.Write<uint32_t>(target_value.GetSize());
        ser.WriteData(const_data_ptr_cast(target_value.GetData()), target_value.GetSize());
    }

    template <class SRC, class TGT>
    static idx_t WriteSize(const TGT &target_value) {
        return sizeof(uint32_t) + target_value.GetSize();
    }

    template <class SRC, class TGT>
    static uint64_t XXHash64(const TGT &target_value) {
        return duckdb_zstd::XXH64(target_value.GetData(), target_value.GetSize(), 0);
    }

    template <class SRC, class TGT>
    static idx_t GetRowSize(const Vector &vector, idx_t index) {
        // This needs to add the 4 bytes (just like WriteSize) otherwise we underestimate and we have to realloc
        // This seriously harms performance, mostly by making it very inconsistent (see internal issue #4990)
        return sizeof(uint32_t) + FlatVector::GetData<string_t>(vector)[index].GetSize();
    }
};
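
A note on the WriteToStream/WriteSize pair above: it implements Parquet's PLAIN encoding for BYTE_ARRAY values, a 4-byte little-endian length prefix followed by the raw bytes. A standalone sketch of that framing (plain C++, not DuckDB's WriteStream API; assumes a little-endian host, matching the byte order Parquet stores):

    #include <cstdint>
    #include <cstring>
    #include <string>
    #include <vector>

    // Append one BYTE_ARRAY value in Parquet PLAIN encoding: uint32 length + bytes.
    static void AppendPlainByteArray(std::vector<uint8_t> &out, const std::string &value) {
        uint32_t len = static_cast<uint32_t>(value.size());
        uint8_t len_bytes[sizeof(uint32_t)];
        std::memcpy(len_bytes, &len, sizeof(len)); // little-endian host assumed
        out.insert(out.end(), len_bytes, len_bytes + sizeof(len_bytes));
        out.insert(out.end(), value.begin(), value.end());
    }

    int main() {
        std::vector<uint8_t> page;
        AppendPlainByteArray(page, "hello");
        AppendPlainByteArray(page, "");
        // page now holds: 05 00 00 00 'h' 'e' 'l' 'l' 'o' | 00 00 00 00
        return page.size() == 13 ? 0 : 1;
    }

This is also why GetRowSize adds sizeof(uint32_t): the length prefix is part of every value's on-page footprint, and leaving it out makes the page-size estimate drift low.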

struct ParquetBlobOperator : public ParquetBaseStringOperator {
    template <class SRC, class TGT>
    static unique_ptr<ColumnWriterStatistics> InitializeStats() {
        return make_uniq<StringStatisticsState>(LogicalTypeId::BLOB);
    }
};

struct ParquetStringOperator : public ParquetBaseStringOperator {
    template <class SRC, class TGT>
    static unique_ptr<ColumnWriterStatistics> InitializeStats() {
        return make_uniq<StringStatisticsState>();
    }
};

struct ParquetIntervalTargetType {
    static constexpr const idx_t PARQUET_INTERVAL_SIZE = 12;
    data_t bytes[PARQUET_INTERVAL_SIZE];
};

struct ParquetIntervalOperator : public BaseParquetOperator {
    template <class SRC, class TGT>
    static TGT Operation(SRC input) {
        if (input.days < 0 || input.months < 0 || input.micros < 0) {
            throw IOException("Parquet files do not support negative intervals");
        }
        TGT result;
        Store<uint32_t>(input.months, result.bytes);
        Store<uint32_t>(input.days, result.bytes + sizeof(uint32_t));
        Store<uint32_t>(input.micros / 1000, result.bytes + sizeof(uint32_t) * 2);
        return result;
    }

    template <class SRC, class TGT>
    static void WriteToStream(const TGT &target_value, WriteStream &ser) {
        ser.WriteData(target_value.bytes, ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE);
    }

    template <class SRC, class TGT>
    static constexpr idx_t WriteSize(const TGT &target_value) {
        return ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE;
    }

    template <class SRC, class TGT>
    static uint64_t XXHash64(const TGT &target_value) {
        return duckdb_zstd::XXH64(target_value.bytes, ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE, 0);
    }
};
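
Worth spelling out: Parquet's INTERVAL logical type is a fixed 12-byte value of three little-endian uint32 fields — months | days | milliseconds — which is why Operation above divides micros by 1000 (sub-millisecond precision is dropped). A standalone sketch of the packing, with an illustrative Interval struct standing in for DuckDB's interval_t:

    #include <cstdint>
    #include <cstring>

    struct Interval { // illustrative stand-in, not duckdb's interval_t
        int32_t months;
        int32_t days;
        int64_t micros;
    };

    // Pack into Parquet's 12-byte INTERVAL: uint32 months | uint32 days | uint32 millis.
    static void PackParquetInterval(const Interval &in, uint8_t out[12]) {
        const uint32_t months = static_cast<uint32_t>(in.months);
        const uint32_t days = static_cast<uint32_t>(in.days);
        const uint32_t millis = static_cast<uint32_t>(in.micros / 1000); // micros -> millis
        std::memcpy(out, &months, 4); // little-endian host assumed
        std::memcpy(out + 4, &days, 4);
        std::memcpy(out + 8, &millis, 4);
    }

    int main() {
        const Interval iv {1, 2, 3000000}; // 1 month, 2 days, 3 seconds
        uint8_t buf[12];
        PackParquetInterval(iv, buf);
        return (buf[0] == 1 && buf[4] == 2 && buf[8] == 0xB8) ? 0 : 1; // 3000 ms = 0x0BB8
    }

The unsigned fields also explain the negative-interval check: this layout simply cannot represent a negative month, day, or millisecond count.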

struct ParquetUUIDTargetType {
    static constexpr const idx_t PARQUET_UUID_SIZE = 16;
    data_t bytes[PARQUET_UUID_SIZE];
};

struct ParquetUUIDOperator : public BaseParquetOperator {
    template <class SRC, class TGT>
    static TGT Operation(SRC input) {
        TGT result;
        // Use the utility function from BaseUUID
        BaseUUID::ToBlob(input, result.bytes);
        return result;
    }

    template <class SRC, class TGT>
    static void WriteToStream(const TGT &target_value, WriteStream &ser) {
        ser.WriteData(target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE);
    }

    template <class SRC, class TGT>
    static constexpr idx_t WriteSize(const TGT &target_value) {
        return ParquetUUIDTargetType::PARQUET_UUID_SIZE;
    }

    template <class SRC, class TGT>
    static uint64_t XXHash64(const TGT &target_value) {
        return duckdb_zstd::XXH64(target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE, 0);
    }

    template <class SRC, class TGT>
    static unique_ptr<ColumnWriterStatistics> InitializeStats() {
        return make_uniq<UUIDStatisticsState>();
    }

    template <class SRC, class TGT>
    static void HandleStats(ColumnWriterStatistics *stats_p, TGT target_value) {
        auto &stats = stats_p->Cast<UUIDStatisticsState>();
        if (!stats.has_stats || memcmp(target_value.bytes, stats.min, ParquetUUIDTargetType::PARQUET_UUID_SIZE) < 0) {
            memcpy(stats.min, target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE);
        }
        if (!stats.has_stats || memcmp(target_value.bytes, stats.max, ParquetUUIDTargetType::PARQUET_UUID_SIZE) > 0) {
            memcpy(stats.max, target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE);
        }
        stats.has_stats = true;
    }
};
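
A note on why plain memcmp suffices for the UUID min/max in HandleStats: the 16-byte blob is laid out most-significant byte first, and with that layout bytewise comparison coincides with value order (the exact details of BaseUUID::ToBlob are not shown here; this is the general property being relied on). A standalone illustration, with a hand-rolled big-endian serializer that is not DuckDB's ToBlob:

    #include <cstdint>
    #include <cstring>

    // Serialize a 128-bit value (hi, lo) as 16 big-endian bytes.
    static void ToBigEndian16(uint64_t hi, uint64_t lo, uint8_t out[16]) {
        for (int i = 0; i < 8; i++) {
            out[i] = static_cast<uint8_t>(hi >> (56 - 8 * i));
            out[8 + i] = static_cast<uint8_t>(lo >> (56 - 8 * i));
        }
    }

    int main() {
        uint8_t a[16], b[16];
        ToBigEndian16(0, 5, a); // the value 5
        ToBigEndian16(1, 0, b); // the value 2^64
        // Most-significant-byte-first layout makes memcmp agree with numeric order.
        return std::memcmp(a, b, 16) < 0 ? 0 : 1; // 5 < 2^64, so memcmp < 0
    }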

struct ParquetTimeTZOperator : public BaseParquetOperator {
    template <class SRC, class TGT>
    static TGT Operation(SRC input) {
        return input.time().micros;
    }
};

struct ParquetHugeintOperator : public BaseParquetOperator {
    template <class SRC, class TGT>
    static TGT Operation(SRC input) {
        return Hugeint::Cast<double>(input);
    }

    template <class SRC, class TGT>
    static unique_ptr<ColumnWriterStatistics> InitializeStats() {
        return make_uniq<ColumnWriterStatistics>();
    }

    template <class SRC, class TGT>
    static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
    }
};

struct ParquetUhugeintOperator : public BaseParquetOperator {
    template <class SRC, class TGT>
    static TGT Operation(SRC input) {
        return Uhugeint::Cast<double>(input);
    }

    template <class SRC, class TGT>
    static unique_ptr<ColumnWriterStatistics> InitializeStats() {
        return make_uniq<ColumnWriterStatistics>();
    }

    template <class SRC, class TGT>
    static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
    }
};

} // namespace duckdb

305  external/duckdb/extension/parquet/include/writer/parquet_write_stats.hpp  vendored  Normal file
@@ -0,0 +1,305 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/parquet_write_stats.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "column_writer.hpp"
#include "geo_parquet.hpp"

namespace duckdb {

class ColumnWriterStatistics {
public:
    virtual ~ColumnWriterStatistics();

    virtual bool HasStats();
    virtual string GetMin();
    virtual string GetMax();
    virtual string GetMinValue();
    virtual string GetMaxValue();
    virtual bool CanHaveNaN();
    virtual bool HasNaN();
    virtual bool MinIsExact();
    virtual bool MaxIsExact();

    virtual bool HasGeoStats();
    virtual optional_ptr<GeometryStatsData> GetGeoStats();
    virtual void WriteGeoStats(duckdb_parquet::GeospatialStatistics &stats);

public:
    template <class TARGET>
    TARGET &Cast() {
        DynamicCastCheck<TARGET>(this);
        return reinterpret_cast<TARGET &>(*this);
    }
    template <class TARGET>
    const TARGET &Cast() const {
        D_ASSERT(dynamic_cast<const TARGET *>(this));
        return reinterpret_cast<const TARGET &>(*this);
    }
};

template <class SRC, class T, class OP>
class NumericStatisticsState : public ColumnWriterStatistics {
public:
    NumericStatisticsState() : min(NumericLimits<T>::Maximum()), max(NumericLimits<T>::Minimum()) {
    }

    T min;
    T max;

public:
    bool HasStats() override {
        return min <= max;
    }

    string GetMin() override {
        return NumericLimits<SRC>::IsSigned() ? GetMinValue() : string();
    }
    string GetMax() override {
        return NumericLimits<SRC>::IsSigned() ? GetMaxValue() : string();
    }
    string GetMinValue() override {
        return HasStats() ? string(char_ptr_cast(&min), sizeof(T)) : string();
    }
    string GetMaxValue() override {
        return HasStats() ? string(char_ptr_cast(&max), sizeof(T)) : string();
    }
};
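
Note the sentinel initialization above: min starts at the type's maximum and max at the type's minimum, so min <= max is false until the first value arrives, letting HasStats() double as a "seen anything yet?" flag with no extra boolean. A standalone sketch of the same trick:

    #include <cstdint>
    #include <limits>

    struct MinMaxTracker {
        int64_t min = std::numeric_limits<int64_t>::max(); // sentinel: nothing seen yet
        int64_t max = std::numeric_limits<int64_t>::min();

        void Update(int64_t v) {
            if (v < min) { min = v; }
            if (v > max) { max = v; }
        }
        // With the sentinels above, min > max holds exactly until the first Update.
        bool HasStats() const { return min <= max; }
    };

    int main() {
        MinMaxTracker t;
        const bool before = t.HasStats(); // false
        t.Update(42);
        return (!before && t.HasStats() && t.min == 42 && t.max == 42) ? 0 : 1;
    }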

template <class SRC, class T, class OP>
class FloatingPointStatisticsState : public NumericStatisticsState<SRC, T, OP> {
public:
    bool has_nan = false;

public:
    bool CanHaveNaN() override {
        return true;
    }
    bool HasNaN() override {
        return has_nan;
    }
};

class StringStatisticsState : public ColumnWriterStatistics {
    static constexpr const idx_t MAX_STRING_STATISTICS_SIZE = 256;

public:
    explicit StringStatisticsState(LogicalTypeId type = LogicalTypeId::VARCHAR)
        : type(type), has_stats(false), min_truncated(false), max_truncated(false), min(), max() {
    }

    LogicalTypeId type;
    bool has_stats;
    bool min_truncated;
    bool max_truncated;
    bool failed_truncate = false;
    string min;
    string max;

public:
    bool HasStats() override {
        return has_stats;
    }

    void Update(const string_t &val) {
        if (failed_truncate) {
            return;
        }
        if (!has_stats || LessThan::Operation(val, string_t(min))) {
            if (val.GetSize() > MAX_STRING_STATISTICS_SIZE) {
                // string value exceeds our max string stats size - truncate
                min = TruncateMin(val, MAX_STRING_STATISTICS_SIZE);
                min_truncated = true;
            } else {
                min = val.GetString();
                min_truncated = false;
            }
        }
        if (!has_stats || GreaterThan::Operation(val, string_t(max))) {
            if (val.GetSize() > MAX_STRING_STATISTICS_SIZE) {
                // string value exceeds our max string stats size - truncate
                if (!TryTruncateMax(val, MAX_STRING_STATISTICS_SIZE, max)) {
                    // we failed to truncate - this can happen in some edge cases
                    // skip stats for this column
                    failed_truncate = true;
                    has_stats = false;
                    min = string();
                    max = string();
                    return;
                }
                max_truncated = true;
            } else {
                max = val.GetString();
                max_truncated = false;
            }
        }
        has_stats = true;
    }

    static inline bool IsCharacter(char c) {
        return (c & 0xc0) != 0x80;
    }

    string TruncateMin(string_t str, idx_t max_size) {
        // truncate a string for the min value
        // since 'AAA' < 'AAAA', we can just truncate the string
        D_ASSERT(str.GetSize() > max_size);
        if (type == LogicalTypeId::BLOB) {
            // for blobs - just truncate directly
            return string(str.GetData(), max_size);
        }
        D_ASSERT(type == LogicalTypeId::VARCHAR);
        // for varchar the result must remain valid UTF8 - so we truncate at the last complete character boundary
        auto str_data = str.GetData();
        for (; max_size > 0; max_size--) {
            if (IsCharacter(str_data[max_size])) {
                break;
            }
        }
        return string(str_data, max_size);
    }

    bool TryTruncateMax(string_t str, idx_t max_size, string &result, data_t max_byte) {
        auto data = const_data_ptr_cast(str.GetData());

        // find the last position in the string which we can increment for the truncation
        // if ALL characters are above the max byte we cannot truncate
        idx_t increment_pos;
        for (increment_pos = max_size; increment_pos > 0; increment_pos--) {
            idx_t str_idx = increment_pos - 1;
            if (data[str_idx] < max_byte) {
                // found the increment position
                break;
            }
        }
        if (increment_pos == 0) {
            // all characters are above the max byte - we cannot truncate - return false
            return false;
        }
        // set up the result string - we don't care about anything after the increment pos
        result = string(str.GetData(), increment_pos);
        // actually increment
        result[increment_pos - 1]++;
        return true;
    }

    bool TryTruncateMax(string_t str, idx_t max_size, string &result) {
        // truncate a string for the max value
        // since 'XXX' < 'XXXX', we need to "increment" a byte to get a correct max value
        // i.e. we need to generate 'XXY' as a string
        // note that this is not necessarily always possible
        D_ASSERT(str.GetSize() > max_size);
        if (type == LogicalTypeId::BLOB) {
            // for blobs we can always increment bytes - we just can't increment past the max of a single byte (2^8)
            return TryTruncateMax(str, max_size, result, static_cast<data_t>(0xFF));
        }
        D_ASSERT(type == LogicalTypeId::VARCHAR);
        // for varchar the situation is more complex - we need to truncate to a valid UTF8 string and increment
        // for now we only increment ASCII characters (characters below 0x7F)
        return TryTruncateMax(str, max_size, result, static_cast<data_t>(0x7F));
    }

    string GetMin() override {
        return GetMinValue();
    }
    string GetMax() override {
        return GetMaxValue();
    }
    string GetMinValue() override {
        return HasStats() ? min : string();
    }
    string GetMaxValue() override {
        return HasStats() ? max : string();
    }

    bool MinIsExact() override {
        return !min_truncated;
    }

    bool MaxIsExact() override {
        return !max_truncated;
    }
};
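
The core idea behind the truncation pair above: a plain prefix is always a valid lower bound ('AAA' <= 'AAAA'), but a valid upper bound must increment the last kept byte ('XXX' < 'XXXX', so the bound becomes 'XXY'), and that increment can fail when every kept byte is already at the cap. A standalone sketch of both operations on std::string with byte semantics only (it skips the UTF8 boundary handling and the 0x7F cap the class applies for VARCHAR):

    #include <optional>
    #include <string>

    // Lower bound: a plain prefix already sorts <= the original string.
    static std::string TruncateMin(const std::string &s, size_t n) {
        return s.substr(0, n);
    }

    // Upper bound: truncate, then increment the last byte below the cap.
    // Returns nullopt if every kept byte is already at the cap.
    static std::optional<std::string> TruncateMax(const std::string &s, size_t n,
                                                  unsigned char cap = 0xFF) {
        std::string r = s.substr(0, n);
        for (size_t i = r.size(); i > 0; i--) {
            if (static_cast<unsigned char>(r[i - 1]) < cap) {
                r.resize(i);
                r[i - 1]++; // "XXX..." becomes "XXY", which is > any "XXX<suffix>"
                return r;
            }
        }
        return std::nullopt;
    }

    int main() {
        const auto max_bound = TruncateMax("XXXZ", 3); // -> "XXY"
        return (TruncateMin("AAAA", 3) == "AAA" && max_bound && *max_bound == "XXY") ? 0 : 1;
    }

The failed_truncate path in Update handles exactly the nullopt case: rather than store an incorrect bound, the column drops its string statistics altogether.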

class UUIDStatisticsState : public ColumnWriterStatistics {
public:
    bool has_stats = false;
    data_t min[16] = {0};
    data_t max[16] = {0};

public:
    bool HasStats() override {
        return has_stats;
    }

    string GetMin() override {
        return GetMinValue();
    }
    string GetMax() override {
        return GetMaxValue();
    }
    string GetMinValue() override {
        return HasStats() ? string(char_ptr_cast(min), 16) : string();
    }
    string GetMaxValue() override {
        return HasStats() ? string(char_ptr_cast(max), 16) : string();
    }
};

class GeoStatisticsState final : public ColumnWriterStatistics {
public:
    explicit GeoStatisticsState() : has_stats(false) {
        geo_stats.SetEmpty();
    }

    bool has_stats;
    GeometryStatsData geo_stats;

public:
    void Update(const string_t &val) {
        geo_stats.Update(val);
        has_stats = true;
    }
    bool HasGeoStats() override {
        return has_stats;
    }
    optional_ptr<GeometryStatsData> GetGeoStats() override {
        return geo_stats;
    }
    void WriteGeoStats(duckdb_parquet::GeospatialStatistics &stats) override {
        const auto &types = geo_stats.types;
        const auto &bbox = geo_stats.extent;

        if (bbox.HasXY()) {
            stats.__isset.bbox = true;
            stats.bbox.xmin = bbox.x_min;
            stats.bbox.xmax = bbox.x_max;
            stats.bbox.ymin = bbox.y_min;
            stats.bbox.ymax = bbox.y_max;

            if (bbox.HasZ()) {
                stats.bbox.__isset.zmin = true;
                stats.bbox.__isset.zmax = true;
                stats.bbox.zmin = bbox.z_min;
                stats.bbox.zmax = bbox.z_max;
            }
            if (bbox.HasM()) {
                stats.bbox.__isset.mmin = true;
                stats.bbox.__isset.mmax = true;
                stats.bbox.mmin = bbox.m_min;
                stats.bbox.mmax = bbox.m_max;
            }
        }

        stats.__isset.geospatial_types = true;
        stats.geospatial_types = types.ToWKBList();
    }
};

} // namespace duckdb

115  external/duckdb/extension/parquet/include/writer/primitive_column_writer.hpp  vendored  Normal file
@@ -0,0 +1,115 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/primitive_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "column_writer.hpp"
#include "writer/parquet_write_stats.hpp"
#include "duckdb/common/serializer/memory_stream.hpp"
#include "parquet_statistics.hpp"

namespace duckdb {

struct PageInformation {
    idx_t offset = 0;
    idx_t row_count = 0;
    idx_t empty_count = 0;
    idx_t estimated_page_size = 0;
    idx_t null_count = 0;
};

struct PageWriteInformation {
    duckdb_parquet::PageHeader page_header;
    unique_ptr<MemoryStream> temp_writer;
    unique_ptr<ColumnWriterPageState> page_state;
    idx_t write_page_idx = 0;
    idx_t write_count = 0;
    idx_t max_write_count = 0;
    size_t compressed_size;
    data_ptr_t compressed_data;
    AllocatedData compressed_buf;
};

class PrimitiveColumnWriterState : public ColumnWriterState {
public:
    PrimitiveColumnWriterState(ParquetWriter &writer_p, duckdb_parquet::RowGroup &row_group, idx_t col_idx)
        : writer(writer_p), row_group(row_group), col_idx(col_idx) {
        page_info.emplace_back();
    }
    ~PrimitiveColumnWriterState() override = default;

    ParquetWriter &writer;
    duckdb_parquet::RowGroup &row_group;
    idx_t col_idx;
    vector<PageInformation> page_info;
    vector<PageWriteInformation> write_info;
    unique_ptr<ColumnWriterStatistics> stats_state;
    idx_t current_page = 0;

    unique_ptr<ParquetBloomFilter> bloom_filter;
};

//! Base class for writing non-compound types (e.g. numerics, strings)
class PrimitiveColumnWriter : public ColumnWriter {
public:
    PrimitiveColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path,
                          bool can_have_nulls);
    ~PrimitiveColumnWriter() override = default;

    //! We limit the uncompressed page size to 100MB
    //! The max size in Parquet is 2GB, but we choose a more conservative limit
    static constexpr const idx_t MAX_UNCOMPRESSED_PAGE_SIZE = 104857600ULL;
    //! Dictionary pages must be below 2GB. Unlike data pages, there's only one dictionary page.
    //! For this reason we go with a much higher, but still conservative, upper bound of 1GB.
    static constexpr const idx_t MAX_UNCOMPRESSED_DICT_PAGE_SIZE = 1073741824ULL;

public:
    unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override;
    void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
                 bool vector_can_span_multiple_pages) override;
    void BeginWrite(ColumnWriterState &state) override;
    void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
    void FinalizeWrite(ColumnWriterState &state) override;

protected:
    static void WriteLevels(Allocator &allocator, WriteStream &temp_writer, const unsafe_vector<uint16_t> &levels,
                            idx_t max_value, idx_t start_offset, idx_t count, optional_idx null_count = optional_idx());

    virtual duckdb_parquet::Encoding::type GetEncoding(PrimitiveColumnWriterState &state);

    void NextPage(PrimitiveColumnWriterState &state);
    void FlushPage(PrimitiveColumnWriterState &state);

    //! Initializes the state used to track statistics during writing. Only used for scalar types.
    virtual unique_ptr<ColumnWriterStatistics> InitializeStatsState();

    //! Initialize the writer for a specific page. Only used for scalar types.
    virtual unique_ptr<ColumnWriterPageState> InitializePageState(PrimitiveColumnWriterState &state, idx_t page_idx);

    //! Flushes the writer for a specific page. Only used for scalar types.
    virtual void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state);

    //! Retrieves the row size of a vector at the specified location. Only used for scalar types.
    virtual idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state) const;
    //! Writes a (subset of a) vector to the specified serializer. Only used for scalar types.
    virtual void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats, ColumnWriterPageState *page_state,
                             Vector &vector, idx_t chunk_start, idx_t chunk_end) = 0;

    virtual bool HasDictionary(PrimitiveColumnWriterState &state_p) {
        return false;
    }
    //! The number of elements in the dictionary
    virtual idx_t DictionarySize(PrimitiveColumnWriterState &state_p);
    void WriteDictionary(PrimitiveColumnWriterState &state, unique_ptr<MemoryStream> temp_writer, idx_t row_count);
    virtual void FlushDictionary(PrimitiveColumnWriterState &state, ColumnWriterStatistics *stats);

    void SetParquetStatistics(PrimitiveColumnWriterState &state, duckdb_parquet::ColumnChunk &column);
    void RegisterToRowGroup(duckdb_parquet::RowGroup &row_group);
};

} // namespace duckdb

37  external/duckdb/extension/parquet/include/writer/struct_column_writer.hpp  vendored  Normal file
@@ -0,0 +1,37 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/struct_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "column_writer.hpp"

namespace duckdb {

class StructColumnWriter : public ColumnWriter {
public:
    StructColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
                       vector<unique_ptr<ColumnWriter>> child_writers_p, bool can_have_nulls)
        : ColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
        child_writers = std::move(child_writers_p);
    }
    ~StructColumnWriter() override = default;

public:
    unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override;
    bool HasAnalyze() override;
    void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) override;
    void FinalizeAnalyze(ColumnWriterState &state) override;
    void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
                 bool vector_can_span_multiple_pages) override;

    void BeginWrite(ColumnWriterState &state) override;
    void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
    void FinalizeWrite(ColumnWriterState &state) override;
};

} // namespace duckdb

444  external/duckdb/extension/parquet/include/writer/templated_column_writer.hpp  vendored  Normal file
@@ -0,0 +1,444 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/templated_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "writer/primitive_column_writer.hpp"
#include "writer/parquet_write_operators.hpp"
#include "parquet_dbp_encoder.hpp"
#include "parquet_dlba_encoder.hpp"
#include "parquet_rle_bp_encoder.hpp"
#include "duckdb/common/primitive_dictionary.hpp"

namespace duckdb {

template <class SRC, class TGT, class OP = ParquetCastOperator, bool ALL_VALID>
static void TemplatedWritePlain(Vector &col, ColumnWriterStatistics *stats, const idx_t chunk_start,
                                const idx_t chunk_end, const ValidityMask &mask, WriteStream &ser) {
    static constexpr bool COPY_DIRECTLY_FROM_VECTOR = ALL_VALID && std::is_same<SRC, TGT>::value &&
                                                      std::is_arithmetic<TGT>::value &&
                                                      std::is_same<OP, ParquetCastOperator>::value;

    const auto *const ptr = FlatVector::GetData<SRC>(col);

    TGT local_write[STANDARD_VECTOR_SIZE];
    idx_t local_write_count = 0;

    for (idx_t r = chunk_start; r < chunk_end; r++) {
        if (!ALL_VALID && !mask.RowIsValid(r)) {
            continue;
        }

        TGT target_value = OP::template Operation<SRC, TGT>(ptr[r]);
        OP::template HandleStats<SRC, TGT>(stats, target_value);

        if (COPY_DIRECTLY_FROM_VECTOR) {
            continue;
        }

        if (std::is_arithmetic<TGT>::value) {
            local_write[local_write_count++] = target_value;
            if (local_write_count == STANDARD_VECTOR_SIZE) {
                ser.WriteData(data_ptr_cast(local_write), local_write_count * sizeof(TGT));
                local_write_count = 0;
            }
        } else {
            OP::template WriteToStream<SRC, TGT>(target_value, ser);
        }
    }

    if (COPY_DIRECTLY_FROM_VECTOR) {
        ser.WriteData(const_data_ptr_cast(&ptr[chunk_start]), (chunk_end - chunk_start) * sizeof(TGT));
        return;
    }

    if (std::is_arithmetic<TGT>::value) {
        ser.WriteData(data_ptr_cast(local_write), local_write_count * sizeof(TGT));
    }
    // Else we already wrote to stream
}

template <class SRC, class TGT, class OP>
class StandardColumnWriterState : public PrimitiveColumnWriterState {
public:
    StandardColumnWriterState(ParquetWriter &writer, duckdb_parquet::RowGroup &row_group, idx_t col_idx)
        : PrimitiveColumnWriterState(writer, row_group, col_idx),
          dictionary(BufferAllocator::Get(writer.GetContext()),
                     writer.DictionarySizeLimit().IsValid() ? writer.DictionarySizeLimit().GetIndex()
                                                            : NumericCast<idx_t>(row_group.num_rows) / 5,
                     writer.StringDictionaryPageSizeLimit()),
          encoding(duckdb_parquet::Encoding::PLAIN) {
    }
    ~StandardColumnWriterState() override = default;

    // analysis state for the DELTA_BINARY_PACKED/DELTA_LENGTH_BYTE_ARRAY encoding decision
    idx_t total_value_count = 0;
    idx_t total_string_size = 0;
    uint32_t key_bit_width = 0;

    PrimitiveDictionary<SRC, TGT, OP> dictionary;
    duckdb_parquet::Encoding::type encoding;
};

template <class SRC, class TGT, class OP>
class StandardWriterPageState : public ColumnWriterPageState {
public:
    explicit StandardWriterPageState(const idx_t total_value_count, const idx_t total_string_size,
                                     duckdb_parquet::Encoding::type encoding_p,
                                     const PrimitiveDictionary<SRC, TGT, OP> &dictionary_p)
        : encoding(encoding_p), dbp_initialized(false), dbp_encoder(total_value_count), dlba_initialized(false),
          dlba_encoder(total_value_count, total_string_size), bss_initialized(false),
          bss_encoder(total_value_count, sizeof(TGT)), dictionary(dictionary_p), dict_written_value(false),
          dict_bit_width(RleBpDecoder::ComputeBitWidth(dictionary.GetSize())), dict_encoder(dict_bit_width) {
    }
    duckdb_parquet::Encoding::type encoding;

    bool dbp_initialized;
    DbpEncoder dbp_encoder;

    bool dlba_initialized;
    DlbaEncoder dlba_encoder;

    bool bss_initialized;
    BssEncoder bss_encoder;

    const PrimitiveDictionary<SRC, TGT, OP> &dictionary;
    bool dict_written_value;
    uint32_t dict_bit_width;
    RleBpEncoder dict_encoder;
};
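
A note on dict_bit_width above: it is the number of bits needed to address every dictionary entry, and RLE_DICTIONARY then stores each value as an index of exactly that width. A standalone sketch of the computation (mirroring what such a ComputeBitWidth helper has to deliver, not necessarily DuckDB's exact edge-case behavior):

    #include <cstdint>

    // Smallest bit width that can represent the indices 0 .. count-1.
    static uint32_t ComputeBitWidth(uint64_t count) {
        if (count <= 1) {
            return 0; // a single entry needs no index bits at all
        }
        uint64_t max_index = count - 1;
        uint32_t width = 0;
        while (max_index > 0) {
            width++;
            max_index >>= 1;
        }
        return width;
    }

    int main() {
        // 5 entries -> indices 0..4 -> 3 bits; 256 entries -> indices 0..255 -> 8 bits.
        return (ComputeBitWidth(5) == 3 && ComputeBitWidth(256) == 8) ? 0 : 1;
    }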

template <class SRC, class TGT, class OP = ParquetCastOperator>
class StandardColumnWriter : public PrimitiveColumnWriter {
public:
    StandardColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
                         vector<string> schema_path_p, // NOLINT
                         bool can_have_nulls)
        : PrimitiveColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
    }
    ~StandardColumnWriter() override = default;

public:
    unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override {
        auto result = make_uniq<StandardColumnWriterState<SRC, TGT, OP>>(writer, row_group, row_group.columns.size());
        result->encoding = duckdb_parquet::Encoding::RLE_DICTIONARY;
        RegisterToRowGroup(row_group);
        return std::move(result);
    }

    unique_ptr<ColumnWriterPageState> InitializePageState(PrimitiveColumnWriterState &state_p,
                                                          idx_t page_idx) override {
        auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
        const auto &page_info = state_p.page_info[page_idx];
        auto result = make_uniq<StandardWriterPageState<SRC, TGT, OP>>(
            page_info.row_count - (page_info.empty_count + page_info.null_count), state.total_string_size,
            state.encoding, state.dictionary);
        return std::move(result);
    }

    void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) override {
        auto &page_state = state_p->Cast<StandardWriterPageState<SRC, TGT, OP>>();
        switch (page_state.encoding) {
        case duckdb_parquet::Encoding::DELTA_BINARY_PACKED:
            if (!page_state.dbp_initialized) {
                page_state.dbp_encoder.BeginWrite(temp_writer, 0);
            }
            page_state.dbp_encoder.FinishWrite(temp_writer);
            break;
        case duckdb_parquet::Encoding::RLE_DICTIONARY:
            D_ASSERT(page_state.dict_bit_width != 0);
            if (!page_state.dict_written_value) {
                // all values are null
                // just write the bit width
                temp_writer.Write<uint8_t>(page_state.dict_bit_width);
                return;
            }
            page_state.dict_encoder.FinishWrite(temp_writer);
            break;
        case duckdb_parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY:
            if (!page_state.dlba_initialized) {
                page_state.dlba_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()), temp_writer,
                                                   string_t(""));
            }
            page_state.dlba_encoder.FinishWrite(temp_writer);
            break;
        case duckdb_parquet::Encoding::BYTE_STREAM_SPLIT:
            if (!page_state.bss_initialized) {
                page_state.bss_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()));
            }
            page_state.bss_encoder.FinishWrite(temp_writer);
            break;
        case duckdb_parquet::Encoding::PLAIN:
            break;
        default:
            throw InternalException("Unknown encoding");
        }
    }

    duckdb_parquet::Encoding::type GetEncoding(PrimitiveColumnWriterState &state_p) override {
        auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
        return state.encoding;
    }

    bool HasAnalyze() override {
        return true;
    }

    void Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) override {
        auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();

        auto data_ptr = FlatVector::GetData<SRC>(vector);
        idx_t vector_index = 0;

        const bool check_parent_empty = parent && !parent->is_empty.empty();
        const idx_t parent_index = state.definition_levels.size();
        D_ASSERT(!check_parent_empty || parent_index < parent->is_empty.size());

        const idx_t vcount =
            check_parent_empty ? parent->definition_levels.size() - state.definition_levels.size() : count;

        const auto &validity = FlatVector::Validity(vector);

        if (!check_parent_empty && validity.AllValid()) {
            // Fast path
            for (; vector_index < vcount; vector_index++) {
                const auto &src_value = data_ptr[vector_index];
                state.dictionary.Insert(src_value);
                state.total_value_count++;
                state.total_string_size += DlbaEncoder::GetStringSize(src_value);
            }
        } else {
            for (idx_t i = 0; i < vcount; i++) {
                if (check_parent_empty && parent->is_empty[parent_index + i]) {
                    continue;
                }
                if (validity.RowIsValid(vector_index)) {
                    const auto &src_value = data_ptr[vector_index];
                    state.dictionary.Insert(src_value);
                    state.total_value_count++;
                    state.total_string_size += DlbaEncoder::GetStringSize(src_value);
                }
                vector_index++;
            }
        }
    }

    void FinalizeAnalyze(ColumnWriterState &state_p) override {
        const auto type = writer.GetType(SchemaIndex());

        auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
        if (state.dictionary.GetSize() == 0 || state.dictionary.IsFull()) {
            state.dictionary.Reset();
            if (writer.GetParquetVersion() == ParquetVersion::V1) {
                // Can't do the cool stuff for V1
                state.encoding = duckdb_parquet::Encoding::PLAIN;
            } else {
                // If we aren't doing dictionary encoding, these encodings are virtually always better than PLAIN
                switch (type) {
                case duckdb_parquet::Type::type::INT32:
                case duckdb_parquet::Type::type::INT64:
                    state.encoding = duckdb_parquet::Encoding::DELTA_BINARY_PACKED;
                    break;
                case duckdb_parquet::Type::type::BYTE_ARRAY:
                    state.encoding = duckdb_parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY;
                    break;
                case duckdb_parquet::Type::type::FLOAT:
                case duckdb_parquet::Type::type::DOUBLE:
                    state.encoding = duckdb_parquet::Encoding::BYTE_STREAM_SPLIT;
                    break;
                default:
                    state.encoding = duckdb_parquet::Encoding::PLAIN;
                }
            }
        } else {
            state.key_bit_width = RleBpDecoder::ComputeBitWidth(state.dictionary.GetSize());
        }
    }

    unique_ptr<ColumnWriterStatistics> InitializeStatsState() override {
        return OP::template InitializeStats<SRC, TGT>();
    }

    bool HasDictionary(PrimitiveColumnWriterState &state_p) override {
        auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
        return state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY;
    }

    idx_t DictionarySize(PrimitiveColumnWriterState &state_p) override {
        auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
        return state.dictionary.GetSize();
    }

    void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats, ColumnWriterPageState *page_state_p,
                     Vector &input_column, idx_t chunk_start, idx_t chunk_end) override {
        const auto &mask = FlatVector::Validity(input_column);
        if (mask.AllValid()) {
            WriteVectorInternal<true>(temp_writer, stats, page_state_p, input_column, chunk_start, chunk_end);
        } else {
            WriteVectorInternal<false>(temp_writer, stats, page_state_p, input_column, chunk_start, chunk_end);
        }
    }

    void FlushDictionary(PrimitiveColumnWriterState &state_p, ColumnWriterStatistics *stats) override {
        auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
        D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY);

        if (writer.EnableBloomFilters()) {
            state.bloom_filter =
                make_uniq<ParquetBloomFilter>(state.dictionary.GetSize(), writer.BloomFilterFalsePositiveRatio());
        }

        state.dictionary.IterateValues([&](const SRC &src_value, const TGT &tgt_value) {
            // update the statistics
            OP::template HandleStats<SRC, TGT>(stats, tgt_value);
            if (state.bloom_filter) {
                // update the bloom filter
                auto hash = OP::template XXHash64<SRC, TGT>(tgt_value);
                state.bloom_filter->FilterInsert(hash);
            }
        });

        // flush the dictionary page and add it to the to-be-written pages
        WriteDictionary(state, state.dictionary.GetTargetMemoryStream(), state.dictionary.GetSize());
        // bloom filter will be queued for writing in ParquetWriter::BufferBloomFilter one level up
    }

    idx_t GetRowSize(const Vector &vector, const idx_t index,
                     const PrimitiveColumnWriterState &state_p) const override {
        auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
        if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY) {
            return (state.key_bit_width + 7) / 8;
        } else {
            return OP::template GetRowSize<SRC, TGT>(vector, index);
        }
    }

private:
    template <bool ALL_VALID>
    void WriteVectorInternal(WriteStream &temp_writer, ColumnWriterStatistics *stats,
                             ColumnWriterPageState *page_state_p, Vector &input_column, idx_t chunk_start,
                             idx_t chunk_end) {
        auto &page_state = page_state_p->Cast<StandardWriterPageState<SRC, TGT, OP>>();

        const auto &mask = FlatVector::Validity(input_column);
        const auto *data_ptr = FlatVector::GetData<SRC>(input_column);

        switch (page_state.encoding) {
        case duckdb_parquet::Encoding::RLE_DICTIONARY: {
            idx_t r = chunk_start;
            if (!page_state.dict_written_value) {
                // find first non-null value
                for (; r < chunk_end; r++) {
                    if (!mask.RowIsValid(r)) {
                        continue;
                    }
                    // write the bit-width as a one-byte entry and initialize writer
                    temp_writer.Write<uint8_t>(page_state.dict_bit_width);
                    page_state.dict_encoder.BeginWrite();
                    page_state.dict_written_value = true;
                    break;
                }
            }

            for (; r < chunk_end; r++) {
                if (!ALL_VALID && !mask.RowIsValid(r)) {
                    continue;
                }
                const auto &src_value = data_ptr[r];
                const auto value_index = page_state.dictionary.GetIndex(src_value);
                page_state.dict_encoder.WriteValue(temp_writer, value_index);
            }
            break;
        }
        case duckdb_parquet::Encoding::DELTA_BINARY_PACKED: {
            idx_t r = chunk_start;
            if (!page_state.dbp_initialized) {
                // find first non-null value
                for (; r < chunk_end; r++) {
                    if (!mask.RowIsValid(r)) {
                        continue;
                    }
                    const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
                    OP::template HandleStats<SRC, TGT>(stats, target_value);
                    page_state.dbp_encoder.BeginWrite(temp_writer, target_value);
                    page_state.dbp_initialized = true;
                    r++; // skip over
                    break;
                }
            }

            for (; r < chunk_end; r++) {
                if (!ALL_VALID && !mask.RowIsValid(r)) {
                    continue;
                }
                const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
                OP::template HandleStats<SRC, TGT>(stats, target_value);
                page_state.dbp_encoder.WriteValue(temp_writer, target_value);
            }
            break;
        }
        case duckdb_parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY: {
            idx_t r = chunk_start;
            if (!page_state.dlba_initialized) {
                // find first non-null value
                for (; r < chunk_end; r++) {
                    if (!mask.RowIsValid(r)) {
                        continue;
                    }
                    const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
                    OP::template HandleStats<SRC, TGT>(stats, target_value);
                    page_state.dlba_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()), temp_writer,
                                                       target_value);
                    page_state.dlba_initialized = true;
                    r++; // skip over
                    break;
                }
            }

            for (; r < chunk_end; r++) {
                if (!ALL_VALID && !mask.RowIsValid(r)) {
                    continue;
                }
                const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
                OP::template HandleStats<SRC, TGT>(stats, target_value);
                page_state.dlba_encoder.WriteValue(temp_writer, target_value);
            }
            break;
        }
        case duckdb_parquet::Encoding::BYTE_STREAM_SPLIT: {
            if (!page_state.bss_initialized) {
                page_state.bss_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()));
                page_state.bss_initialized = true;
            }
            for (idx_t r = chunk_start; r < chunk_end; r++) {
                if (!ALL_VALID && !mask.RowIsValid(r)) {
                    continue;
                }
                const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
                OP::template HandleStats<SRC, TGT>(stats, target_value);
                page_state.bss_encoder.WriteValue(target_value);
            }
            break;
        }
        case duckdb_parquet::Encoding::PLAIN: {
            D_ASSERT(page_state.encoding == duckdb_parquet::Encoding::PLAIN);
            if (mask.AllValid()) {
                TemplatedWritePlain<SRC, TGT, OP, true>(input_column, stats, chunk_start, chunk_end, mask, temp_writer);
            } else {
                TemplatedWritePlain<SRC, TGT, OP, false>(input_column, stats, chunk_start, chunk_end, mask,
                                                         temp_writer);
            }
            break;
        }
        default:
            throw InternalException("Unknown encoding");
        }
    }
};

} // namespace duckdb

30  external/duckdb/extension/parquet/include/writer/variant_column_writer.hpp  vendored  Normal file
@@ -0,0 +1,30 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/variant_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "struct_column_writer.hpp"
#include "duckdb/planner/expression/bound_function_expression.hpp"

namespace duckdb {

class VariantColumnWriter : public StructColumnWriter {
public:
    VariantColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
                        vector<unique_ptr<ColumnWriter>> child_writers_p, bool can_have_nulls)
        : StructColumnWriter(writer, column_schema, std::move(schema_path_p), std::move(child_writers_p),
                             can_have_nulls) {
    }
    ~VariantColumnWriter() override = default;

public:
    static ScalarFunction GetTransformFunction();
    static LogicalType TransformTypedValueRecursive(const LogicalType &type);
};

} // namespace duckdb