should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion


@@ -0,0 +1,34 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/array_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "writer/list_column_writer.hpp"
namespace duckdb {
class ArrayColumnWriter : public ListColumnWriter {
public:
ArrayColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
unique_ptr<ColumnWriter> child_writer_p, bool can_have_nulls)
: ListColumnWriter(writer, column_schema, std::move(schema_path_p), std::move(child_writer_p), can_have_nulls) {
}
~ArrayColumnWriter() override = default;
public:
void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) override;
void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) override;
void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
protected:
void WriteArrayState(ListColumnWriterState &state, idx_t array_size, uint16_t first_repeat_level,
idx_t define_value, const bool is_empty = false);
};
} // namespace duckdb
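
ArrayColumnWriter can reuse the list machinery because, at the level-encoding layer, a fixed-size ARRAY is a list whose length is known up front: every row contributes exactly array_size child values, so none of the is_empty bookkeeping applies (hence the is_empty = false default of WriteArrayState). A minimal sketch of the resulting levels, assuming a nullable ARRAY of nullable elements with max_repeat = 1 and max_define = 2 (the concrete values depend on the schema's nesting):

// Editor's illustration, not part of the commit: Dremel levels for one row
// of a fixed-size array. As in WriteArrayState, repetition level 0 starts a
// new row and max_repeat continues it.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
	const unsigned max_repeat = 1, max_define = 2;          // assumed nesting depth
	std::vector<bool> element_valid = {true, false, true};  // row [10, NULL, 30]
	for (std::size_t i = 0; i < element_valid.size(); i++) {
		unsigned repeat = i == 0 ? 0 : max_repeat;
		unsigned define = element_valid[i] ? max_define : max_define - 1;
		std::printf("(%u,%u) ", repeat, define); // prints (0,2) (1,1) (1,2)
	}
	std::printf("\n");
	return 0;
}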


@@ -0,0 +1,33 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/boolean_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "writer/primitive_column_writer.hpp"
namespace duckdb {
class BooleanColumnWriter : public PrimitiveColumnWriter {
public:
BooleanColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
bool can_have_nulls);
~BooleanColumnWriter() override = default;
public:
unique_ptr<ColumnWriterStatistics> InitializeStatsState() override;
void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p, ColumnWriterPageState *state_p,
Vector &input_column, idx_t chunk_start, idx_t chunk_end) override;
unique_ptr<ColumnWriterPageState> InitializePageState(PrimitiveColumnWriterState &state, idx_t page_idx) override;
void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) override;
idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state) const override;
};
} // namespace duckdb
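
The InitializePageState/FlushPageState overrides are presumably there because Parquet's PLAIN encoding bit-packs booleans: values accumulate into a byte, and the trailing partial byte can only be written once the page ends. A self-contained sketch of that buffering (an illustration, not the writer's actual page state):

// Editor's illustration: LSB-first bit-packing as used by PLAIN boolean pages.
#include <cstdint>
#include <vector>

struct BoolPacker {
	uint8_t byte = 0;
	uint8_t bits_used = 0;
	std::vector<uint8_t> out;

	void Write(bool value) {
		byte |= static_cast<uint8_t>(value) << bits_used;
		if (++bits_used == 8) {
			Flush();
		}
	}
	// the FlushPageState analogue: emit the trailing partial byte at page end
	void Flush() {
		if (bits_used > 0) {
			out.push_back(byte);
			byte = 0;
			bits_used = 0;
		}
	}
};

int main() {
	BoolPacker packer;
	for (bool b : {true, false, true}) {
		packer.Write(b);
	}
	packer.Flush();
	return packer.out[0] == 0x05 ? 0 : 1; // 0b101, LSB first
}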


@@ -0,0 +1,30 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/decimal_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "writer/primitive_column_writer.hpp"
namespace duckdb {
class FixedDecimalColumnWriter : public PrimitiveColumnWriter {
public:
FixedDecimalColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
vector<string> schema_path_p, bool can_have_nulls);
~FixedDecimalColumnWriter() override = default;
public:
unique_ptr<ColumnWriterStatistics> InitializeStatsState() override;
void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p, ColumnWriterPageState *page_state,
Vector &input_column, idx_t chunk_start, idx_t chunk_end) override;
idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state) const override;
};
} // namespace duckdb
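
FixedDecimalColumnWriter presumably covers decimals too wide for the plain int32/int64 paths; the Parquet spec stores such values as big-endian two's-complement FIXED_LEN_BYTE_ARRAY. A sketch of that byte layout (int64_t stands in for DuckDB's 128-bit hugeint, and the 16-byte width is an assumption):

// Editor's illustration of big-endian two's-complement FLBA encoding.
#include <cstddef>
#include <cstdint>
#include <cstdio>

void ToBigEndian(int64_t unscaled, uint8_t *bytes, size_t width) {
	for (size_t i = 0; i < width; i++) {
		size_t shift_bytes = width - 1 - i;
		// bytes beyond the source type are pure sign extension
		bytes[i] = shift_bytes < sizeof(unscaled)
		               ? static_cast<uint8_t>(unscaled >> (8 * shift_bytes))
		               : (unscaled < 0 ? 0xFF : 0x00);
	}
}

int main() {
	uint8_t buf[16];
	ToBigEndian(-2, buf, sizeof(buf)); // 16-byte FLBA: ff x15 followed by fe
	for (size_t i = 0; i < sizeof(buf); i++) {
		std::printf("%02x", buf[i]);
	}
	std::printf("\n");
	return 0;
}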


@@ -0,0 +1,50 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/enum_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "writer/primitive_column_writer.hpp"
namespace duckdb {
class EnumWriterPageState;
class EnumColumnWriter : public PrimitiveColumnWriter {
public:
EnumColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
bool can_have_nulls);
~EnumColumnWriter() override = default;
uint32_t bit_width;
public:
unique_ptr<ColumnWriterStatistics> InitializeStatsState() override;
void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p, ColumnWriterPageState *page_state_p,
Vector &input_column, idx_t chunk_start, idx_t chunk_end) override;
unique_ptr<ColumnWriterPageState> InitializePageState(PrimitiveColumnWriterState &state, idx_t page_idx) override;
void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) override;
duckdb_parquet::Encoding::type GetEncoding(PrimitiveColumnWriterState &state) override;
bool HasDictionary(PrimitiveColumnWriterState &state) override;
idx_t DictionarySize(PrimitiveColumnWriterState &state_p) override;
void FlushDictionary(PrimitiveColumnWriterState &state, ColumnWriterStatistics *stats_p) override;
idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state) const override;
private:
template <class T>
void WriteEnumInternal(WriteStream &temp_writer, Vector &input_column, idx_t chunk_start, idx_t chunk_end,
EnumWriterPageState &page_state);
};
} // namespace duckdb
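
The bit_width member ties into dictionary encoding: enum values are written as a dictionary of the enum's strings, with keys bit-packed at the smallest width w such that 2^w covers the dictionary. StandardColumnWriter below uses RleBpDecoder::ComputeBitWidth for the same purpose; a sketch of the computation (edge cases of the real helper may differ):

// Editor's sketch of a ceil(log2)-style bit-width computation.
#include <cstdint>

uint32_t ComputeBitWidth(uint64_t value_count) {
	uint32_t width = 0;
	while ((1ULL << width) < value_count) {
		width++;
	}
	return width;
}

int main() {
	// an enum with 5 entries needs 3-bit keys; one with 256 entries needs 8
	return ComputeBitWidth(5) == 3 && ComputeBitWidth(256) == 8 ? 0 : 1;
}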


@@ -0,0 +1,52 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/list_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_writer.hpp"
namespace duckdb {
class ListColumnWriterState : public ColumnWriterState {
public:
ListColumnWriterState(duckdb_parquet::RowGroup &row_group, idx_t col_idx) : row_group(row_group), col_idx(col_idx) {
}
~ListColumnWriterState() override = default;
duckdb_parquet::RowGroup &row_group;
idx_t col_idx;
unique_ptr<ColumnWriterState> child_state;
idx_t parent_index = 0;
};
class ListColumnWriter : public ColumnWriter {
public:
ListColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
unique_ptr<ColumnWriter> child_writer_p, bool can_have_nulls)
: ColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
child_writers.push_back(std::move(child_writer_p));
}
~ListColumnWriter() override = default;
public:
unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override;
bool HasAnalyze() override;
void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) override;
void FinalizeAnalyze(ColumnWriterState &state) override;
void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) override;
void BeginWrite(ColumnWriterState &state) override;
void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
void FinalizeWrite(ColumnWriterState &state) override;
protected:
ColumnWriter &GetChildWriter();
};
} // namespace duckdb
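
ListColumnWriter implements the standard Dremel shredding: repetition level 0 starts a new row, and the definition level distinguishes a NULL list, an empty list (tracked via is_empty in ColumnWriterState), a present-but-NULL element, and a real value. A worked sketch for a nullable LIST of nullable INTs, assuming max_repeat = 1 and max_define = 3:

// Editor's illustration of the level scheme, independent of the classes above.
#include <cstddef>
#include <cstdio>
#include <optional>
#include <vector>

using List = std::optional<std::vector<std::optional<int>>>;

void EmitLevels(const List &row) {
	const unsigned max_repeat = 1, max_define = 3;
	if (!row) {
		std::printf("(0,0) "); // the list itself is NULL
		return;
	}
	if (row->empty()) {
		std::printf("(0,1) "); // list present but empty
		return;
	}
	for (std::size_t i = 0; i < row->size(); i++) {
		unsigned repeat = i == 0 ? 0 : max_repeat;
		unsigned define = (*row)[i] ? max_define : max_define - 1;
		std::printf("(%u,%u) ", repeat, define);
	}
}

int main() {
	std::vector<List> rows;
	rows.push_back(std::vector<std::optional<int>>{1, std::nullopt, 2});
	rows.push_back(std::vector<std::optional<int>>{}); // []
	rows.push_back(std::nullopt);                      // NULL
	for (const auto &row : rows) {
		EmitLevels(row);
	}
	std::printf("\n"); // prints (0,3) (1,2) (1,3) (0,1) (0,0)
	return 0;
}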


@@ -0,0 +1,326 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/parquet_write_operators.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "writer/parquet_write_stats.hpp"
#include "zstd/common/xxhash.hpp"
#include "duckdb/common/types/uhugeint.hpp"
#include "duckdb/common/types/uuid.hpp"
namespace duckdb {
struct BaseParquetOperator {
template <class SRC, class TGT>
static void WriteToStream(const TGT &input, WriteStream &ser) {
ser.WriteData(const_data_ptr_cast(&input), sizeof(TGT));
}
template <class SRC, class TGT>
static constexpr idx_t WriteSize(const TGT &input) {
return sizeof(TGT);
}
template <class SRC, class TGT>
static uint64_t XXHash64(const TGT &target_value) {
return duckdb_zstd::XXH64(&target_value, sizeof(target_value), 0);
}
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return nullptr;
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
}
template <class SRC, class TGT>
static idx_t GetRowSize(const Vector &, idx_t) {
return sizeof(TGT);
}
};
struct ParquetCastOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return TGT(input);
}
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<NumericStatisticsState<SRC, TGT, BaseParquetOperator>>();
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
auto &numeric_stats = stats->Cast<NumericStatisticsState<SRC, TGT, BaseParquetOperator>>();
if (LessThan::Operation(target_value, numeric_stats.min)) {
numeric_stats.min = target_value;
}
if (GreaterThan::Operation(target_value, numeric_stats.max)) {
numeric_stats.max = target_value;
}
}
};
struct FloatingPointOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return TGT(input);
}
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<FloatingPointStatisticsState<SRC, TGT, BaseParquetOperator>>();
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
auto &numeric_stats = stats->Cast<FloatingPointStatisticsState<SRC, TGT, BaseParquetOperator>>();
if (Value::IsNan(target_value)) {
numeric_stats.has_nan = true;
} else {
if (LessThan::Operation(target_value, numeric_stats.min)) {
numeric_stats.min = target_value;
}
if (GreaterThan::Operation(target_value, numeric_stats.max)) {
numeric_stats.max = target_value;
}
}
}
};
struct ParquetTimestampNSOperator : public ParquetCastOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return TGT(input);
}
};
struct ParquetTimestampSOperator : public ParquetCastOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return Timestamp::FromEpochSecondsPossiblyInfinite(input).value;
}
};
// We will need a different operator for GEOGRAPHY later, so we define a base geo operator
struct ParquetBaseGeoOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return input;
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
auto &geo_stats = stats->Cast<GeoStatisticsState>();
geo_stats.Update(target_value);
}
template <class SRC, class TGT>
static void WriteToStream(const TGT &target_value, WriteStream &ser) {
ser.Write<uint32_t>(target_value.GetSize());
ser.WriteData(const_data_ptr_cast(target_value.GetData()), target_value.GetSize());
}
template <class SRC, class TGT>
static idx_t WriteSize(const TGT &target_value) {
return sizeof(uint32_t) + target_value.GetSize();
}
template <class SRC, class TGT>
static uint64_t XXHash64(const TGT &target_value) {
return duckdb_zstd::XXH64(target_value.GetData(), target_value.GetSize(), 0);
}
template <class SRC, class TGT>
static idx_t GetRowSize(const Vector &vector, idx_t index) {
// This needs to add the 4 bytes (just like WriteSize), otherwise we underestimate the size and have to realloc,
// which seriously harms performance, mostly by making it very inconsistent (see internal issue #4990)
return sizeof(uint32_t) + FlatVector::GetData<string_t>(vector)[index].GetSize();
}
};
struct ParquetGeometryOperator : public ParquetBaseGeoOperator {
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<GeoStatisticsState>();
}
};
struct ParquetBaseStringOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return input;
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
auto &string_stats = stats->Cast<StringStatisticsState>();
string_stats.Update(target_value);
}
template <class SRC, class TGT>
static void WriteToStream(const TGT &target_value, WriteStream &ser) {
ser.Write<uint32_t>(target_value.GetSize());
ser.WriteData(const_data_ptr_cast(target_value.GetData()), target_value.GetSize());
}
template <class SRC, class TGT>
static idx_t WriteSize(const TGT &target_value) {
return sizeof(uint32_t) + target_value.GetSize();
}
template <class SRC, class TGT>
static uint64_t XXHash64(const TGT &target_value) {
return duckdb_zstd::XXH64(target_value.GetData(), target_value.GetSize(), 0);
}
template <class SRC, class TGT>
static idx_t GetRowSize(const Vector &vector, idx_t index) {
// This needs to add the 4 bytes (just like WriteSize), otherwise we underestimate the size and have to realloc,
// which seriously harms performance, mostly by making it very inconsistent (see internal issue #4990)
return sizeof(uint32_t) + FlatVector::GetData<string_t>(vector)[index].GetSize();
}
};
struct ParquetBlobOperator : public ParquetBaseStringOperator {
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<StringStatisticsState>(LogicalTypeId::BLOB);
}
};
struct ParquetStringOperator : public ParquetBaseStringOperator {
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<StringStatisticsState>();
}
};
struct ParquetIntervalTargetType {
static constexpr const idx_t PARQUET_INTERVAL_SIZE = 12;
data_t bytes[PARQUET_INTERVAL_SIZE];
};
struct ParquetIntervalOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
if (input.days < 0 || input.months < 0 || input.micros < 0) {
throw IOException("Parquet files do not support negative intervals");
}
TGT result;
Store<uint32_t>(input.months, result.bytes);
Store<uint32_t>(input.days, result.bytes + sizeof(uint32_t));
Store<uint32_t>(input.micros / 1000, result.bytes + sizeof(uint32_t) * 2);
return result;
}
template <class SRC, class TGT>
static void WriteToStream(const TGT &target_value, WriteStream &ser) {
ser.WriteData(target_value.bytes, ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE);
}
template <class SRC, class TGT>
static constexpr idx_t WriteSize(const TGT &target_value) {
return ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE;
}
template <class SRC, class TGT>
static uint64_t XXHash64(const TGT &target_value) {
return duckdb_zstd::XXH64(target_value.bytes, ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE, 0);
}
};
struct ParquetUUIDTargetType {
static constexpr const idx_t PARQUET_UUID_SIZE = 16;
data_t bytes[PARQUET_UUID_SIZE];
};
struct ParquetUUIDOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
TGT result;
// Use the utility function from BaseUUID
BaseUUID::ToBlob(input, result.bytes);
return result;
}
template <class SRC, class TGT>
static void WriteToStream(const TGT &target_value, WriteStream &ser) {
ser.WriteData(target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE);
}
template <class SRC, class TGT>
static constexpr idx_t WriteSize(const TGT &target_value) {
return ParquetUUIDTargetType::PARQUET_UUID_SIZE;
}
template <class SRC, class TGT>
static uint64_t XXHash64(const TGT &target_value) {
return duckdb_zstd::XXH64(target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE, 0);
}
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<UUIDStatisticsState>();
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats_p, TGT target_value) {
auto &stats = stats_p->Cast<UUIDStatisticsState>();
if (!stats.has_stats || memcmp(target_value.bytes, stats.min, ParquetUUIDTargetType::PARQUET_UUID_SIZE) < 0) {
memcpy(stats.min, target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE);
}
if (!stats.has_stats || memcmp(target_value.bytes, stats.max, ParquetUUIDTargetType::PARQUET_UUID_SIZE) > 0) {
memcpy(stats.max, target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE);
}
stats.has_stats = true;
}
};
struct ParquetTimeTZOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return input.time().micros;
}
};
struct ParquetHugeintOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return Hugeint::Cast<double>(input);
}
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<ColumnWriterStatistics>();
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
}
};
struct ParquetUhugeintOperator : public BaseParquetOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
return Uhugeint::Cast<double>(input);
}
template <class SRC, class TGT>
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
return make_uniq<ColumnWriterStatistics>();
}
template <class SRC, class TGT>
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
}
};
} // namespace duckdb
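
Most of these operators only adapt conversion and statistics handling; the serialized forms themselves are fixed by the Parquet spec. ParquetIntervalOperator, for instance, produces Parquet's 12-byte INTERVAL layout of three little-endian uint32 fields (months, days, milliseconds) - note that Operation divides micros by 1000, dropping sub-millisecond precision, and rejects negative components outright. A sketch of the layout (assumes a little-endian host, as the Store<> calls above effectively do):

// Editor's illustration of the 12-byte Parquet INTERVAL layout.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
	uint32_t months = 14, days = 3, millis = 5; // 1 year 2 months 3 days 5 ms
	uint8_t bytes[12];
	std::memcpy(bytes + 0, &months, sizeof(uint32_t));
	std::memcpy(bytes + 4, &days, sizeof(uint32_t));
	std::memcpy(bytes + 8, &millis, sizeof(uint32_t));
	for (uint8_t b : bytes) {
		std::printf("%02x ", b); // 0e 00 00 00 03 00 00 00 05 00 00 00
	}
	std::printf("\n");
	return 0;
}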


@@ -0,0 +1,305 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/parquet_write_stats.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_writer.hpp"
#include "geo_parquet.hpp"
namespace duckdb {
class ColumnWriterStatistics {
public:
virtual ~ColumnWriterStatistics();
virtual bool HasStats();
virtual string GetMin();
virtual string GetMax();
virtual string GetMinValue();
virtual string GetMaxValue();
virtual bool CanHaveNaN();
virtual bool HasNaN();
virtual bool MinIsExact();
virtual bool MaxIsExact();
virtual bool HasGeoStats();
virtual optional_ptr<GeometryStatsData> GetGeoStats();
virtual void WriteGeoStats(duckdb_parquet::GeospatialStatistics &stats);
public:
template <class TARGET>
TARGET &Cast() {
DynamicCastCheck<TARGET>(this);
return reinterpret_cast<TARGET &>(*this);
}
template <class TARGET>
const TARGET &Cast() const {
D_ASSERT(dynamic_cast<const TARGET *>(this));
return reinterpret_cast<const TARGET &>(*this);
}
};
template <class SRC, class T, class OP>
class NumericStatisticsState : public ColumnWriterStatistics {
public:
NumericStatisticsState() : min(NumericLimits<T>::Maximum()), max(NumericLimits<T>::Minimum()) {
}
T min;
T max;
public:
bool HasStats() override {
return min <= max;
}
string GetMin() override {
return NumericLimits<SRC>::IsSigned() ? GetMinValue() : string();
}
string GetMax() override {
return NumericLimits<SRC>::IsSigned() ? GetMaxValue() : string();
}
string GetMinValue() override {
return HasStats() ? string(char_ptr_cast(&min), sizeof(T)) : string();
}
string GetMaxValue() override {
return HasStats() ? string(char_ptr_cast(&max), sizeof(T)) : string();
}
};
template <class SRC, class T, class OP>
class FloatingPointStatisticsState : public NumericStatisticsState<SRC, T, OP> {
public:
bool has_nan = false;
public:
bool CanHaveNaN() override {
return true;
}
bool HasNaN() override {
return has_nan;
}
};
class StringStatisticsState : public ColumnWriterStatistics {
static constexpr const idx_t MAX_STRING_STATISTICS_SIZE = 256;
public:
explicit StringStatisticsState(LogicalTypeId type = LogicalTypeId::VARCHAR)
: type(type), has_stats(false), min_truncated(false), max_truncated(false), min(), max() {
}
LogicalTypeId type;
bool has_stats;
bool min_truncated;
bool max_truncated;
bool failed_truncate = false;
string min;
string max;
public:
bool HasStats() override {
return has_stats;
}
void Update(const string_t &val) {
if (failed_truncate) {
return;
}
if (!has_stats || LessThan::Operation(val, string_t(min))) {
if (val.GetSize() > MAX_STRING_STATISTICS_SIZE) {
// string value exceeds our max string stats size - truncate
min = TruncateMin(val, MAX_STRING_STATISTICS_SIZE);
min_truncated = true;
} else {
min = val.GetString();
min_truncated = false;
}
}
if (!has_stats || GreaterThan::Operation(val, string_t(max))) {
if (val.GetSize() > MAX_STRING_STATISTICS_SIZE) {
// string value exceeds our max string stats size - truncate
if (!TryTruncateMax(val, MAX_STRING_STATISTICS_SIZE, max)) {
// we failed to truncate - this can happen in some edge cases
// skip stats for this column
failed_truncate = true;
has_stats = false;
min = string();
max = string();
return;
}
max_truncated = true;
} else {
max = val.GetString();
max_truncated = false;
}
}
has_stats = true;
}
static inline bool IsCharacter(char c) {
return (c & 0xc0) != 0x80;
}
string TruncateMin(string_t str, idx_t max_size) {
// truncate a string for the min value
// since 'AAA' < 'AAAA', we can just truncate the string
D_ASSERT(str.GetSize() > max_size);
if (type == LogicalTypeId::BLOB) {
// for blobs - just truncate directly
return string(str.GetData(), max_size);
}
D_ASSERT(type == LogicalTypeId::VARCHAR);
// for varchar the result must remain valid UTF8 - so we truncate back to the last character boundary
auto str_data = str.GetData();
for (; max_size > 0; max_size--) {
if (IsCharacter(str_data[max_size])) {
break;
}
}
return string(str_data, max_size);
}
bool TryTruncateMax(string_t str, idx_t max_size, string &result, data_t max_byte) {
auto data = const_data_ptr_cast(str.GetData());
// find the last position in the string which we can increment for the truncation
// if ALL characters are above the max byte we cannot truncate
idx_t increment_pos;
for (increment_pos = max_size; increment_pos > 0; increment_pos--) {
idx_t str_idx = increment_pos - 1;
if (data[str_idx] < max_byte) {
// found the increment position
break;
}
}
if (increment_pos == 0) {
// all characters are above the max byte - we cannot truncate - return false
return false;
}
// set up the result string - we don't care about anything after the increment pos
result = string(str.GetData(), increment_pos);
// actually increment
result[increment_pos - 1]++;
return true;
}
bool TryTruncateMax(string_t str, idx_t max_size, string &result) {
// truncate a string for the max value
// since 'XXX' < 'XXXX', we need to "increment" a byte to get a correct max value
// i.e. we need to generate 'XXY' as a string
// note that this is not necessarily always possible
D_ASSERT(str.GetSize() > max_size);
if (type == LogicalTypeId::BLOB) {
// for blobs we can always increment bytes - we just can't increment past the maximum value of a single byte (0xFF)
return TryTruncateMax(str, max_size, result, static_cast<data_t>(0xFF));
}
D_ASSERT(type == LogicalTypeId::VARCHAR);
// for varchar the situation is more complex - we need to truncate to a valid UTF8 string and increment
// for now we only increment ASCII characters (characters below 0x7F)
return TryTruncateMax(str, max_size, result, static_cast<data_t>(0x7F));
}
string GetMin() override {
return GetMinValue();
}
string GetMax() override {
return GetMaxValue();
}
string GetMinValue() override {
return HasStats() ? min : string();
}
string GetMaxValue() override {
return HasStats() ? max : string();
}
bool MinIsExact() override {
return !min_truncated;
}
bool MaxIsExact() override {
return !max_truncated;
}
};
class UUIDStatisticsState : public ColumnWriterStatistics {
public:
bool has_stats = false;
data_t min[16] = {0};
data_t max[16] = {0};
public:
bool HasStats() override {
return has_stats;
}
string GetMin() override {
return GetMinValue();
}
string GetMax() override {
return GetMaxValue();
}
string GetMinValue() override {
return HasStats() ? string(char_ptr_cast(min), 16) : string();
}
string GetMaxValue() override {
return HasStats() ? string(char_ptr_cast(max), 16) : string();
}
};
class GeoStatisticsState final : public ColumnWriterStatistics {
public:
explicit GeoStatisticsState() : has_stats(false) {
geo_stats.SetEmpty();
}
bool has_stats;
GeometryStatsData geo_stats;
public:
void Update(const string_t &val) {
geo_stats.Update(val);
has_stats = true;
}
bool HasGeoStats() override {
return has_stats;
}
optional_ptr<GeometryStatsData> GetGeoStats() override {
return geo_stats;
}
void WriteGeoStats(duckdb_parquet::GeospatialStatistics &stats) override {
const auto &types = geo_stats.types;
const auto &bbox = geo_stats.extent;
if (bbox.HasXY()) {
stats.__isset.bbox = true;
stats.bbox.xmin = bbox.x_min;
stats.bbox.xmax = bbox.x_max;
stats.bbox.ymin = bbox.y_min;
stats.bbox.ymax = bbox.y_max;
if (bbox.HasZ()) {
stats.bbox.__isset.zmin = true;
stats.bbox.__isset.zmax = true;
stats.bbox.zmin = bbox.z_min;
stats.bbox.zmax = bbox.z_max;
}
if (bbox.HasM()) {
stats.bbox.__isset.mmin = true;
stats.bbox.__isset.mmax = true;
stats.bbox.mmin = bbox.m_min;
stats.bbox.mmax = bbox.m_max;
}
}
stats.__isset.geospatial_types = true;
stats.geospatial_types = types.ToWKBList();
}
};
} // namespace duckdb
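
The subtlety in StringStatisticsState is that truncated min/max values must still bound the real data: a min can simply be cut (a prefix sorts no later than the original), but a max must have a byte bumped upward, which is impossible when every candidate byte is already at its ceiling. A standalone re-implementation of the byte-oriented (BLOB) flavour, for illustration only - the class above additionally respects UTF8 boundaries and uses a 0x7F ceiling for VARCHAR:

// Editor's illustration of min/max truncation for statistics.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>

std::string TruncateMin(const std::string &s, std::size_t max_size) {
	return s.substr(0, max_size); // a prefix always sorts <= the original
}

bool TryTruncateMax(const std::string &s, std::size_t max_size, std::string &result) {
	for (std::size_t pos = max_size; pos > 0; pos--) {
		if (static_cast<uint8_t>(s[pos - 1]) < 0xFF) {
			result = s.substr(0, pos);
			result.back()++; // bump so the result sorts > the original
			return true;
		}
	}
	return false; // every byte is at the ceiling - no valid upper bound
}

int main() {
	std::string max;
	assert(TruncateMin("AAAA", 3) == "AAA");
	assert(TryTruncateMax("XXXX", 3, max) && max == "XXY");
	assert(!TryTruncateMax("\xFF\xFF\xFF\xFF", 3, max));
	return 0;
}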


@@ -0,0 +1,115 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/primitive_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_writer.hpp"
#include "writer/parquet_write_stats.hpp"
#include "duckdb/common/serializer/memory_stream.hpp"
#include "parquet_statistics.hpp"
namespace duckdb {
struct PageInformation {
idx_t offset = 0;
idx_t row_count = 0;
idx_t empty_count = 0;
idx_t estimated_page_size = 0;
idx_t null_count = 0;
};
struct PageWriteInformation {
duckdb_parquet::PageHeader page_header;
unique_ptr<MemoryStream> temp_writer;
unique_ptr<ColumnWriterPageState> page_state;
idx_t write_page_idx = 0;
idx_t write_count = 0;
idx_t max_write_count = 0;
size_t compressed_size;
data_ptr_t compressed_data;
AllocatedData compressed_buf;
};
class PrimitiveColumnWriterState : public ColumnWriterState {
public:
PrimitiveColumnWriterState(ParquetWriter &writer_p, duckdb_parquet::RowGroup &row_group, idx_t col_idx)
: writer(writer_p), row_group(row_group), col_idx(col_idx) {
page_info.emplace_back();
}
~PrimitiveColumnWriterState() override = default;
ParquetWriter &writer;
duckdb_parquet::RowGroup &row_group;
idx_t col_idx;
vector<PageInformation> page_info;
vector<PageWriteInformation> write_info;
unique_ptr<ColumnWriterStatistics> stats_state;
idx_t current_page = 0;
unique_ptr<ParquetBloomFilter> bloom_filter;
};
//! Base class for writing non-compound types (e.g. numerics, strings)
class PrimitiveColumnWriter : public ColumnWriter {
public:
PrimitiveColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path,
bool can_have_nulls);
~PrimitiveColumnWriter() override = default;
//! We limit the uncompressed page size to 100MB
//! The max size in Parquet is 2GB, but we choose a more conservative limit
static constexpr const idx_t MAX_UNCOMPRESSED_PAGE_SIZE = 104857600ULL;
//! Dictionary pages must be below 2GB. Unlike data pages, there's only one dictionary page.
//! For this reason we go with a much higher, but still conservative, upper bound of 1GB.
static constexpr const idx_t MAX_UNCOMPRESSED_DICT_PAGE_SIZE = 1073741824ULL;
public:
unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override;
void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) override;
void BeginWrite(ColumnWriterState &state) override;
void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
void FinalizeWrite(ColumnWriterState &state) override;
protected:
static void WriteLevels(Allocator &allocator, WriteStream &temp_writer, const unsafe_vector<uint16_t> &levels,
idx_t max_value, idx_t start_offset, idx_t count, optional_idx null_count = optional_idx());
virtual duckdb_parquet::Encoding::type GetEncoding(PrimitiveColumnWriterState &state);
void NextPage(PrimitiveColumnWriterState &state);
void FlushPage(PrimitiveColumnWriterState &state);
//! Initializes the state used to track statistics during writing. Only used for scalar types.
virtual unique_ptr<ColumnWriterStatistics> InitializeStatsState();
//! Initialize the writer for a specific page. Only used for scalar types.
virtual unique_ptr<ColumnWriterPageState> InitializePageState(PrimitiveColumnWriterState &state, idx_t page_idx);
//! Flushes the writer for a specific page. Only used for scalar types.
virtual void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state);
//! Retrieves the row size of a vector at the specified location. Only used for scalar types.
virtual idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state) const;
//! Writes a (subset of a) vector to the specified serializer. Only used for scalar types.
virtual void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats, ColumnWriterPageState *page_state,
Vector &vector, idx_t chunk_start, idx_t chunk_end) = 0;
virtual bool HasDictionary(PrimitiveColumnWriterState &state_p) {
return false;
}
//! The number of elements in the dictionary
virtual idx_t DictionarySize(PrimitiveColumnWriterState &state_p);
void WriteDictionary(PrimitiveColumnWriterState &state, unique_ptr<MemoryStream> temp_writer, idx_t row_count);
virtual void FlushDictionary(PrimitiveColumnWriterState &state, ColumnWriterStatistics *stats);
void SetParquetStatistics(PrimitiveColumnWriterState &state, duckdb_parquet::ColumnChunk &column);
void RegisterToRowGroup(duckdb_parquet::RowGroup &row_group);
};
} // namespace duckdb
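
PageInformation is what drives page splitting: during Prepare, per-row sizes (via GetRowSize) are accumulated into estimated_page_size, and a fresh page entry is started before MAX_UNCOMPRESSED_PAGE_SIZE would be crossed. A minimal sketch of that accumulation - an assumption about the logic in the accompanying .cpp, which this header-only diff does not show:

// Editor's sketch, not the actual implementation.
#include <cstdint>
#include <vector>

struct PageInfo {
	uint64_t row_count = 0;
	uint64_t estimated_page_size = 0;
};

void AppendRow(std::vector<PageInfo> &pages, uint64_t row_size, uint64_t max_page_size) {
	if (pages.empty()) {
		pages.emplace_back();
	}
	if (pages.back().row_count > 0 &&
	    pages.back().estimated_page_size + row_size > max_page_size) {
		pages.emplace_back(); // start a new page instead of overflowing
	}
	pages.back().row_count++;
	pages.back().estimated_page_size += row_size;
}

int main() {
	std::vector<PageInfo> pages;
	for (int i = 0; i < 10; i++) {
		AppendRow(pages, 40, 100); // 100-byte page cap, 40-byte rows
	}
	return pages.size() == 5 ? 0 : 1; // two rows fit per page
}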


@@ -0,0 +1,37 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/struct_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "column_writer.hpp"
namespace duckdb {
class StructColumnWriter : public ColumnWriter {
public:
StructColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
vector<unique_ptr<ColumnWriter>> child_writers_p, bool can_have_nulls)
: ColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
child_writers = std::move(child_writers_p);
}
~StructColumnWriter() override = default;
public:
unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override;
bool HasAnalyze() override;
void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) override;
void FinalizeAnalyze(ColumnWriterState &state) override;
void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) override;
void BeginWrite(ColumnWriterState &state) override;
void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
void FinalizeWrite(ColumnWriterState &state) override;
};
} // namespace duckdb


@@ -0,0 +1,444 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/templated_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "writer/primitive_column_writer.hpp"
#include "writer/parquet_write_operators.hpp"
#include "parquet_dbp_encoder.hpp"
#include "parquet_dlba_encoder.hpp"
#include "parquet_rle_bp_encoder.hpp"
#include "duckdb/common/primitive_dictionary.hpp"
namespace duckdb {
template <class SRC, class TGT, class OP = ParquetCastOperator, bool ALL_VALID>
static void TemplatedWritePlain(Vector &col, ColumnWriterStatistics *stats, const idx_t chunk_start,
const idx_t chunk_end, const ValidityMask &mask, WriteStream &ser) {
static constexpr bool COPY_DIRECTLY_FROM_VECTOR = ALL_VALID && std::is_same<SRC, TGT>::value &&
std::is_arithmetic<TGT>::value &&
std::is_same<OP, ParquetCastOperator>::value;
const auto *const ptr = FlatVector::GetData<SRC>(col);
TGT local_write[STANDARD_VECTOR_SIZE];
idx_t local_write_count = 0;
for (idx_t r = chunk_start; r < chunk_end; r++) {
if (!ALL_VALID && !mask.RowIsValid(r)) {
continue;
}
TGT target_value = OP::template Operation<SRC, TGT>(ptr[r]);
OP::template HandleStats<SRC, TGT>(stats, target_value);
if (COPY_DIRECTLY_FROM_VECTOR) {
continue;
}
if (std::is_arithmetic<TGT>::value) {
local_write[local_write_count++] = target_value;
if (local_write_count == STANDARD_VECTOR_SIZE) {
ser.WriteData(data_ptr_cast(local_write), local_write_count * sizeof(TGT));
local_write_count = 0;
}
} else {
OP::template WriteToStream<SRC, TGT>(target_value, ser);
}
}
if (COPY_DIRECTLY_FROM_VECTOR) {
ser.WriteData(const_data_ptr_cast(&ptr[chunk_start]), (chunk_end - chunk_start) * sizeof(TGT));
return;
}
if (std::is_arithmetic<TGT>::value) {
ser.WriteData(data_ptr_cast(local_write), local_write_count * sizeof(TGT));
}
// Else we already wrote to stream
}
template <class SRC, class TGT, class OP>
class StandardColumnWriterState : public PrimitiveColumnWriterState {
public:
StandardColumnWriterState(ParquetWriter &writer, duckdb_parquet::RowGroup &row_group, idx_t col_idx)
: PrimitiveColumnWriterState(writer, row_group, col_idx),
dictionary(BufferAllocator::Get(writer.GetContext()),
writer.DictionarySizeLimit().IsValid() ? writer.DictionarySizeLimit().GetIndex()
: NumericCast<idx_t>(row_group.num_rows) / 5,
writer.StringDictionaryPageSizeLimit()),
encoding(duckdb_parquet::Encoding::PLAIN) {
}
~StandardColumnWriterState() override = default;
// analysis state for the DELTA_BINARY_PACKED/DELTA_LENGTH_BYTE_ARRAY encoding decision
idx_t total_value_count = 0;
idx_t total_string_size = 0;
uint32_t key_bit_width = 0;
PrimitiveDictionary<SRC, TGT, OP> dictionary;
duckdb_parquet::Encoding::type encoding;
};
template <class SRC, class TGT, class OP>
class StandardWriterPageState : public ColumnWriterPageState {
public:
explicit StandardWriterPageState(const idx_t total_value_count, const idx_t total_string_size,
duckdb_parquet::Encoding::type encoding_p,
const PrimitiveDictionary<SRC, TGT, OP> &dictionary_p)
: encoding(encoding_p), dbp_initialized(false), dbp_encoder(total_value_count), dlba_initialized(false),
dlba_encoder(total_value_count, total_string_size), bss_initialized(false),
bss_encoder(total_value_count, sizeof(TGT)), dictionary(dictionary_p), dict_written_value(false),
dict_bit_width(RleBpDecoder::ComputeBitWidth(dictionary.GetSize())), dict_encoder(dict_bit_width) {
}
duckdb_parquet::Encoding::type encoding;
bool dbp_initialized;
DbpEncoder dbp_encoder;
bool dlba_initialized;
DlbaEncoder dlba_encoder;
bool bss_initialized;
BssEncoder bss_encoder;
const PrimitiveDictionary<SRC, TGT, OP> &dictionary;
bool dict_written_value;
uint32_t dict_bit_width;
RleBpEncoder dict_encoder;
};
template <class SRC, class TGT, class OP = ParquetCastOperator>
class StandardColumnWriter : public PrimitiveColumnWriter {
public:
StandardColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
vector<string> schema_path_p, // NOLINT
bool can_have_nulls)
: PrimitiveColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
}
~StandardColumnWriter() override = default;
public:
unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) override {
auto result = make_uniq<StandardColumnWriterState<SRC, TGT, OP>>(writer, row_group, row_group.columns.size());
result->encoding = duckdb_parquet::Encoding::RLE_DICTIONARY;
RegisterToRowGroup(row_group);
return std::move(result);
}
unique_ptr<ColumnWriterPageState> InitializePageState(PrimitiveColumnWriterState &state_p,
idx_t page_idx) override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
const auto &page_info = state_p.page_info[page_idx];
auto result = make_uniq<StandardWriterPageState<SRC, TGT, OP>>(
page_info.row_count - (page_info.empty_count + page_info.null_count), state.total_string_size,
state.encoding, state.dictionary);
return std::move(result);
}
void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) override {
auto &page_state = state_p->Cast<StandardWriterPageState<SRC, TGT, OP>>();
switch (page_state.encoding) {
case duckdb_parquet::Encoding::DELTA_BINARY_PACKED:
if (!page_state.dbp_initialized) {
page_state.dbp_encoder.BeginWrite(temp_writer, 0);
}
page_state.dbp_encoder.FinishWrite(temp_writer);
break;
case duckdb_parquet::Encoding::RLE_DICTIONARY:
D_ASSERT(page_state.dict_bit_width != 0);
if (!page_state.dict_written_value) {
// all values are null
// just write the bit width
temp_writer.Write<uint8_t>(page_state.dict_bit_width);
return;
}
page_state.dict_encoder.FinishWrite(temp_writer);
break;
case duckdb_parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY:
if (!page_state.dlba_initialized) {
page_state.dlba_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()), temp_writer,
string_t(""));
}
page_state.dlba_encoder.FinishWrite(temp_writer);
break;
case duckdb_parquet::Encoding::BYTE_STREAM_SPLIT:
if (!page_state.bss_initialized) {
page_state.bss_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()));
}
page_state.bss_encoder.FinishWrite(temp_writer);
break;
case duckdb_parquet::Encoding::PLAIN:
break;
default:
throw InternalException("Unknown encoding");
}
}
duckdb_parquet::Encoding::type GetEncoding(PrimitiveColumnWriterState &state_p) override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
return state.encoding;
}
bool HasAnalyze() override {
return true;
}
void Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
auto data_ptr = FlatVector::GetData<SRC>(vector);
idx_t vector_index = 0;
const bool check_parent_empty = parent && !parent->is_empty.empty();
const idx_t parent_index = state.definition_levels.size();
D_ASSERT(!check_parent_empty || parent_index < parent->is_empty.size());
const idx_t vcount =
check_parent_empty ? parent->definition_levels.size() - state.definition_levels.size() : count;
const auto &validity = FlatVector::Validity(vector);
if (!check_parent_empty && validity.AllValid()) {
// Fast path
for (; vector_index < vcount; vector_index++) {
const auto &src_value = data_ptr[vector_index];
state.dictionary.Insert(src_value);
state.total_value_count++;
state.total_string_size += DlbaEncoder::GetStringSize(src_value);
}
} else {
for (idx_t i = 0; i < vcount; i++) {
if (check_parent_empty && parent->is_empty[parent_index + i]) {
continue;
}
if (validity.RowIsValid(vector_index)) {
const auto &src_value = data_ptr[vector_index];
state.dictionary.Insert(src_value);
state.total_value_count++;
state.total_string_size += DlbaEncoder::GetStringSize(src_value);
}
vector_index++;
}
}
}
void FinalizeAnalyze(ColumnWriterState &state_p) override {
const auto type = writer.GetType(SchemaIndex());
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
if (state.dictionary.GetSize() == 0 || state.dictionary.IsFull()) {
state.dictionary.Reset();
if (writer.GetParquetVersion() == ParquetVersion::V1) {
// Parquet V1 does not support these newer encodings
state.encoding = duckdb_parquet::Encoding::PLAIN;
} else {
// If we aren't doing dictionary encoding, these encodings are virtually always better than PLAIN
switch (type) {
case duckdb_parquet::Type::type::INT32:
case duckdb_parquet::Type::type::INT64:
state.encoding = duckdb_parquet::Encoding::DELTA_BINARY_PACKED;
break;
case duckdb_parquet::Type::type::BYTE_ARRAY:
state.encoding = duckdb_parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY;
break;
case duckdb_parquet::Type::type::FLOAT:
case duckdb_parquet::Type::type::DOUBLE:
state.encoding = duckdb_parquet::Encoding::BYTE_STREAM_SPLIT;
break;
default:
state.encoding = duckdb_parquet::Encoding::PLAIN;
}
}
} else {
state.key_bit_width = RleBpDecoder::ComputeBitWidth(state.dictionary.GetSize());
}
}
unique_ptr<ColumnWriterStatistics> InitializeStatsState() override {
return OP::template InitializeStats<SRC, TGT>();
}
bool HasDictionary(PrimitiveColumnWriterState &state_p) override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
return state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY;
}
idx_t DictionarySize(PrimitiveColumnWriterState &state_p) override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
return state.dictionary.GetSize();
}
void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats, ColumnWriterPageState *page_state_p,
Vector &input_column, idx_t chunk_start, idx_t chunk_end) override {
const auto &mask = FlatVector::Validity(input_column);
if (mask.AllValid()) {
WriteVectorInternal<true>(temp_writer, stats, page_state_p, input_column, chunk_start, chunk_end);
} else {
WriteVectorInternal<false>(temp_writer, stats, page_state_p, input_column, chunk_start, chunk_end);
}
}
void FlushDictionary(PrimitiveColumnWriterState &state_p, ColumnWriterStatistics *stats) override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY);
if (writer.EnableBloomFilters()) {
state.bloom_filter =
make_uniq<ParquetBloomFilter>(state.dictionary.GetSize(), writer.BloomFilterFalsePositiveRatio());
}
state.dictionary.IterateValues([&](const SRC &src_value, const TGT &tgt_value) {
// update the statistics
OP::template HandleStats<SRC, TGT>(stats, tgt_value);
if (state.bloom_filter) {
// update the bloom filter
auto hash = OP::template XXHash64<SRC, TGT>(tgt_value);
state.bloom_filter->FilterInsert(hash);
}
});
// flush the dictionary page and add it to the to-be-written pages
WriteDictionary(state, state.dictionary.GetTargetMemoryStream(), state.dictionary.GetSize());
// bloom filter will be queued for writing in ParquetWriter::BufferBloomFilter one level up
}
idx_t GetRowSize(const Vector &vector, const idx_t index,
const PrimitiveColumnWriterState &state_p) const override {
auto &state = state_p.Cast<StandardColumnWriterState<SRC, TGT, OP>>();
if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY) {
return (state.key_bit_width + 7) / 8;
} else {
return OP::template GetRowSize<SRC, TGT>(vector, index);
}
}
private:
template <bool ALL_VALID>
void WriteVectorInternal(WriteStream &temp_writer, ColumnWriterStatistics *stats,
ColumnWriterPageState *page_state_p, Vector &input_column, idx_t chunk_start,
idx_t chunk_end) {
auto &page_state = page_state_p->Cast<StandardWriterPageState<SRC, TGT, OP>>();
const auto &mask = FlatVector::Validity(input_column);
const auto *data_ptr = FlatVector::GetData<SRC>(input_column);
switch (page_state.encoding) {
case duckdb_parquet::Encoding::RLE_DICTIONARY: {
idx_t r = chunk_start;
if (!page_state.dict_written_value) {
// find first non-null value
for (; r < chunk_end; r++) {
if (!mask.RowIsValid(r)) {
continue;
}
// write the bit-width as a one-byte entry and initialize writer
temp_writer.Write<uint8_t>(page_state.dict_bit_width);
page_state.dict_encoder.BeginWrite();
page_state.dict_written_value = true;
break;
}
}
for (; r < chunk_end; r++) {
if (!ALL_VALID && !mask.RowIsValid(r)) {
continue;
}
const auto &src_value = data_ptr[r];
const auto value_index = page_state.dictionary.GetIndex(src_value);
page_state.dict_encoder.WriteValue(temp_writer, value_index);
}
break;
}
case duckdb_parquet::Encoding::DELTA_BINARY_PACKED: {
idx_t r = chunk_start;
if (!page_state.dbp_initialized) {
// find first non-null value
for (; r < chunk_end; r++) {
if (!mask.RowIsValid(r)) {
continue;
}
const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
OP::template HandleStats<SRC, TGT>(stats, target_value);
page_state.dbp_encoder.BeginWrite(temp_writer, target_value);
page_state.dbp_initialized = true;
r++; // skip over
break;
}
}
for (; r < chunk_end; r++) {
if (!ALL_VALID && !mask.RowIsValid(r)) {
continue;
}
const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
OP::template HandleStats<SRC, TGT>(stats, target_value);
page_state.dbp_encoder.WriteValue(temp_writer, target_value);
}
break;
}
case duckdb_parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY: {
idx_t r = chunk_start;
if (!page_state.dlba_initialized) {
// find first non-null value
for (; r < chunk_end; r++) {
if (!mask.RowIsValid(r)) {
continue;
}
const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
OP::template HandleStats<SRC, TGT>(stats, target_value);
page_state.dlba_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()), temp_writer,
target_value);
page_state.dlba_initialized = true;
r++; // skip over
break;
}
}
for (; r < chunk_end; r++) {
if (!ALL_VALID && !mask.RowIsValid(r)) {
continue;
}
const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
OP::template HandleStats<SRC, TGT>(stats, target_value);
page_state.dlba_encoder.WriteValue(temp_writer, target_value);
}
break;
}
case duckdb_parquet::Encoding::BYTE_STREAM_SPLIT: {
if (!page_state.bss_initialized) {
page_state.bss_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()));
page_state.bss_initialized = true;
}
for (idx_t r = chunk_start; r < chunk_end; r++) {
if (!ALL_VALID && !mask.RowIsValid(r)) {
continue;
}
const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
OP::template HandleStats<SRC, TGT>(stats, target_value);
page_state.bss_encoder.WriteValue(target_value);
}
break;
}
case duckdb_parquet::Encoding::PLAIN: {
D_ASSERT(page_state.encoding == duckdb_parquet::Encoding::PLAIN);
if (mask.AllValid()) {
TemplatedWritePlain<SRC, TGT, OP, true>(input_column, stats, chunk_start, chunk_end, mask, temp_writer);
} else {
TemplatedWritePlain<SRC, TGT, OP, false>(input_column, stats, chunk_start, chunk_end, mask,
temp_writer);
}
break;
}
default:
throw InternalException("Unknown encoding");
}
}
};
} // namespace duckdb
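
The heart of FinalizeAnalyze is the fallback choice when dictionary encoding is off the table (an empty or overflowing dictionary): V1 files stay on PLAIN, while V2 files pick a per-physical-type encoding. The same decision, restated as a standalone function for clarity (local enums stand in for the duckdb_parquet ones):

// Editor's recap of the encoding selection in FinalizeAnalyze.
enum class Encoding { PLAIN, RLE_DICTIONARY, DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY, BYTE_STREAM_SPLIT };
enum class PhysicalType { INT32, INT64, FLOAT, DOUBLE, BYTE_ARRAY, OTHER };

Encoding ChooseEncoding(bool dictionary_usable, bool is_v1, PhysicalType type) {
	if (dictionary_usable) {
		return Encoding::RLE_DICTIONARY;
	}
	if (is_v1) {
		return Encoding::PLAIN;
	}
	switch (type) {
	case PhysicalType::INT32:
	case PhysicalType::INT64:
		return Encoding::DELTA_BINARY_PACKED;
	case PhysicalType::BYTE_ARRAY:
		return Encoding::DELTA_LENGTH_BYTE_ARRAY;
	case PhysicalType::FLOAT:
	case PhysicalType::DOUBLE:
		return Encoding::BYTE_STREAM_SPLIT;
	default:
		return Encoding::PLAIN;
	}
}

int main() {
	return ChooseEncoding(false, false, PhysicalType::INT64) == Encoding::DELTA_BINARY_PACKED ? 0 : 1;
}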


@@ -0,0 +1,30 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// writer/variant_column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "struct_column_writer.hpp"
#include "duckdb/planner/expression/bound_function_expression.hpp"
namespace duckdb {
class VariantColumnWriter : public StructColumnWriter {
public:
VariantColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path_p,
vector<unique_ptr<ColumnWriter>> child_writers_p, bool can_have_nulls)
: StructColumnWriter(writer, column_schema, std::move(schema_path_p), std::move(child_writers_p),
can_have_nulls) {
}
~VariantColumnWriter() override = default;
public:
static ScalarFunction GetTransformFunction();
static LogicalType TransformTypedValueRecursive(const LogicalType &type);
};
} // namespace duckdb