Commit: should be it
external/duckdb/extension/parquet/writer/CMakeLists.txt (vendored, new file, 16 lines)
@@ -0,0 +1,16 @@
add_library_unity(
  duckdb_parquet_writers
  OBJECT
  array_column_writer.cpp
  boolean_column_writer.cpp
  decimal_column_writer.cpp
  enum_column_writer.cpp
  list_column_writer.cpp
  primitive_column_writer.cpp
  struct_column_writer.cpp)

add_subdirectory(variant)

set(PARQUET_EXTENSION_FILES
    ${PARQUET_EXTENSION_FILES} $<TARGET_OBJECTS:duckdb_parquet_writers>
    PARENT_SCOPE)
external/duckdb/extension/parquet/writer/array_column_writer.cpp (vendored, new file, 75 lines)
@@ -0,0 +1,75 @@
#include "writer/array_column_writer.hpp"

namespace duckdb {

void ArrayColumnWriter::Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
    auto &state = state_p.Cast<ListColumnWriterState>();
    auto &array_child = ArrayVector::GetEntry(vector);
    auto array_size = ArrayType::GetSize(vector.GetType());
    GetChildWriter().Analyze(*state.child_state, &state_p, array_child, array_size * count);
}

void ArrayColumnWriter::WriteArrayState(ListColumnWriterState &state, idx_t array_size, uint16_t first_repeat_level,
                                        idx_t define_value, const bool is_empty) {
    state.definition_levels.push_back(define_value);
    state.repetition_levels.push_back(first_repeat_level);
    state.is_empty.push_back(is_empty);

    if (is_empty) {
        return;
    }
    for (idx_t k = 1; k < array_size; k++) {
        state.repetition_levels.push_back(MaxRepeat() + 1);
        state.definition_levels.push_back(define_value);
        state.is_empty.push_back(false);
    }
}

void ArrayColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count,
                                bool vector_can_span_multiple_pages) {
    auto &state = state_p.Cast<ListColumnWriterState>();

    auto array_size = ArrayType::GetSize(vector.GetType());
    auto &validity = FlatVector::Validity(vector);

    // write definition levels and repeats
    // the main difference between this and ListColumnWriter::Prepare is that we need to make sure to write out
    // repetition levels and definitions for the child elements of the array even if the array itself is NULL.
    idx_t vcount = parent ? parent->definition_levels.size() - state.parent_index : count;
    idx_t vector_index = 0;
    for (idx_t i = 0; i < vcount; i++) {
        idx_t parent_index = state.parent_index + i;
        if (parent && !parent->is_empty.empty() && parent->is_empty[parent_index]) {
            WriteArrayState(state, array_size, parent->repetition_levels[parent_index],
                            parent->definition_levels[parent_index], true);
            continue;
        }
        auto first_repeat_level =
            parent && !parent->repetition_levels.empty() ? parent->repetition_levels[parent_index] : MaxRepeat();
        if (parent && parent->definition_levels[parent_index] != PARQUET_DEFINE_VALID) {
            WriteArrayState(state, array_size, first_repeat_level, parent->definition_levels[parent_index]);
        } else if (validity.RowIsValid(vector_index)) {
            // push the repetition levels
            WriteArrayState(state, array_size, first_repeat_level, PARQUET_DEFINE_VALID);
        } else {
            //! Produce a null
            WriteArrayState(state, array_size, first_repeat_level, MaxDefine() - 1);
        }
        vector_index++;
    }
    state.parent_index += vcount;

    auto &array_child = ArrayVector::GetEntry(vector);
    // The elements of a single array should not span multiple Parquet pages
    // So, we force the entire vector to fit on a single page by setting "vector_can_span_multiple_pages=false"
    GetChildWriter().Prepare(*state.child_state, &state_p, array_child, count * array_size, false);
}

void ArrayColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
    auto &state = state_p.Cast<ListColumnWriterState>();
    auto array_size = ArrayType::GetSize(vector.GetType());
    auto &array_child = ArrayVector::GetEntry(vector);
    GetChildWriter().Write(*state.child_state, array_child, count * array_size);
}

} // namespace duckdb
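
The key point above is that a fixed-size ARRAY contributes exactly array_size repetition/definition level pairs per row, even when the array itself is NULL: the first pair carries the parent's repetition level, the remaining pairs use the child repetition level, and a NULL array lowers the definition level for all of its slots. The following is a minimal standalone sketch of that level pattern, assuming a top-level array column and illustrative MAX_DEFINE/MAX_REPEAT constants; it is simplified in that the real writer records the PARQUET_DEFINE_VALID sentinel here and lets the leaf writer resolve the final definition level.

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative constants; in the real writer MaxDefine()/MaxRepeat() depend on nesting depth.
static constexpr uint16_t MAX_DEFINE = 2; // fully-defined leaf value
static constexpr uint16_t MAX_REPEAT = 1; // repetition level of the array's elements

// Emit levels for fixed-size arrays: each row produces array_size (rep, def) pairs.
// The first element of a row starts a new record (rep 0); the rest continue the
// same array (rep MAX_REPEAT). A NULL array still emits pairs for all of its slots,
// just with a lowered definition level, mirroring WriteArrayState above.
static void EmitArrayLevels(const std::vector<bool> &row_is_valid, uint16_t array_size,
                            std::vector<uint16_t> &rep, std::vector<uint16_t> &def) {
    for (bool valid : row_is_valid) {
        uint16_t define_value = valid ? MAX_DEFINE : MAX_DEFINE - 1;
        rep.push_back(0);
        def.push_back(define_value);
        for (uint16_t k = 1; k < array_size; k++) {
            rep.push_back(MAX_REPEAT);
            def.push_back(define_value);
        }
    }
}

int main() {
    std::vector<uint16_t> rep, def;
    EmitArrayLevels({true, false, true}, /*array_size=*/3, rep, def);
    for (size_t i = 0; i < rep.size(); i++) {
        std::cout << "rep=" << rep[i] << " def=" << def[i] << "\n";
    }
    return 0;
}

For rows {valid, NULL, valid} with array_size 3, this sketch yields repetition levels 0,1,1,0,1,1,0,1,1 and definition levels 2,2,2,1,1,1,2,2,2.
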
external/duckdb/extension/parquet/writer/boolean_column_writer.cpp (vendored, new file, 105 lines)
@@ -0,0 +1,105 @@
#include "writer/boolean_column_writer.hpp"

namespace duckdb {

class BooleanStatisticsState : public ColumnWriterStatistics {
public:
    BooleanStatisticsState() : min(true), max(false) {
    }

    bool min;
    bool max;

public:
    bool HasStats() override {
        return !(min && !max);
    }

    string GetMin() override {
        return GetMinValue();
    }
    string GetMax() override {
        return GetMaxValue();
    }
    string GetMinValue() override {
        return HasStats() ? string(const_char_ptr_cast(&min), sizeof(bool)) : string();
    }
    string GetMaxValue() override {
        return HasStats() ? string(const_char_ptr_cast(&max), sizeof(bool)) : string();
    }
};

class BooleanWriterPageState : public ColumnWriterPageState {
public:
    uint8_t byte = 0;
    uint8_t byte_pos = 0;
};

BooleanColumnWriter::BooleanColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
                                         vector<string> schema_path_p, bool can_have_nulls)
    : PrimitiveColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
}

unique_ptr<ColumnWriterStatistics> BooleanColumnWriter::InitializeStatsState() {
    return make_uniq<BooleanStatisticsState>();
}

void BooleanColumnWriter::WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p,
                                      ColumnWriterPageState *state_p, Vector &input_column, idx_t chunk_start,
                                      idx_t chunk_end) {
    auto &stats = stats_p->Cast<BooleanStatisticsState>();
    auto &state = state_p->Cast<BooleanWriterPageState>();
    const auto &mask = FlatVector::Validity(input_column);

    const auto *const ptr = FlatVector::GetData<bool>(input_column);
    if (stats.max && !stats.min && mask.AllValid()) {
        // Fast path: stats have already been set, and there's no NULLs
        for (idx_t r = chunk_start; r < chunk_end; r++) {
            const auto &val = ptr[r];
            state.byte |= val << state.byte_pos;
            if (++state.byte_pos == 8) {
                temp_writer.Write(state.byte);
                state.byte = 0;
                state.byte_pos = 0;
            }
        }
    } else {
        for (idx_t r = chunk_start; r < chunk_end; r++) {
            if (!mask.RowIsValid(r)) {
                continue;
            }
            const auto &val = ptr[r];

            stats.max |= val;
            stats.min &= val;
            state.byte |= val << state.byte_pos;

            if (++state.byte_pos == 8) {
                temp_writer.Write(state.byte);
                state.byte = 0;
                state.byte_pos = 0;
            }
        }
    }
}

unique_ptr<ColumnWriterPageState> BooleanColumnWriter::InitializePageState(PrimitiveColumnWriterState &state,
                                                                           idx_t page_idx) {
    return make_uniq<BooleanWriterPageState>();
}

void BooleanColumnWriter::FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) {
    auto &state = state_p->Cast<BooleanWriterPageState>();
    if (state.byte_pos > 0) {
        temp_writer.Write<uint8_t>(state.byte);
        state.byte = 0;
        state.byte_pos = 0;
    }
}

idx_t BooleanColumnWriter::GetRowSize(const Vector &vector, const idx_t index,
                                      const PrimitiveColumnWriterState &state) const {
    return sizeof(bool);
}

} // namespace duckdb
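
BooleanColumnWriter packs one value per bit, LSB-first, buffering a single byte in the page state and emitting it whenever eight bits have accumulated; FlushPageState emits any trailing partial byte. A standalone sketch of the same packing, independent of the DuckDB classes:

#include <cstdint>
#include <vector>

// Pack booleans LSB-first into bytes, flushing a byte once 8 bits accumulate.
// Any partially filled byte is flushed at the end, which mirrors the
// byte/byte_pos bookkeeping of the page state above.
static std::vector<uint8_t> PackBools(const std::vector<bool> &values) {
    std::vector<uint8_t> out;
    uint8_t byte = 0;
    uint8_t byte_pos = 0;
    for (bool v : values) {
        byte |= static_cast<uint8_t>(v) << byte_pos;
        if (++byte_pos == 8) {
            out.push_back(byte);
            byte = 0;
            byte_pos = 0;
        }
    }
    if (byte_pos > 0) { // FlushPageState equivalent
        out.push_back(byte);
    }
    return out;
}

For example, PackBools({true, false, true, true}) returns a single byte 0x0D, which is how Parquet's PLAIN encoding represents BOOLEAN values.
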
100
external/duckdb/extension/parquet/writer/decimal_column_writer.cpp
vendored
Normal file
100
external/duckdb/extension/parquet/writer/decimal_column_writer.cpp
vendored
Normal file
@@ -0,0 +1,100 @@
#include "writer/decimal_column_writer.hpp"

namespace duckdb {

static void WriteParquetDecimal(hugeint_t input, data_ptr_t result) {
    bool positive = input >= 0;
    // numbers are stored as two's complement so some muckery is required
    if (!positive) {
        input = NumericLimits<hugeint_t>::Maximum() + input + 1;
    }
    uint64_t high_bytes = uint64_t(input.upper);
    uint64_t low_bytes = input.lower;

    for (idx_t i = 0; i < sizeof(uint64_t); i++) {
        auto shift_count = (sizeof(uint64_t) - i - 1) * 8;
        result[i] = (high_bytes >> shift_count) & 0xFF;
    }
    for (idx_t i = 0; i < sizeof(uint64_t); i++) {
        auto shift_count = (sizeof(uint64_t) - i - 1) * 8;
        result[sizeof(uint64_t) + i] = (low_bytes >> shift_count) & 0xFF;
    }
    if (!positive) {
        result[0] |= 0x80;
    }
}

class FixedDecimalStatistics : public ColumnWriterStatistics {
public:
    FixedDecimalStatistics() : min(NumericLimits<hugeint_t>::Maximum()), max(NumericLimits<hugeint_t>::Minimum()) {
    }

    hugeint_t min;
    hugeint_t max;

public:
    string GetStats(hugeint_t &input) {
        data_t buffer[16];
        WriteParquetDecimal(input, buffer);
        return string(const_char_ptr_cast(buffer), 16);
    }

    bool HasStats() override {
        return min <= max;
    }

    void Update(hugeint_t &val) {
        if (LessThan::Operation(val, min)) {
            min = val;
        }
        if (GreaterThan::Operation(val, max)) {
            max = val;
        }
    }

    string GetMin() override {
        return GetMinValue();
    }
    string GetMax() override {
        return GetMaxValue();
    }
    string GetMinValue() override {
        return HasStats() ? GetStats(min) : string();
    }
    string GetMaxValue() override {
        return HasStats() ? GetStats(max) : string();
    }
};

FixedDecimalColumnWriter::FixedDecimalColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
                                                   vector<string> schema_path_p, bool can_have_nulls)
    : PrimitiveColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
}

unique_ptr<ColumnWriterStatistics> FixedDecimalColumnWriter::InitializeStatsState() {
    return make_uniq<FixedDecimalStatistics>();
}

void FixedDecimalColumnWriter::WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p,
                                           ColumnWriterPageState *page_state, Vector &input_column, idx_t chunk_start,
                                           idx_t chunk_end) {
    auto &mask = FlatVector::Validity(input_column);
    auto *ptr = FlatVector::GetData<hugeint_t>(input_column);
    auto &stats = stats_p->Cast<FixedDecimalStatistics>();

    data_t temp_buffer[16];
    for (idx_t r = chunk_start; r < chunk_end; r++) {
        if (mask.RowIsValid(r)) {
            stats.Update(ptr[r]);
            WriteParquetDecimal(ptr[r], temp_buffer);
            temp_writer.WriteData(temp_buffer, 16);
        }
    }
}

idx_t FixedDecimalColumnWriter::GetRowSize(const Vector &vector, const idx_t index,
                                           const PrimitiveColumnWriterState &state) const {
    return sizeof(hugeint_t);
}

} // namespace duckdb
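
WriteParquetDecimal stores a hugeint_t as 16 big-endian bytes whose bit pattern is effectively the value's two's complement, the layout Parquet expects for DECIMAL values in a FIXED_LEN_BYTE_ARRAY. A self-contained sketch of the same encoding, using the compiler's __int128 (a GCC/Clang extension) in place of DuckDB's hugeint_t:

#include <cstdint>
#include <cstdio>

// Encode a signed 128-bit value as 16 big-endian two's-complement bytes.
// Converting to the unsigned type yields the two's-complement bit pattern,
// which we then emit most-significant byte first.
static void EncodeDecimal16(__int128 value, uint8_t out[16]) {
    auto bits = static_cast<unsigned __int128>(value);
    for (int i = 15; i >= 0; i--) {
        out[i] = static_cast<uint8_t>(bits & 0xFF); // least significant byte goes last
        bits >>= 8;
    }
}

int main() {
    uint8_t buf[16];
    EncodeDecimal16(-1, buf); // -1 encodes as sixteen 0xFF bytes
    for (int i = 0; i < 16; i++) {
        printf("%02x", buf[i]);
    }
    printf("\n");
    return 0;
}
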
external/duckdb/extension/parquet/writer/enum_column_writer.cpp (vendored, new file, 119 lines)
@@ -0,0 +1,119 @@
#include "writer/enum_column_writer.hpp"
#include "parquet_rle_bp_decoder.hpp"
#include "parquet_rle_bp_encoder.hpp"
#include "parquet_writer.hpp"
#include "duckdb/common/serializer/memory_stream.hpp"

namespace duckdb {
using duckdb_parquet::Encoding;

class EnumWriterPageState : public ColumnWriterPageState {
public:
    explicit EnumWriterPageState(uint32_t bit_width) : encoder(bit_width), written_value(false) {
    }

    RleBpEncoder encoder;
    bool written_value;
};

EnumColumnWriter::EnumColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
                                   vector<string> schema_path_p, bool can_have_nulls)
    : PrimitiveColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
    bit_width = RleBpDecoder::ComputeBitWidth(EnumType::GetSize(Type()));
}

unique_ptr<ColumnWriterStatistics> EnumColumnWriter::InitializeStatsState() {
    return make_uniq<StringStatisticsState>();
}

template <class T>
void EnumColumnWriter::WriteEnumInternal(WriteStream &temp_writer, Vector &input_column, idx_t chunk_start,
                                         idx_t chunk_end, EnumWriterPageState &page_state) {
    auto &mask = FlatVector::Validity(input_column);
    auto *ptr = FlatVector::GetData<T>(input_column);
    for (idx_t r = chunk_start; r < chunk_end; r++) {
        if (mask.RowIsValid(r)) {
            if (!page_state.written_value) {
                // first value: write the bit-width as a one-byte entry and initialize writer
                temp_writer.Write<uint8_t>(bit_width);
                page_state.encoder.BeginWrite();
                page_state.written_value = true;
            }
            page_state.encoder.WriteValue(temp_writer, ptr[r]);
        }
    }
}

void EnumColumnWriter::WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p,
                                   ColumnWriterPageState *page_state_p, Vector &input_column, idx_t chunk_start,
                                   idx_t chunk_end) {
    auto &page_state = page_state_p->Cast<EnumWriterPageState>();
    switch (Type().InternalType()) {
    case PhysicalType::UINT8:
        WriteEnumInternal<uint8_t>(temp_writer, input_column, chunk_start, chunk_end, page_state);
        break;
    case PhysicalType::UINT16:
        WriteEnumInternal<uint16_t>(temp_writer, input_column, chunk_start, chunk_end, page_state);
        break;
    case PhysicalType::UINT32:
        WriteEnumInternal<uint32_t>(temp_writer, input_column, chunk_start, chunk_end, page_state);
        break;
    default:
        throw InternalException("Unsupported internal enum type");
    }
}

unique_ptr<ColumnWriterPageState> EnumColumnWriter::InitializePageState(PrimitiveColumnWriterState &state,
                                                                        idx_t page_idx) {
    return make_uniq<EnumWriterPageState>(bit_width);
}

void EnumColumnWriter::FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) {
    auto &page_state = state_p->Cast<EnumWriterPageState>();
    if (!page_state.written_value) {
        // all values are null
        // just write the bit width
        temp_writer.Write<uint8_t>(bit_width);
        return;
    }
    page_state.encoder.FinishWrite(temp_writer);
}

duckdb_parquet::Encoding::type EnumColumnWriter::GetEncoding(PrimitiveColumnWriterState &state) {
    return Encoding::RLE_DICTIONARY;
}

bool EnumColumnWriter::HasDictionary(PrimitiveColumnWriterState &state) {
    return true;
}

idx_t EnumColumnWriter::DictionarySize(PrimitiveColumnWriterState &state_p) {
    return EnumType::GetSize(Type());
}

void EnumColumnWriter::FlushDictionary(PrimitiveColumnWriterState &state, ColumnWriterStatistics *stats_p) {
    auto &stats = stats_p->Cast<StringStatisticsState>();
    // write the enum values to a dictionary page
    auto &enum_values = EnumType::GetValuesInsertOrder(Type());
    auto enum_count = EnumType::GetSize(Type());
    auto string_values = FlatVector::GetData<string_t>(enum_values);
    // first write the contents of the dictionary page to a temporary buffer
    auto temp_writer = make_uniq<MemoryStream>(BufferAllocator::Get(writer.GetContext()));
    for (idx_t r = 0; r < enum_count; r++) {
        D_ASSERT(!FlatVector::IsNull(enum_values, r));
        // update the statistics
        stats.Update(string_values[r]);
        // write this string value to the dictionary
        temp_writer->Write<uint32_t>(string_values[r].GetSize());
        temp_writer->WriteData(const_data_ptr_cast(string_values[r].GetData()), string_values[r].GetSize());
    }
    // flush the dictionary page and add it to the to-be-written pages
    WriteDictionary(state, std::move(temp_writer), enum_count);
}

idx_t EnumColumnWriter::GetRowSize(const Vector &vector, const idx_t index,
                                   const PrimitiveColumnWriterState &state) const {
    return (bit_width + 7) / 8;
}

} // namespace duckdb
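
The enum writer's data pages hold dictionary indices encoded with RLE/bit-packing (RLE_DICTIONARY), so the constructor computes the index width once from the enum cardinality, and the dictionary page itself stores the enum strings as PLAIN length-prefixed byte arrays (FlushDictionary above). A small sketch of one plausible bit-width computation; the exact convention of RleBpDecoder::ComputeBitWidth may differ in detail:

#include <cstdint>

// Number of bits needed to represent `value`: an enum with N entries needs
// roughly this many bits per dictionary index.
static uint32_t ComputeBitWidthSketch(uint64_t value) {
    uint32_t width = 0;
    while (value > 0) {
        width++;
        value >>= 1;
    }
    return width;
}
// e.g. an enum with 5 values gives ComputeBitWidthSketch(5) == 3 bits per index.
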
external/duckdb/extension/parquet/writer/list_column_writer.cpp (vendored, new file, 144 lines)
@@ -0,0 +1,144 @@
#include "writer/list_column_writer.hpp"

namespace duckdb {

unique_ptr<ColumnWriterState> ListColumnWriter::InitializeWriteState(duckdb_parquet::RowGroup &row_group) {
    auto result = make_uniq<ListColumnWriterState>(row_group, row_group.columns.size());
    result->child_state = GetChildWriter().InitializeWriteState(row_group);
    return std::move(result);
}

bool ListColumnWriter::HasAnalyze() {
    return GetChildWriter().HasAnalyze();
}
void ListColumnWriter::Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
    auto &state = state_p.Cast<ListColumnWriterState>();
    auto &list_child = ListVector::GetEntry(vector);
    auto list_count = ListVector::GetListSize(vector);
    GetChildWriter().Analyze(*state.child_state, &state_p, list_child, list_count);
}

void ListColumnWriter::FinalizeAnalyze(ColumnWriterState &state_p) {
    auto &state = state_p.Cast<ListColumnWriterState>();
    GetChildWriter().FinalizeAnalyze(*state.child_state);
}

static idx_t GetConsecutiveChildList(Vector &list, Vector &result, idx_t offset, idx_t count) {
    // returns a consecutive child list that fully flattens and repeats all required elements
    auto &validity = FlatVector::Validity(list);
    auto list_entries = FlatVector::GetData<list_entry_t>(list);
    bool is_consecutive = true;
    idx_t total_length = 0;
    for (idx_t c = offset; c < offset + count; c++) {
        if (!validity.RowIsValid(c)) {
            continue;
        }
        if (list_entries[c].offset != total_length) {
            is_consecutive = false;
        }
        total_length += list_entries[c].length;
    }
    if (is_consecutive) {
        // already consecutive - leave it as-is
        return total_length;
    }
    SelectionVector sel(total_length);
    idx_t index = 0;
    for (idx_t c = offset; c < offset + count; c++) {
        if (!validity.RowIsValid(c)) {
            continue;
        }
        for (idx_t k = 0; k < list_entries[c].length; k++) {
            sel.set_index(index++, list_entries[c].offset + k);
        }
    }
    result.Slice(sel, total_length);
    result.Flatten(total_length);
    return total_length;
}

void ListColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count,
                               bool vector_can_span_multiple_pages) {
    auto &state = state_p.Cast<ListColumnWriterState>();

    auto list_data = FlatVector::GetData<list_entry_t>(vector);
    auto &validity = FlatVector::Validity(vector);

    // write definition levels and repeats
    idx_t start = 0;
    idx_t vcount = parent ? parent->definition_levels.size() - state.parent_index : count;
    idx_t vector_index = 0;
    for (idx_t i = start; i < vcount; i++) {
        idx_t parent_index = state.parent_index + i;
        if (parent && !parent->is_empty.empty() && parent->is_empty[parent_index]) {
            state.definition_levels.push_back(parent->definition_levels[parent_index]);
            state.repetition_levels.push_back(parent->repetition_levels[parent_index]);
            state.is_empty.push_back(true);
            continue;
        }
        auto first_repeat_level =
            parent && !parent->repetition_levels.empty() ? parent->repetition_levels[parent_index] : MaxRepeat();
        if (parent && parent->definition_levels[parent_index] != PARQUET_DEFINE_VALID) {
            state.definition_levels.push_back(parent->definition_levels[parent_index]);
            state.repetition_levels.push_back(first_repeat_level);
            state.is_empty.push_back(true);
        } else if (validity.RowIsValid(vector_index)) {
            // push the repetition levels
            if (list_data[vector_index].length == 0) {
                state.definition_levels.push_back(MaxDefine());
                state.is_empty.push_back(true);
            } else {
                state.definition_levels.push_back(PARQUET_DEFINE_VALID);
                state.is_empty.push_back(false);
            }
            state.repetition_levels.push_back(first_repeat_level);
            for (idx_t k = 1; k < list_data[vector_index].length; k++) {
                state.repetition_levels.push_back(MaxRepeat() + 1);
                state.definition_levels.push_back(PARQUET_DEFINE_VALID);
                state.is_empty.push_back(false);
            }
        } else {
            if (!can_have_nulls) {
                throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
            }
            state.definition_levels.push_back(MaxDefine() - 1);
            state.repetition_levels.push_back(first_repeat_level);
            state.is_empty.push_back(true);
        }
        vector_index++;
    }
    state.parent_index += vcount;

    auto &list_child = ListVector::GetEntry(vector);
    Vector child_list(list_child);
    auto child_length = GetConsecutiveChildList(vector, child_list, 0, count);
    // The elements of a single list should not span multiple Parquet pages
    // So, we force the entire vector to fit on a single page by setting "vector_can_span_multiple_pages=false"
    GetChildWriter().Prepare(*state.child_state, &state_p, child_list, child_length, false);
}

void ListColumnWriter::BeginWrite(ColumnWriterState &state_p) {
    auto &state = state_p.Cast<ListColumnWriterState>();
    GetChildWriter().BeginWrite(*state.child_state);
}

void ListColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
    auto &state = state_p.Cast<ListColumnWriterState>();

    auto &list_child = ListVector::GetEntry(vector);
    Vector child_list(list_child);
    auto child_length = GetConsecutiveChildList(vector, child_list, 0, count);
    GetChildWriter().Write(*state.child_state, child_list, child_length);
}

void ListColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
    auto &state = state_p.Cast<ListColumnWriterState>();
    GetChildWriter().FinalizeWrite(*state.child_state);
}

ColumnWriter &ListColumnWriter::GetChildWriter() {
    D_ASSERT(child_writers.size() == 1);
    return *child_writers[0];
}

} // namespace duckdb
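
GetConsecutiveChildList handles the case where the list entries of a vector do not reference their child elements back-to-back (for example after filtering or reordering): it builds a selection vector that repeats and reorders child rows so the child writer sees one consecutive run. A sketch of the index order it establishes, using plain vectors instead of DuckDB's Vector/SelectionVector:

#include <cstdint>
#include <vector>

struct ListEntry {
    uint64_t offset;
    uint64_t length;
};

// Concatenate each valid list's elements back-to-back, repeating or reordering
// child indices when the source lists are not already consecutive. NULL lists
// contribute no child elements.
static std::vector<uint64_t> ConsecutiveChildIndices(const std::vector<ListEntry> &lists,
                                                     const std::vector<bool> &valid) {
    std::vector<uint64_t> indices;
    for (size_t i = 0; i < lists.size(); i++) {
        if (!valid[i]) {
            continue;
        }
        for (uint64_t k = 0; k < lists[i].length; k++) {
            indices.push_back(lists[i].offset + k);
        }
    }
    return indices;
}
// e.g. lists {offset=4,len=2} and {offset=0,len=3} yield child order 4,5,0,1,2.
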
external/duckdb/extension/parquet/writer/primitive_column_writer.cpp (vendored, new file, 435 lines)
@@ -0,0 +1,435 @@
#include "writer/primitive_column_writer.hpp"
#include "parquet_rle_bp_decoder.hpp"
#include "parquet_rle_bp_encoder.hpp"
#include "parquet_writer.hpp"

namespace duckdb {
using duckdb_parquet::Encoding;
using duckdb_parquet::PageType;

constexpr const idx_t PrimitiveColumnWriter::MAX_UNCOMPRESSED_PAGE_SIZE;
constexpr const idx_t PrimitiveColumnWriter::MAX_UNCOMPRESSED_DICT_PAGE_SIZE;

PrimitiveColumnWriter::PrimitiveColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
                                             vector<string> schema_path, bool can_have_nulls)
    : ColumnWriter(writer, column_schema, std::move(schema_path), can_have_nulls) {
}

unique_ptr<ColumnWriterState> PrimitiveColumnWriter::InitializeWriteState(duckdb_parquet::RowGroup &row_group) {
    auto result = make_uniq<PrimitiveColumnWriterState>(writer, row_group, row_group.columns.size());
    RegisterToRowGroup(row_group);
    return std::move(result);
}

void PrimitiveColumnWriter::RegisterToRowGroup(duckdb_parquet::RowGroup &row_group) {
    duckdb_parquet::ColumnChunk column_chunk;
    column_chunk.__isset.meta_data = true;
    column_chunk.meta_data.codec = writer.GetCodec();
    column_chunk.meta_data.path_in_schema = schema_path;
    column_chunk.meta_data.num_values = 0;
    column_chunk.meta_data.type = writer.GetType(SchemaIndex());
    row_group.columns.push_back(std::move(column_chunk));
}

unique_ptr<ColumnWriterPageState> PrimitiveColumnWriter::InitializePageState(PrimitiveColumnWriterState &state,
                                                                             idx_t page_idx) {
    return nullptr;
}

void PrimitiveColumnWriter::FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state) {
}

void PrimitiveColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count,
                                    bool vector_can_span_multiple_pages) {
    auto &state = state_p.Cast<PrimitiveColumnWriterState>();
    auto &col_chunk = state.row_group.columns[state.col_idx];

    idx_t vcount = parent ? parent->definition_levels.size() - state.definition_levels.size() : count;
    idx_t parent_index = state.definition_levels.size();
    auto &validity = FlatVector::Validity(vector);
    HandleRepeatLevels(state, parent, count);
    HandleDefineLevels(state, parent, validity, count, MaxDefine(), MaxDefine() - 1);

    idx_t vector_index = 0;
    reference<PageInformation> page_info_ref = state.page_info.back();
    col_chunk.meta_data.num_values += NumericCast<int64_t>(vcount);

    const bool check_parent_empty = parent && !parent->is_empty.empty();
    if (!check_parent_empty && validity.AllValid() && TypeIsConstantSize(vector.GetType().InternalType()) &&
        page_info_ref.get().estimated_page_size + GetRowSize(vector, vector_index, state) * vcount <
            MAX_UNCOMPRESSED_PAGE_SIZE) {
        // Fast path: fixed-size type, all valid, and it fits on the current page
        auto &page_info = page_info_ref.get();
        page_info.row_count += vcount;
        page_info.estimated_page_size += GetRowSize(vector, vector_index, state) * vcount;
    } else {
        for (idx_t i = 0; i < vcount; i++) {
            auto &page_info = page_info_ref.get();
            page_info.row_count++;
            if (check_parent_empty && parent->is_empty[parent_index + i]) {
                page_info.empty_count++;
                continue;
            }
            if (validity.RowIsValid(vector_index)) {
                page_info.estimated_page_size += GetRowSize(vector, vector_index, state);
                if (page_info.estimated_page_size >= MAX_UNCOMPRESSED_PAGE_SIZE) {
                    if (!vector_can_span_multiple_pages && i != 0) {
                        // Vector is not allowed to span multiple pages, and we already started writing it
                        continue;
                    }
                    PageInformation new_info;
                    new_info.offset = page_info.offset + page_info.row_count;
                    state.page_info.push_back(new_info);
                    page_info_ref = state.page_info.back();
                }
            } else {
                page_info.null_count++;
            }
            vector_index++;
        }
    }
}

duckdb_parquet::Encoding::type PrimitiveColumnWriter::GetEncoding(PrimitiveColumnWriterState &state) {
    return Encoding::PLAIN;
}

void PrimitiveColumnWriter::BeginWrite(ColumnWriterState &state_p) {
    auto &state = state_p.Cast<PrimitiveColumnWriterState>();

    // set up the page write info
    state.stats_state = InitializeStatsState();
    for (idx_t page_idx = 0; page_idx < state.page_info.size(); page_idx++) {
        auto &page_info = state.page_info[page_idx];
        if (page_info.row_count == 0) {
            D_ASSERT(page_idx + 1 == state.page_info.size());
            state.page_info.erase_at(page_idx);
            break;
        }
        PageWriteInformation write_info;
        // set up the header
        auto &hdr = write_info.page_header;
        hdr.compressed_page_size = 0;
        hdr.uncompressed_page_size = 0;
        hdr.type = PageType::DATA_PAGE;
        hdr.__isset.data_page_header = true;

        hdr.data_page_header.num_values = NumericCast<int32_t>(page_info.row_count);
        hdr.data_page_header.encoding = GetEncoding(state);
        hdr.data_page_header.definition_level_encoding = Encoding::RLE;
        hdr.data_page_header.repetition_level_encoding = Encoding::RLE;

        write_info.temp_writer = make_uniq<MemoryStream>(
            BufferAllocator::Get(writer.GetContext()),
            MaxValue<idx_t>(NextPowerOfTwo(page_info.estimated_page_size), MemoryStream::DEFAULT_INITIAL_CAPACITY));
        write_info.write_count = page_info.empty_count;
        write_info.max_write_count = page_info.row_count;
        write_info.page_state = InitializePageState(state, page_idx);

        write_info.compressed_size = 0;
        write_info.compressed_data = nullptr;

        state.write_info.push_back(std::move(write_info));
    }

    // start writing the first page
    NextPage(state);
}

void PrimitiveColumnWriter::WriteLevels(Allocator &allocator, WriteStream &temp_writer,
                                        const unsafe_vector<uint16_t> &levels, idx_t max_value, idx_t offset,
                                        idx_t count, optional_idx null_count) {
    if (levels.empty() || count == 0) {
        return;
    }

    // write the levels using the RLE-BP encoding
    const auto bit_width = RleBpDecoder::ComputeBitWidth((max_value));
    RleBpEncoder rle_encoder(bit_width);

    // have to write to an intermediate stream first because we need to know the size
    MemoryStream intermediate_stream(allocator);

    rle_encoder.BeginWrite();
    if (null_count.IsValid() && null_count.GetIndex() == 0) {
        // Fast path: no nulls
        rle_encoder.WriteMany(intermediate_stream, levels[0], count);
    } else {
        for (idx_t i = offset; i < offset + count; i++) {
            rle_encoder.WriteValue(intermediate_stream, levels[i]);
        }
    }
    rle_encoder.FinishWrite(intermediate_stream);

    // start off by writing the byte count as a uint32_t
    temp_writer.Write(NumericCast<uint32_t>(intermediate_stream.GetPosition()));
    // copy over the written data
    temp_writer.WriteData(intermediate_stream.GetData(), intermediate_stream.GetPosition());
}

void PrimitiveColumnWriter::NextPage(PrimitiveColumnWriterState &state) {
    if (state.current_page > 0) {
        // need to flush the current page
        FlushPage(state);
    }
    if (state.current_page >= state.write_info.size()) {
        state.current_page = state.write_info.size() + 1;
        return;
    }
    auto &page_info = state.page_info[state.current_page];
    auto &write_info = state.write_info[state.current_page];
    state.current_page++;

    auto &temp_writer = *write_info.temp_writer;

    // write the repetition levels
    auto &allocator = BufferAllocator::Get(writer.GetContext());
    WriteLevels(allocator, temp_writer, state.repetition_levels, MaxRepeat(), page_info.offset, page_info.row_count);

    // write the definition levels
    WriteLevels(allocator, temp_writer, state.definition_levels, MaxDefine(), page_info.offset, page_info.row_count,
                state.null_count + state.parent_null_count);
}

void PrimitiveColumnWriter::FlushPage(PrimitiveColumnWriterState &state) {
    D_ASSERT(state.current_page > 0);
    if (state.current_page > state.write_info.size()) {
        return;
    }

    // compress the page info
    auto &write_info = state.write_info[state.current_page - 1];
    auto &temp_writer = *write_info.temp_writer;
    auto &hdr = write_info.page_header;

    FlushPageState(temp_writer, write_info.page_state.get());

    // now that we have finished writing the data we know the uncompressed size
    if (temp_writer.GetPosition() > idx_t(NumericLimits<int32_t>::Maximum())) {
        throw InternalException("Parquet writer: %d uncompressed page size out of range for type integer",
                                temp_writer.GetPosition());
    }
    hdr.uncompressed_page_size = UnsafeNumericCast<int32_t>(temp_writer.GetPosition());

    // compress the data
    CompressPage(temp_writer, write_info.compressed_size, write_info.compressed_data, write_info.compressed_buf);
    hdr.compressed_page_size = UnsafeNumericCast<int32_t>(write_info.compressed_size);
    D_ASSERT(hdr.uncompressed_page_size > 0);
    D_ASSERT(hdr.compressed_page_size > 0);

    if (write_info.compressed_buf) {
        // if the data has been compressed, we no longer need the uncompressed data
        D_ASSERT(write_info.compressed_buf.get() == write_info.compressed_data);
        write_info.temp_writer.reset();
    }
}

unique_ptr<ColumnWriterStatistics> PrimitiveColumnWriter::InitializeStatsState() {
    return make_uniq<ColumnWriterStatistics>();
}

idx_t PrimitiveColumnWriter::GetRowSize(const Vector &vector, const idx_t index,
                                        const PrimitiveColumnWriterState &state) const {
    throw InternalException("GetRowSize unsupported for struct/list column writers");
}

void PrimitiveColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
    auto &state = state_p.Cast<PrimitiveColumnWriterState>();

    idx_t remaining = count;
    idx_t offset = 0;
    while (remaining > 0) {
        auto &write_info = state.write_info[state.current_page - 1];
        if (!write_info.temp_writer) {
            throw InternalException("Writes are not correctly aligned!?");
        }
        auto &temp_writer = *write_info.temp_writer;
        idx_t write_count = MinValue<idx_t>(remaining, write_info.max_write_count - write_info.write_count);
        D_ASSERT(write_count > 0);

        WriteVector(temp_writer, state.stats_state.get(), write_info.page_state.get(), vector, offset,
                    offset + write_count);

        write_info.write_count += write_count;
        if (write_info.write_count == write_info.max_write_count) {
            NextPage(state);
        }
        offset += write_count;
        remaining -= write_count;
    }
}

void PrimitiveColumnWriter::SetParquetStatistics(PrimitiveColumnWriterState &state,
                                                 duckdb_parquet::ColumnChunk &column_chunk) {
    if (!state.stats_state) {
        return;
    }
    if (MaxRepeat() == 0) {
        column_chunk.meta_data.statistics.null_count = NumericCast<int64_t>(state.null_count);
        column_chunk.meta_data.statistics.__isset.null_count = true;
        column_chunk.meta_data.__isset.statistics = true;
    }
    // if we have NaN values - don't write the min/max here
    if (!state.stats_state->HasNaN()) {
        // set min/max/min_value/max_value
        // this code is not going to win any beauty contests, but well
        auto min = state.stats_state->GetMin();
        if (!min.empty()) {
            column_chunk.meta_data.statistics.min = std::move(min);
            column_chunk.meta_data.statistics.__isset.min = true;
            column_chunk.meta_data.__isset.statistics = true;
        }
        auto max = state.stats_state->GetMax();
        if (!max.empty()) {
            column_chunk.meta_data.statistics.max = std::move(max);
            column_chunk.meta_data.statistics.__isset.max = true;
            column_chunk.meta_data.__isset.statistics = true;
        }

        if (state.stats_state->HasStats()) {
            column_chunk.meta_data.statistics.min_value = state.stats_state->GetMinValue();
            column_chunk.meta_data.statistics.__isset.min_value = true;
            column_chunk.meta_data.__isset.statistics = true;
            column_chunk.meta_data.statistics.is_min_value_exact = state.stats_state->MinIsExact();
            column_chunk.meta_data.statistics.__isset.is_min_value_exact = true;

            column_chunk.meta_data.statistics.max_value = state.stats_state->GetMaxValue();
            column_chunk.meta_data.statistics.__isset.max_value = true;
            column_chunk.meta_data.__isset.statistics = true;
            column_chunk.meta_data.statistics.is_max_value_exact = state.stats_state->MaxIsExact();
            column_chunk.meta_data.statistics.__isset.is_max_value_exact = true;
        }
    }
    if (HasDictionary(state)) {
        column_chunk.meta_data.statistics.distinct_count = UnsafeNumericCast<int64_t>(DictionarySize(state));
        column_chunk.meta_data.statistics.__isset.distinct_count = true;
        column_chunk.meta_data.__isset.statistics = true;
    }

    if (state.stats_state->HasGeoStats()) {

        auto gpq_version = writer.GetGeoParquetVersion();

        const auto has_real_stats = gpq_version == GeoParquetVersion::NONE || gpq_version == GeoParquetVersion::BOTH ||
                                    gpq_version == GeoParquetVersion::V2;
        const auto has_json_stats = gpq_version == GeoParquetVersion::V1 || gpq_version == GeoParquetVersion::BOTH ||
                                    gpq_version == GeoParquetVersion::V2;

        if (has_real_stats) {
            // Write the parquet native geospatial statistics
            column_chunk.meta_data.__isset.geospatial_statistics = true;
            state.stats_state->WriteGeoStats(column_chunk.meta_data.geospatial_statistics);
        }
        if (has_json_stats) {
            // Add the geospatial statistics to the extra GeoParquet metadata
            writer.GetGeoParquetData().AddGeoParquetStats(column_schema.name, column_schema.type,
                                                          *state.stats_state->GetGeoStats());
        }
    }

    for (const auto &write_info : state.write_info) {
        // only care about data page encodings, data_page_header.encoding is meaningless for dict
        if (write_info.page_header.type != PageType::DATA_PAGE &&
            write_info.page_header.type != PageType::DATA_PAGE_V2) {
            continue;
        }
        column_chunk.meta_data.encodings.push_back(write_info.page_header.data_page_header.encoding);
    }
}

void PrimitiveColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
    auto &state = state_p.Cast<PrimitiveColumnWriterState>();
    auto &column_chunk = state.row_group.columns[state.col_idx];

    // flush the last page (if any remains)
    FlushPage(state);

    auto &column_writer = writer.GetWriter();
    auto start_offset = column_writer.GetTotalWritten();
    // flush the dictionary
    if (HasDictionary(state)) {
        column_chunk.meta_data.statistics.distinct_count = UnsafeNumericCast<int64_t>(DictionarySize(state));
        column_chunk.meta_data.statistics.__isset.distinct_count = true;
        column_chunk.meta_data.dictionary_page_offset = UnsafeNumericCast<int64_t>(column_writer.GetTotalWritten());
        column_chunk.meta_data.__isset.dictionary_page_offset = true;
        FlushDictionary(state, state.stats_state.get());
    }

    // record the start position of the pages for this column
    column_chunk.meta_data.data_page_offset = 0;
    SetParquetStatistics(state, column_chunk);

    // write the individual pages to disk
    idx_t total_uncompressed_size = 0;
    for (auto &write_info : state.write_info) {
        // set the data page offset whenever we see the *first* data page
        if (column_chunk.meta_data.data_page_offset == 0 && (write_info.page_header.type == PageType::DATA_PAGE ||
                                                             write_info.page_header.type == PageType::DATA_PAGE_V2)) {
            column_chunk.meta_data.data_page_offset = UnsafeNumericCast<int64_t>(column_writer.GetTotalWritten());
        }
        D_ASSERT(write_info.page_header.uncompressed_page_size > 0);
        auto header_start_offset = column_writer.GetTotalWritten();
        writer.Write(write_info.page_header);
        // total uncompressed size in the column chunk includes the header size (!)
        total_uncompressed_size += column_writer.GetTotalWritten() - header_start_offset;
        total_uncompressed_size += write_info.page_header.uncompressed_page_size;
        writer.WriteData(write_info.compressed_data, write_info.compressed_size);
    }
    column_chunk.meta_data.total_compressed_size =
        UnsafeNumericCast<int64_t>(column_writer.GetTotalWritten() - start_offset);
    column_chunk.meta_data.total_uncompressed_size = UnsafeNumericCast<int64_t>(total_uncompressed_size);
    state.row_group.total_byte_size += column_chunk.meta_data.total_uncompressed_size;

    if (state.bloom_filter) {
        writer.BufferBloomFilter(state.col_idx, std::move(state.bloom_filter));
    }

    // finalize the stats
    writer.FlushColumnStats(state.col_idx, column_chunk, state.stats_state.get());
}

void PrimitiveColumnWriter::FlushDictionary(PrimitiveColumnWriterState &state, ColumnWriterStatistics *stats) {
    throw InternalException("This page does not have a dictionary");
}

idx_t PrimitiveColumnWriter::DictionarySize(PrimitiveColumnWriterState &state) {
    throw InternalException("This page does not have a dictionary");
}

void PrimitiveColumnWriter::WriteDictionary(PrimitiveColumnWriterState &state, unique_ptr<MemoryStream> temp_writer,
                                            idx_t row_count) {
    D_ASSERT(temp_writer);
    D_ASSERT(temp_writer->GetPosition() > 0);

    // write the dictionary page header
    PageWriteInformation write_info;
    // set up the header
    auto &hdr = write_info.page_header;
    hdr.uncompressed_page_size = UnsafeNumericCast<int32_t>(temp_writer->GetPosition());
    hdr.type = PageType::DICTIONARY_PAGE;
    hdr.__isset.dictionary_page_header = true;

    hdr.dictionary_page_header.encoding = Encoding::PLAIN;
    hdr.dictionary_page_header.is_sorted = false;
    hdr.dictionary_page_header.num_values = UnsafeNumericCast<int32_t>(row_count);

    write_info.temp_writer = std::move(temp_writer);
    write_info.write_count = 0;
    write_info.max_write_count = 0;

    // compress the contents of the dictionary page
    CompressPage(*write_info.temp_writer, write_info.compressed_size, write_info.compressed_data,
                 write_info.compressed_buf);
    hdr.compressed_page_size = UnsafeNumericCast<int32_t>(write_info.compressed_size);

    if (write_info.compressed_buf) {
        // if the data has been compressed, we no longer need the uncompressed data
        D_ASSERT(write_info.compressed_buf.get() == write_info.compressed_data);
        write_info.temp_writer.reset();
    }

    // insert the dictionary page as the first page to write for this column
    state.write_info.insert(state.write_info.begin(), std::move(write_info));
}

} // namespace duckdb
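
In a v1 DATA_PAGE, NextPage/WriteLevels emit the repetition and definition levels as an RLE/bit-packing hybrid stream prefixed with its byte length as a uint32, which is why the levels are first written to an intermediate MemoryStream. The following is a minimal sketch of that layout that emits only RLE runs (the real RleBpEncoder can also switch to bit-packed runs); it assumes a little-endian host for the length prefix:

#include <cstdint>
#include <cstring>
#include <vector>

// Varint used by the RLE/bit-packing hybrid run headers.
static void AppendVarint(std::vector<uint8_t> &out, uint64_t v) {
    while (v >= 0x80) {
        out.push_back(static_cast<uint8_t>(v) | 0x80);
        v >>= 7;
    }
    out.push_back(static_cast<uint8_t>(v));
}

// Encode levels as a sequence of RLE runs (header = run_length << 1, LSB 0),
// each followed by the run value in ceil(bit_width/8) little-endian bytes,
// and prefix the whole payload with its byte count as a uint32 (v1 data pages).
static std::vector<uint8_t> EncodeLevels(const std::vector<uint16_t> &levels, uint32_t bit_width) {
    std::vector<uint8_t> payload;
    const uint32_t value_bytes = (bit_width + 7) / 8;
    size_t i = 0;
    while (i < levels.size()) {
        size_t run_end = i;
        while (run_end < levels.size() && levels[run_end] == levels[i]) {
            run_end++;
        }
        AppendVarint(payload, static_cast<uint64_t>(run_end - i) << 1);
        for (uint32_t b = 0; b < value_bytes; b++) {
            payload.push_back(static_cast<uint8_t>(levels[i] >> (8 * b)));
        }
        i = run_end;
    }
    std::vector<uint8_t> out(4);
    const uint32_t len = static_cast<uint32_t>(payload.size());
    std::memcpy(out.data(), &len, sizeof(len)); // uint32 byte-count prefix
    out.insert(out.end(), payload.begin(), payload.end());
    return out;
}
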
external/duckdb/extension/parquet/writer/struct_column_writer.cpp (vendored, new file, 103 lines)
@@ -0,0 +1,103 @@
#include "writer/struct_column_writer.hpp"

namespace duckdb {

class StructColumnWriterState : public ColumnWriterState {
public:
    StructColumnWriterState(duckdb_parquet::RowGroup &row_group, idx_t col_idx)
        : row_group(row_group), col_idx(col_idx) {
    }
    ~StructColumnWriterState() override = default;

    duckdb_parquet::RowGroup &row_group;
    idx_t col_idx;
    vector<unique_ptr<ColumnWriterState>> child_states;
};

unique_ptr<ColumnWriterState> StructColumnWriter::InitializeWriteState(duckdb_parquet::RowGroup &row_group) {
    auto result = make_uniq<StructColumnWriterState>(row_group, row_group.columns.size());

    result->child_states.reserve(child_writers.size());
    for (auto &child_writer : child_writers) {
        result->child_states.push_back(child_writer->InitializeWriteState(row_group));
    }
    return std::move(result);
}

bool StructColumnWriter::HasAnalyze() {
    for (auto &child_writer : child_writers) {
        if (child_writer->HasAnalyze()) {
            return true;
        }
    }
    return false;
}

void StructColumnWriter::Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
    auto &state = state_p.Cast<StructColumnWriterState>();
    auto &child_vectors = StructVector::GetEntries(vector);
    for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
        // Need to check again. It might be that just one child needs it but the rest not
        if (child_writers[child_idx]->HasAnalyze()) {
            child_writers[child_idx]->Analyze(*state.child_states[child_idx], &state_p, *child_vectors[child_idx],
                                              count);
        }
    }
}

void StructColumnWriter::FinalizeAnalyze(ColumnWriterState &state_p) {
    auto &state = state_p.Cast<StructColumnWriterState>();
    for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
        // Need to check again. It might be that just one child needs it but the rest not
        if (child_writers[child_idx]->HasAnalyze()) {
            child_writers[child_idx]->FinalizeAnalyze(*state.child_states[child_idx]);
        }
    }
}

void StructColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count,
                                 bool vector_can_span_multiple_pages) {
    auto &state = state_p.Cast<StructColumnWriterState>();

    auto &validity = FlatVector::Validity(vector);
    if (parent) {
        // propagate empty entries from the parent
        if (state.is_empty.size() < parent->is_empty.size()) {
            state.is_empty.insert(state.is_empty.end(), parent->is_empty.begin() + state.is_empty.size(),
                                  parent->is_empty.end());
        }
    }
    HandleRepeatLevels(state_p, parent, count);
    HandleDefineLevels(state_p, parent, validity, count, PARQUET_DEFINE_VALID, MaxDefine() - 1);
    auto &child_vectors = StructVector::GetEntries(vector);
    for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
        child_writers[child_idx]->Prepare(*state.child_states[child_idx], &state_p, *child_vectors[child_idx], count,
                                          vector_can_span_multiple_pages);
    }
}

void StructColumnWriter::BeginWrite(ColumnWriterState &state_p) {
    auto &state = state_p.Cast<StructColumnWriterState>();
    for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
        child_writers[child_idx]->BeginWrite(*state.child_states[child_idx]);
    }
}

void StructColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
    auto &state = state_p.Cast<StructColumnWriterState>();
    auto &child_vectors = StructVector::GetEntries(vector);
    for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
        child_writers[child_idx]->Write(*state.child_states[child_idx], *child_vectors[child_idx], count);
    }
}

void StructColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
    auto &state = state_p.Cast<StructColumnWriterState>();
    for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
        // we add the null count of the struct to the null count of the children
        state.child_states[child_idx]->null_count += state_p.null_count;
        child_writers[child_idx]->FinalizeWrite(*state.child_states[child_idx]);
    }
}

} // namespace duckdb
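
A struct writer stores no values of its own; it only contributes a definition level and fans every operation out to its child writers, and the children inherit the struct's NULLs (which is why FinalizeWrite adds the struct's null_count to each child). A small illustrative sketch of how a child's definition level relates to struct and child validity, assuming a nullable struct with nullable children:

#include <cstdint>

// Assumed levels for illustration: nullable struct = define level 1,
// nullable child leaf = define level 2.
static uint16_t ChildDefineLevel(bool struct_valid, bool child_valid) {
    if (!struct_valid) {
        return 0; // struct itself is NULL; the child field is not defined at all
    }
    return child_valid ? 2 : 1; // struct present; the child may still be NULL
}
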
external/duckdb/extension/parquet/writer/variant/CMakeLists.txt (vendored, new file, 5 lines)
@@ -0,0 +1,5 @@
add_library_unity(duckdb_parquet_writer_variant OBJECT convert_variant.cpp)

set(PARQUET_EXTENSION_FILES
    ${PARQUET_EXTENSION_FILES} $<TARGET_OBJECTS:duckdb_parquet_writer_variant>
    PARENT_SCOPE)
external/duckdb/extension/parquet/writer/variant/convert_variant.cpp (vendored, new file, 1208 lines)
File diff suppressed because it is too large.