should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion


@@ -0,0 +1,16 @@
add_library_unity(
duckdb_parquet_writers
OBJECT
array_column_writer.cpp
boolean_column_writer.cpp
decimal_column_writer.cpp
enum_column_writer.cpp
list_column_writer.cpp
primitive_column_writer.cpp
struct_column_writer.cpp)
add_subdirectory(variant)
set(PARQUET_EXTENSION_FILES
${PARQUET_EXTENSION_FILES} $<TARGET_OBJECTS:duckdb_parquet_writers>
PARENT_SCOPE)


@@ -0,0 +1,75 @@
#include "writer/array_column_writer.hpp"
namespace duckdb {
void ArrayColumnWriter::Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto &array_child = ArrayVector::GetEntry(vector);
auto array_size = ArrayType::GetSize(vector.GetType());
GetChildWriter().Analyze(*state.child_state, &state_p, array_child, array_size * count);
}
void ArrayColumnWriter::WriteArrayState(ListColumnWriterState &state, idx_t array_size, uint16_t first_repeat_level,
idx_t define_value, const bool is_empty) {
state.definition_levels.push_back(define_value);
state.repetition_levels.push_back(first_repeat_level);
state.is_empty.push_back(is_empty);
if (is_empty) {
return;
}
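// every element after the first repeats at MaxRepeat() + 1, marking it as part of the same array entry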
for (idx_t k = 1; k < array_size; k++) {
state.repetition_levels.push_back(MaxRepeat() + 1);
state.definition_levels.push_back(define_value);
state.is_empty.push_back(false);
}
}
void ArrayColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto array_size = ArrayType::GetSize(vector.GetType());
auto &validity = FlatVector::Validity(vector);
// write definition levels and repeats
// the main difference between this and ListColumnWriter::Prepare is that we need to make sure to write out
// repetition levels and definitions for the child elements of the array even if the array itself is NULL.
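// when nested inside another list/array, the parent dictates how many entries (including empty/NULL ones) we must emit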
idx_t vcount = parent ? parent->definition_levels.size() - state.parent_index : count;
idx_t vector_index = 0;
for (idx_t i = 0; i < vcount; i++) {
idx_t parent_index = state.parent_index + i;
if (parent && !parent->is_empty.empty() && parent->is_empty[parent_index]) {
WriteArrayState(state, array_size, parent->repetition_levels[parent_index],
parent->definition_levels[parent_index], true);
continue;
}
auto first_repeat_level =
parent && !parent->repetition_levels.empty() ? parent->repetition_levels[parent_index] : MaxRepeat();
if (parent && parent->definition_levels[parent_index] != PARQUET_DEFINE_VALID) {
WriteArrayState(state, array_size, first_repeat_level, parent->definition_levels[parent_index]);
} else if (validity.RowIsValid(vector_index)) {
// push the repetition levels
WriteArrayState(state, array_size, first_repeat_level, PARQUET_DEFINE_VALID);
} else {
//! Produce a null
WriteArrayState(state, array_size, first_repeat_level, MaxDefine() - 1);
}
vector_index++;
}
state.parent_index += vcount;
auto &array_child = ArrayVector::GetEntry(vector);
// The elements of a single array should not span multiple Parquet pages
// So, we force the entire vector to fit on a single page by setting "vector_can_span_multiple_pages=false"
GetChildWriter().Prepare(*state.child_state, &state_p, array_child, count * array_size, false);
}
void ArrayColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto array_size = ArrayType::GetSize(vector.GetType());
auto &array_child = ArrayVector::GetEntry(vector);
GetChildWriter().Write(*state.child_state, array_child, count * array_size);
}
} // namespace duckdb


@@ -0,0 +1,105 @@
#include "writer/boolean_column_writer.hpp"
namespace duckdb {
class BooleanStatisticsState : public ColumnWriterStatistics {
public:
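// min/max start as the impossible combination (min=true, max=false); HasStats() stays false until at least one value flips one of them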
BooleanStatisticsState() : min(true), max(false) {
}
bool min;
bool max;
public:
bool HasStats() override {
return !(min && !max);
}
string GetMin() override {
return GetMinValue();
}
string GetMax() override {
return GetMaxValue();
}
string GetMinValue() override {
return HasStats() ? string(const_char_ptr_cast(&min), sizeof(bool)) : string();
}
string GetMaxValue() override {
return HasStats() ? string(const_char_ptr_cast(&max), sizeof(bool)) : string();
}
};
class BooleanWriterPageState : public ColumnWriterPageState {
public:
uint8_t byte = 0;
uint8_t byte_pos = 0;
};
BooleanColumnWriter::BooleanColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
vector<string> schema_path_p, bool can_have_nulls)
: PrimitiveColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
}
unique_ptr<ColumnWriterStatistics> BooleanColumnWriter::InitializeStatsState() {
return make_uniq<BooleanStatisticsState>();
}
void BooleanColumnWriter::WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p,
ColumnWriterPageState *state_p, Vector &input_column, idx_t chunk_start,
idx_t chunk_end) {
auto &stats = stats_p->Cast<BooleanStatisticsState>();
auto &state = state_p->Cast<BooleanWriterPageState>();
const auto &mask = FlatVector::Validity(input_column);
const auto *const ptr = FlatVector::GetData<bool>(input_column);
if (stats.max && !stats.min && mask.AllValid()) {
// Fast path: stats are already complete (both true and false seen), and there are no NULLs
for (idx_t r = chunk_start; r < chunk_end; r++) {
const auto &val = ptr[r];
state.byte |= val << state.byte_pos;
if (++state.byte_pos == 8) {
temp_writer.Write(state.byte);
state.byte = 0;
state.byte_pos = 0;
}
}
} else {
for (idx_t r = chunk_start; r < chunk_end; r++) {
if (!mask.RowIsValid(r)) {
continue;
}
const auto &val = ptr[r];
stats.max |= val;
stats.min &= val;
state.byte |= val << state.byte_pos;
if (++state.byte_pos == 8) {
temp_writer.Write(state.byte);
state.byte = 0;
state.byte_pos = 0;
}
}
}
}
unique_ptr<ColumnWriterPageState> BooleanColumnWriter::InitializePageState(PrimitiveColumnWriterState &state,
idx_t page_idx) {
return make_uniq<BooleanWriterPageState>();
}
void BooleanColumnWriter::FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) {
auto &state = state_p->Cast<BooleanWriterPageState>();
if (state.byte_pos > 0) {
temp_writer.Write<uint8_t>(state.byte);
state.byte = 0;
state.byte_pos = 0;
}
}
idx_t BooleanColumnWriter::GetRowSize(const Vector &vector, const idx_t index,
const PrimitiveColumnWriterState &state) const {
return sizeof(bool);
}
} // namespace duckdb


@@ -0,0 +1,100 @@
#include "writer/decimal_column_writer.hpp"
namespace duckdb {
static void WriteParquetDecimal(hugeint_t input, data_ptr_t result) {
bool positive = input >= 0;
// numbers are stored as two's complement so some muckery is required
if (!positive) {
input = NumericLimits<hugeint_t>::Maximum() + input + 1;
}
uint64_t high_bytes = uint64_t(input.upper);
uint64_t low_bytes = input.lower;
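// emit the 16 bytes in big-endian order: upper 64 bits first, then the lower 64 bits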
for (idx_t i = 0; i < sizeof(uint64_t); i++) {
auto shift_count = (sizeof(uint64_t) - i - 1) * 8;
result[i] = (high_bytes >> shift_count) & 0xFF;
}
for (idx_t i = 0; i < sizeof(uint64_t); i++) {
auto shift_count = (sizeof(uint64_t) - i - 1) * 8;
result[sizeof(uint64_t) + i] = (low_bytes >> shift_count) & 0xFF;
}
if (!positive) {
result[0] |= 0x80;
}
}
class FixedDecimalStatistics : public ColumnWriterStatistics {
public:
FixedDecimalStatistics() : min(NumericLimits<hugeint_t>::Maximum()), max(NumericLimits<hugeint_t>::Minimum()) {
}
hugeint_t min;
hugeint_t max;
public:
string GetStats(hugeint_t &input) {
data_t buffer[16];
WriteParquetDecimal(input, buffer);
return string(const_char_ptr_cast(buffer), 16);
}
bool HasStats() override {
return min <= max;
}
void Update(hugeint_t &val) {
if (LessThan::Operation(val, min)) {
min = val;
}
if (GreaterThan::Operation(val, max)) {
max = val;
}
}
string GetMin() override {
return GetMinValue();
}
string GetMax() override {
return GetMaxValue();
}
string GetMinValue() override {
return HasStats() ? GetStats(min) : string();
}
string GetMaxValue() override {
return HasStats() ? GetStats(max) : string();
}
};
FixedDecimalColumnWriter::FixedDecimalColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
vector<string> schema_path_p, bool can_have_nulls)
: PrimitiveColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
}
unique_ptr<ColumnWriterStatistics> FixedDecimalColumnWriter::InitializeStatsState() {
return make_uniq<FixedDecimalStatistics>();
}
void FixedDecimalColumnWriter::WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p,
ColumnWriterPageState *page_state, Vector &input_column, idx_t chunk_start,
idx_t chunk_end) {
auto &mask = FlatVector::Validity(input_column);
auto *ptr = FlatVector::GetData<hugeint_t>(input_column);
auto &stats = stats_p->Cast<FixedDecimalStatistics>();
data_t temp_buffer[16];
for (idx_t r = chunk_start; r < chunk_end; r++) {
if (mask.RowIsValid(r)) {
stats.Update(ptr[r]);
WriteParquetDecimal(ptr[r], temp_buffer);
temp_writer.WriteData(temp_buffer, 16);
}
}
}
idx_t FixedDecimalColumnWriter::GetRowSize(const Vector &vector, const idx_t index,
const PrimitiveColumnWriterState &state) const {
return sizeof(hugeint_t);
}
} // namespace duckdb


@@ -0,0 +1,119 @@
#include "writer/enum_column_writer.hpp"
#include "parquet_rle_bp_decoder.hpp"
#include "parquet_rle_bp_encoder.hpp"
#include "parquet_writer.hpp"
#include "duckdb/common/serializer/memory_stream.hpp"
namespace duckdb {
using duckdb_parquet::Encoding;
class EnumWriterPageState : public ColumnWriterPageState {
public:
explicit EnumWriterPageState(uint32_t bit_width) : encoder(bit_width), written_value(false) {
}
RleBpEncoder encoder;
bool written_value;
};
EnumColumnWriter::EnumColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
vector<string> schema_path_p, bool can_have_nulls)
: PrimitiveColumnWriter(writer, column_schema, std::move(schema_path_p), can_have_nulls) {
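// dictionary indices only need enough bits to address every enum value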
bit_width = RleBpDecoder::ComputeBitWidth(EnumType::GetSize(Type()));
}
unique_ptr<ColumnWriterStatistics> EnumColumnWriter::InitializeStatsState() {
return make_uniq<StringStatisticsState>();
}
template <class T>
void EnumColumnWriter::WriteEnumInternal(WriteStream &temp_writer, Vector &input_column, idx_t chunk_start,
idx_t chunk_end, EnumWriterPageState &page_state) {
auto &mask = FlatVector::Validity(input_column);
auto *ptr = FlatVector::GetData<T>(input_column);
for (idx_t r = chunk_start; r < chunk_end; r++) {
if (mask.RowIsValid(r)) {
if (!page_state.written_value) {
// first value: write the bit-width as a one-byte entry and initialize writer
temp_writer.Write<uint8_t>(bit_width);
page_state.encoder.BeginWrite();
page_state.written_value = true;
}
page_state.encoder.WriteValue(temp_writer, ptr[r]);
}
}
}
void EnumColumnWriter::WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p,
ColumnWriterPageState *page_state_p, Vector &input_column, idx_t chunk_start,
idx_t chunk_end) {
auto &page_state = page_state_p->Cast<EnumWriterPageState>();
switch (Type().InternalType()) {
case PhysicalType::UINT8:
WriteEnumInternal<uint8_t>(temp_writer, input_column, chunk_start, chunk_end, page_state);
break;
case PhysicalType::UINT16:
WriteEnumInternal<uint16_t>(temp_writer, input_column, chunk_start, chunk_end, page_state);
break;
case PhysicalType::UINT32:
WriteEnumInternal<uint32_t>(temp_writer, input_column, chunk_start, chunk_end, page_state);
break;
default:
throw InternalException("Unsupported internal enum type");
}
}
unique_ptr<ColumnWriterPageState> EnumColumnWriter::InitializePageState(PrimitiveColumnWriterState &state,
idx_t page_idx) {
return make_uniq<EnumWriterPageState>(bit_width);
}
void EnumColumnWriter::FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) {
auto &page_state = state_p->Cast<EnumWriterPageState>();
if (!page_state.written_value) {
// all values are null
// just write the bit width
temp_writer.Write<uint8_t>(bit_width);
return;
}
page_state.encoder.FinishWrite(temp_writer);
}
duckdb_parquet::Encoding::type EnumColumnWriter::GetEncoding(PrimitiveColumnWriterState &state) {
return Encoding::RLE_DICTIONARY;
}
bool EnumColumnWriter::HasDictionary(PrimitiveColumnWriterState &state) {
return true;
}
idx_t EnumColumnWriter::DictionarySize(PrimitiveColumnWriterState &state_p) {
return EnumType::GetSize(Type());
}
void EnumColumnWriter::FlushDictionary(PrimitiveColumnWriterState &state, ColumnWriterStatistics *stats_p) {
auto &stats = stats_p->Cast<StringStatisticsState>();
// write the enum values to a dictionary page
auto &enum_values = EnumType::GetValuesInsertOrder(Type());
auto enum_count = EnumType::GetSize(Type());
auto string_values = FlatVector::GetData<string_t>(enum_values);
// first write the contents of the dictionary page to a temporary buffer
auto temp_writer = make_uniq<MemoryStream>(BufferAllocator::Get(writer.GetContext()));
for (idx_t r = 0; r < enum_count; r++) {
D_ASSERT(!FlatVector::IsNull(enum_values, r));
// update the statistics
stats.Update(string_values[r]);
// write this string value to the dictionary
temp_writer->Write<uint32_t>(string_values[r].GetSize());
temp_writer->WriteData(const_data_ptr_cast(string_values[r].GetData()), string_values[r].GetSize());
}
// flush the dictionary page and add it to the to-be-written pages
WriteDictionary(state, std::move(temp_writer), enum_count);
}
idx_t EnumColumnWriter::GetRowSize(const Vector &vector, const idx_t index,
const PrimitiveColumnWriterState &state) const {
return (bit_width + 7) / 8;
}
} // namespace duckdb


@@ -0,0 +1,144 @@
#include "writer/list_column_writer.hpp"
namespace duckdb {
unique_ptr<ColumnWriterState> ListColumnWriter::InitializeWriteState(duckdb_parquet::RowGroup &row_group) {
auto result = make_uniq<ListColumnWriterState>(row_group, row_group.columns.size());
result->child_state = GetChildWriter().InitializeWriteState(row_group);
return std::move(result);
}
bool ListColumnWriter::HasAnalyze() {
return GetChildWriter().HasAnalyze();
}
void ListColumnWriter::Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto &list_child = ListVector::GetEntry(vector);
auto list_count = ListVector::GetListSize(vector);
GetChildWriter().Analyze(*state.child_state, &state_p, list_child, list_count);
}
void ListColumnWriter::FinalizeAnalyze(ColumnWriterState &state_p) {
auto &state = state_p.Cast<ListColumnWriterState>();
GetChildWriter().FinalizeAnalyze(*state.child_state);
}
static idx_t GetConsecutiveChildList(Vector &list, Vector &result, idx_t offset, idx_t count) {
// returns a consecutive child list that fully flattens and repeats all required elements
auto &validity = FlatVector::Validity(list);
auto list_entries = FlatVector::GetData<list_entry_t>(list);
bool is_consecutive = true;
idx_t total_length = 0;
for (idx_t c = offset; c < offset + count; c++) {
if (!validity.RowIsValid(c)) {
continue;
}
if (list_entries[c].offset != total_length) {
is_consecutive = false;
}
total_length += list_entries[c].length;
}
if (is_consecutive) {
// already consecutive - leave it as-is
return total_length;
}
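// not consecutive: gather all referenced child entries into one contiguous, in-order selection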
SelectionVector sel(total_length);
idx_t index = 0;
for (idx_t c = offset; c < offset + count; c++) {
if (!validity.RowIsValid(c)) {
continue;
}
for (idx_t k = 0; k < list_entries[c].length; k++) {
sel.set_index(index++, list_entries[c].offset + k);
}
}
result.Slice(sel, total_length);
result.Flatten(total_length);
return total_length;
}
void ListColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto list_data = FlatVector::GetData<list_entry_t>(vector);
auto &validity = FlatVector::Validity(vector);
// write definition levels and repeats
idx_t start = 0;
idx_t vcount = parent ? parent->definition_levels.size() - state.parent_index : count;
idx_t vector_index = 0;
for (idx_t i = start; i < vcount; i++) {
idx_t parent_index = state.parent_index + i;
if (parent && !parent->is_empty.empty() && parent->is_empty[parent_index]) {
state.definition_levels.push_back(parent->definition_levels[parent_index]);
state.repetition_levels.push_back(parent->repetition_levels[parent_index]);
state.is_empty.push_back(true);
continue;
}
auto first_repeat_level =
parent && !parent->repetition_levels.empty() ? parent->repetition_levels[parent_index] : MaxRepeat();
if (parent && parent->definition_levels[parent_index] != PARQUET_DEFINE_VALID) {
state.definition_levels.push_back(parent->definition_levels[parent_index]);
state.repetition_levels.push_back(first_repeat_level);
state.is_empty.push_back(true);
} else if (validity.RowIsValid(vector_index)) {
// push the repetition levels
if (list_data[vector_index].length == 0) {
state.definition_levels.push_back(MaxDefine());
state.is_empty.push_back(true);
} else {
state.definition_levels.push_back(PARQUET_DEFINE_VALID);
state.is_empty.push_back(false);
}
state.repetition_levels.push_back(first_repeat_level);
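// remaining list elements repeat at MaxRepeat() + 1 so the reader knows they belong to the same list entry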
for (idx_t k = 1; k < list_data[vector_index].length; k++) {
state.repetition_levels.push_back(MaxRepeat() + 1);
state.definition_levels.push_back(PARQUET_DEFINE_VALID);
state.is_empty.push_back(false);
}
} else {
if (!can_have_nulls) {
throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
}
state.definition_levels.push_back(MaxDefine() - 1);
state.repetition_levels.push_back(first_repeat_level);
state.is_empty.push_back(true);
}
vector_index++;
}
state.parent_index += vcount;
auto &list_child = ListVector::GetEntry(vector);
Vector child_list(list_child);
auto child_length = GetConsecutiveChildList(vector, child_list, 0, count);
// The elements of a single list should not span multiple Parquet pages
// So, we force the entire vector to fit on a single page by setting "vector_can_span_multiple_pages=false"
GetChildWriter().Prepare(*state.child_state, &state_p, child_list, child_length, false);
}
void ListColumnWriter::BeginWrite(ColumnWriterState &state_p) {
auto &state = state_p.Cast<ListColumnWriterState>();
GetChildWriter().BeginWrite(*state.child_state);
}
void ListColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto &list_child = ListVector::GetEntry(vector);
Vector child_list(list_child);
auto child_length = GetConsecutiveChildList(vector, child_list, 0, count);
GetChildWriter().Write(*state.child_state, child_list, child_length);
}
void ListColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
auto &state = state_p.Cast<ListColumnWriterState>();
GetChildWriter().FinalizeWrite(*state.child_state);
}
ColumnWriter &ListColumnWriter::GetChildWriter() {
D_ASSERT(child_writers.size() == 1);
return *child_writers[0];
}
} // namespace duckdb


@@ -0,0 +1,435 @@
#include "writer/primitive_column_writer.hpp"
#include "parquet_rle_bp_decoder.hpp"
#include "parquet_rle_bp_encoder.hpp"
#include "parquet_writer.hpp"
namespace duckdb {
using duckdb_parquet::Encoding;
using duckdb_parquet::PageType;
constexpr const idx_t PrimitiveColumnWriter::MAX_UNCOMPRESSED_PAGE_SIZE;
constexpr const idx_t PrimitiveColumnWriter::MAX_UNCOMPRESSED_DICT_PAGE_SIZE;
PrimitiveColumnWriter::PrimitiveColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema,
vector<string> schema_path, bool can_have_nulls)
: ColumnWriter(writer, column_schema, std::move(schema_path), can_have_nulls) {
}
unique_ptr<ColumnWriterState> PrimitiveColumnWriter::InitializeWriteState(duckdb_parquet::RowGroup &row_group) {
auto result = make_uniq<PrimitiveColumnWriterState>(writer, row_group, row_group.columns.size());
RegisterToRowGroup(row_group);
return std::move(result);
}
void PrimitiveColumnWriter::RegisterToRowGroup(duckdb_parquet::RowGroup &row_group) {
duckdb_parquet::ColumnChunk column_chunk;
column_chunk.__isset.meta_data = true;
column_chunk.meta_data.codec = writer.GetCodec();
column_chunk.meta_data.path_in_schema = schema_path;
column_chunk.meta_data.num_values = 0;
column_chunk.meta_data.type = writer.GetType(SchemaIndex());
row_group.columns.push_back(std::move(column_chunk));
}
unique_ptr<ColumnWriterPageState> PrimitiveColumnWriter::InitializePageState(PrimitiveColumnWriterState &state,
idx_t page_idx) {
return nullptr;
}
void PrimitiveColumnWriter::FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state) {
}
void PrimitiveColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) {
auto &state = state_p.Cast<PrimitiveColumnWriterState>();
auto &col_chunk = state.row_group.columns[state.col_idx];
idx_t vcount = parent ? parent->definition_levels.size() - state.definition_levels.size() : count;
idx_t parent_index = state.definition_levels.size();
auto &validity = FlatVector::Validity(vector);
HandleRepeatLevels(state, parent, count);
HandleDefineLevels(state, parent, validity, count, MaxDefine(), MaxDefine() - 1);
idx_t vector_index = 0;
reference<PageInformation> page_info_ref = state.page_info.back();
col_chunk.meta_data.num_values += NumericCast<int64_t>(vcount);
const bool check_parent_empty = parent && !parent->is_empty.empty();
if (!check_parent_empty && validity.AllValid() && TypeIsConstantSize(vector.GetType().InternalType()) &&
page_info_ref.get().estimated_page_size + GetRowSize(vector, vector_index, state) * vcount <
MAX_UNCOMPRESSED_PAGE_SIZE) {
// Fast path: fixed-size type, all valid, and it fits on the current page
auto &page_info = page_info_ref.get();
page_info.row_count += vcount;
page_info.estimated_page_size += GetRowSize(vector, vector_index, state) * vcount;
} else {
for (idx_t i = 0; i < vcount; i++) {
auto &page_info = page_info_ref.get();
page_info.row_count++;
if (check_parent_empty && parent->is_empty[parent_index + i]) {
page_info.empty_count++;
continue;
}
if (validity.RowIsValid(vector_index)) {
page_info.estimated_page_size += GetRowSize(vector, vector_index, state);
if (page_info.estimated_page_size >= MAX_UNCOMPRESSED_PAGE_SIZE) {
if (!vector_can_span_multiple_pages && i != 0) {
// Vector is not allowed to span multiple pages, and we already started writing it
continue;
}
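// current page is full: start a new page at the next row offset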
PageInformation new_info;
new_info.offset = page_info.offset + page_info.row_count;
state.page_info.push_back(new_info);
page_info_ref = state.page_info.back();
}
} else {
page_info.null_count++;
}
vector_index++;
}
}
}
duckdb_parquet::Encoding::type PrimitiveColumnWriter::GetEncoding(PrimitiveColumnWriterState &state) {
return Encoding::PLAIN;
}
void PrimitiveColumnWriter::BeginWrite(ColumnWriterState &state_p) {
auto &state = state_p.Cast<PrimitiveColumnWriterState>();
// set up the page write info
state.stats_state = InitializeStatsState();
for (idx_t page_idx = 0; page_idx < state.page_info.size(); page_idx++) {
auto &page_info = state.page_info[page_idx];
if (page_info.row_count == 0) {
D_ASSERT(page_idx + 1 == state.page_info.size());
state.page_info.erase_at(page_idx);
break;
}
PageWriteInformation write_info;
// set up the header
auto &hdr = write_info.page_header;
hdr.compressed_page_size = 0;
hdr.uncompressed_page_size = 0;
hdr.type = PageType::DATA_PAGE;
hdr.__isset.data_page_header = true;
hdr.data_page_header.num_values = NumericCast<int32_t>(page_info.row_count);
hdr.data_page_header.encoding = GetEncoding(state);
hdr.data_page_header.definition_level_encoding = Encoding::RLE;
hdr.data_page_header.repetition_level_encoding = Encoding::RLE;
write_info.temp_writer = make_uniq<MemoryStream>(
BufferAllocator::Get(writer.GetContext()),
MaxValue<idx_t>(NextPowerOfTwo(page_info.estimated_page_size), MemoryStream::DEFAULT_INITIAL_CAPACITY));
write_info.write_count = page_info.empty_count;
write_info.max_write_count = page_info.row_count;
write_info.page_state = InitializePageState(state, page_idx);
write_info.compressed_size = 0;
write_info.compressed_data = nullptr;
state.write_info.push_back(std::move(write_info));
}
// start writing the first page
NextPage(state);
}
void PrimitiveColumnWriter::WriteLevels(Allocator &allocator, WriteStream &temp_writer,
const unsafe_vector<uint16_t> &levels, idx_t max_value, idx_t offset,
idx_t count, optional_idx null_count) {
if (levels.empty() || count == 0) {
return;
}
// write the levels using the RLE-BP encoding
const auto bit_width = RleBpDecoder::ComputeBitWidth(max_value);
RleBpEncoder rle_encoder(bit_width);
// have to write to an intermediate stream first because we need to know the size
MemoryStream intermediate_stream(allocator);
rle_encoder.BeginWrite();
if (null_count.IsValid() && null_count.GetIndex() == 0) {
// Fast path: no nulls
rle_encoder.WriteMany(intermediate_stream, levels[0], count);
} else {
for (idx_t i = offset; i < offset + count; i++) {
rle_encoder.WriteValue(intermediate_stream, levels[i]);
}
}
rle_encoder.FinishWrite(intermediate_stream);
// start off by writing the byte count as a uint32_t
temp_writer.Write(NumericCast<uint32_t>(intermediate_stream.GetPosition()));
// copy over the written data
temp_writer.WriteData(intermediate_stream.GetData(), intermediate_stream.GetPosition());
}
void PrimitiveColumnWriter::NextPage(PrimitiveColumnWriterState &state) {
if (state.current_page > 0) {
// need to flush the current page
FlushPage(state);
}
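// no pages left: park current_page past the end so later FlushPage calls become no-ops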
if (state.current_page >= state.write_info.size()) {
state.current_page = state.write_info.size() + 1;
return;
}
auto &page_info = state.page_info[state.current_page];
auto &write_info = state.write_info[state.current_page];
state.current_page++;
auto &temp_writer = *write_info.temp_writer;
// write the repetition levels
auto &allocator = BufferAllocator::Get(writer.GetContext());
WriteLevels(allocator, temp_writer, state.repetition_levels, MaxRepeat(), page_info.offset, page_info.row_count);
// write the definition levels
WriteLevels(allocator, temp_writer, state.definition_levels, MaxDefine(), page_info.offset, page_info.row_count,
state.null_count + state.parent_null_count);
}
void PrimitiveColumnWriter::FlushPage(PrimitiveColumnWriterState &state) {
D_ASSERT(state.current_page > 0);
if (state.current_page > state.write_info.size()) {
return;
}
// compress the page info
auto &write_info = state.write_info[state.current_page - 1];
auto &temp_writer = *write_info.temp_writer;
auto &hdr = write_info.page_header;
FlushPageState(temp_writer, write_info.page_state.get());
// now that we have finished writing the data we know the uncompressed size
if (temp_writer.GetPosition() > idx_t(NumericLimits<int32_t>::Maximum())) {
throw InternalException("Parquet writer: %d uncompressed page size out of range for type integer",
temp_writer.GetPosition());
}
hdr.uncompressed_page_size = UnsafeNumericCast<int32_t>(temp_writer.GetPosition());
// compress the data
CompressPage(temp_writer, write_info.compressed_size, write_info.compressed_data, write_info.compressed_buf);
hdr.compressed_page_size = UnsafeNumericCast<int32_t>(write_info.compressed_size);
D_ASSERT(hdr.uncompressed_page_size > 0);
D_ASSERT(hdr.compressed_page_size > 0);
if (write_info.compressed_buf) {
// if the data has been compressed, we no longer need the uncompressed data
D_ASSERT(write_info.compressed_buf.get() == write_info.compressed_data);
write_info.temp_writer.reset();
}
}
unique_ptr<ColumnWriterStatistics> PrimitiveColumnWriter::InitializeStatsState() {
return make_uniq<ColumnWriterStatistics>();
}
idx_t PrimitiveColumnWriter::GetRowSize(const Vector &vector, const idx_t index,
const PrimitiveColumnWriterState &state) const {
throw InternalException("GetRowSize unsupported for struct/list column writers");
}
void PrimitiveColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
auto &state = state_p.Cast<PrimitiveColumnWriterState>();
idx_t remaining = count;
idx_t offset = 0;
while (remaining > 0) {
auto &write_info = state.write_info[state.current_page - 1];
if (!write_info.temp_writer) {
throw InternalException("Writes are not correctly aligned!?");
}
auto &temp_writer = *write_info.temp_writer;
idx_t write_count = MinValue<idx_t>(remaining, write_info.max_write_count - write_info.write_count);
D_ASSERT(write_count > 0);
WriteVector(temp_writer, state.stats_state.get(), write_info.page_state.get(), vector, offset,
offset + write_count);
write_info.write_count += write_count;
if (write_info.write_count == write_info.max_write_count) {
NextPage(state);
}
offset += write_count;
remaining -= write_count;
}
}
void PrimitiveColumnWriter::SetParquetStatistics(PrimitiveColumnWriterState &state,
duckdb_parquet::ColumnChunk &column_chunk) {
if (!state.stats_state) {
return;
}
if (MaxRepeat() == 0) {
column_chunk.meta_data.statistics.null_count = NumericCast<int64_t>(state.null_count);
column_chunk.meta_data.statistics.__isset.null_count = true;
column_chunk.meta_data.__isset.statistics = true;
}
// if we have NaN values - don't write the min/max here
if (!state.stats_state->HasNaN()) {
// set min/max/min_value/max_value
// this code is not going to win any beauty contests, but well
auto min = state.stats_state->GetMin();
if (!min.empty()) {
column_chunk.meta_data.statistics.min = std::move(min);
column_chunk.meta_data.statistics.__isset.min = true;
column_chunk.meta_data.__isset.statistics = true;
}
auto max = state.stats_state->GetMax();
if (!max.empty()) {
column_chunk.meta_data.statistics.max = std::move(max);
column_chunk.meta_data.statistics.__isset.max = true;
column_chunk.meta_data.__isset.statistics = true;
}
if (state.stats_state->HasStats()) {
column_chunk.meta_data.statistics.min_value = state.stats_state->GetMinValue();
column_chunk.meta_data.statistics.__isset.min_value = true;
column_chunk.meta_data.__isset.statistics = true;
column_chunk.meta_data.statistics.is_min_value_exact = state.stats_state->MinIsExact();
column_chunk.meta_data.statistics.__isset.is_min_value_exact = true;
column_chunk.meta_data.statistics.max_value = state.stats_state->GetMaxValue();
column_chunk.meta_data.statistics.__isset.max_value = true;
column_chunk.meta_data.__isset.statistics = true;
column_chunk.meta_data.statistics.is_max_value_exact = state.stats_state->MaxIsExact();
column_chunk.meta_data.statistics.__isset.is_max_value_exact = true;
}
}
if (HasDictionary(state)) {
column_chunk.meta_data.statistics.distinct_count = UnsafeNumericCast<int64_t>(DictionarySize(state));
column_chunk.meta_data.statistics.__isset.distinct_count = true;
column_chunk.meta_data.__isset.statistics = true;
}
if (state.stats_state->HasGeoStats()) {
auto gpq_version = writer.GetGeoParquetVersion();
const auto has_real_stats = gpq_version == GeoParquetVersion::NONE || gpq_version == GeoParquetVersion::BOTH ||
gpq_version == GeoParquetVersion::V2;
const auto has_json_stats = gpq_version == GeoParquetVersion::V1 || gpq_version == GeoParquetVersion::BOTH ||
gpq_version == GeoParquetVersion::V2;
if (has_real_stats) {
// Write the parquet native geospatial statistics
column_chunk.meta_data.__isset.geospatial_statistics = true;
state.stats_state->WriteGeoStats(column_chunk.meta_data.geospatial_statistics);
}
if (has_json_stats) {
// Add the geospatial statistics to the extra GeoParquet metadata
writer.GetGeoParquetData().AddGeoParquetStats(column_schema.name, column_schema.type,
*state.stats_state->GetGeoStats());
}
}
for (const auto &write_info : state.write_info) {
// only care about data page encodings, data_page_header.encoding is meaningless for dict
if (write_info.page_header.type != PageType::DATA_PAGE &&
write_info.page_header.type != PageType::DATA_PAGE_V2) {
continue;
}
column_chunk.meta_data.encodings.push_back(write_info.page_header.data_page_header.encoding);
}
}
void PrimitiveColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
auto &state = state_p.Cast<PrimitiveColumnWriterState>();
auto &column_chunk = state.row_group.columns[state.col_idx];
// flush the last page (if any remains)
FlushPage(state);
auto &column_writer = writer.GetWriter();
auto start_offset = column_writer.GetTotalWritten();
// flush the dictionary
if (HasDictionary(state)) {
column_chunk.meta_data.statistics.distinct_count = UnsafeNumericCast<int64_t>(DictionarySize(state));
column_chunk.meta_data.statistics.__isset.distinct_count = true;
column_chunk.meta_data.dictionary_page_offset = UnsafeNumericCast<int64_t>(column_writer.GetTotalWritten());
column_chunk.meta_data.__isset.dictionary_page_offset = true;
FlushDictionary(state, state.stats_state.get());
}
// record the start position of the pages for this column
column_chunk.meta_data.data_page_offset = 0;
SetParquetStatistics(state, column_chunk);
// write the individual pages to disk
idx_t total_uncompressed_size = 0;
for (auto &write_info : state.write_info) {
// set the data page offset whenever we see the *first* data page
if (column_chunk.meta_data.data_page_offset == 0 && (write_info.page_header.type == PageType::DATA_PAGE ||
write_info.page_header.type == PageType::DATA_PAGE_V2)) {
column_chunk.meta_data.data_page_offset = UnsafeNumericCast<int64_t>(column_writer.GetTotalWritten());
}
D_ASSERT(write_info.page_header.uncompressed_page_size > 0);
auto header_start_offset = column_writer.GetTotalWritten();
writer.Write(write_info.page_header);
// total uncompressed size in the column chunk includes the header size (!)
total_uncompressed_size += column_writer.GetTotalWritten() - header_start_offset;
total_uncompressed_size += write_info.page_header.uncompressed_page_size;
writer.WriteData(write_info.compressed_data, write_info.compressed_size);
}
column_chunk.meta_data.total_compressed_size =
UnsafeNumericCast<int64_t>(column_writer.GetTotalWritten() - start_offset);
column_chunk.meta_data.total_uncompressed_size = UnsafeNumericCast<int64_t>(total_uncompressed_size);
state.row_group.total_byte_size += column_chunk.meta_data.total_uncompressed_size;
if (state.bloom_filter) {
writer.BufferBloomFilter(state.col_idx, std::move(state.bloom_filter));
}
// finalize the stats
writer.FlushColumnStats(state.col_idx, column_chunk, state.stats_state.get());
}
void PrimitiveColumnWriter::FlushDictionary(PrimitiveColumnWriterState &state, ColumnWriterStatistics *stats) {
throw InternalException("This page does not have a dictionary");
}
idx_t PrimitiveColumnWriter::DictionarySize(PrimitiveColumnWriterState &state) {
throw InternalException("This page does not have a dictionary");
}
void PrimitiveColumnWriter::WriteDictionary(PrimitiveColumnWriterState &state, unique_ptr<MemoryStream> temp_writer,
idx_t row_count) {
D_ASSERT(temp_writer);
D_ASSERT(temp_writer->GetPosition() > 0);
// write the dictionary page header
PageWriteInformation write_info;
// set up the header
auto &hdr = write_info.page_header;
hdr.uncompressed_page_size = UnsafeNumericCast<int32_t>(temp_writer->GetPosition());
hdr.type = PageType::DICTIONARY_PAGE;
hdr.__isset.dictionary_page_header = true;
hdr.dictionary_page_header.encoding = Encoding::PLAIN;
hdr.dictionary_page_header.is_sorted = false;
hdr.dictionary_page_header.num_values = UnsafeNumericCast<int32_t>(row_count);
write_info.temp_writer = std::move(temp_writer);
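// the dictionary page carries no row values, so its write counters stay at zero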
write_info.write_count = 0;
write_info.max_write_count = 0;
// compress the contents of the dictionary page
CompressPage(*write_info.temp_writer, write_info.compressed_size, write_info.compressed_data,
write_info.compressed_buf);
hdr.compressed_page_size = UnsafeNumericCast<int32_t>(write_info.compressed_size);
if (write_info.compressed_buf) {
// if the data has been compressed, we no longer need the uncompressed data
D_ASSERT(write_info.compressed_buf.get() == write_info.compressed_data);
write_info.temp_writer.reset();
}
// insert the dictionary page as the first page to write for this column
state.write_info.insert(state.write_info.begin(), std::move(write_info));
}
} // namespace duckdb


@@ -0,0 +1,103 @@
#include "writer/struct_column_writer.hpp"
namespace duckdb {
class StructColumnWriterState : public ColumnWriterState {
public:
StructColumnWriterState(duckdb_parquet::RowGroup &row_group, idx_t col_idx)
: row_group(row_group), col_idx(col_idx) {
}
~StructColumnWriterState() override = default;
duckdb_parquet::RowGroup &row_group;
idx_t col_idx;
vector<unique_ptr<ColumnWriterState>> child_states;
};
unique_ptr<ColumnWriterState> StructColumnWriter::InitializeWriteState(duckdb_parquet::RowGroup &row_group) {
auto result = make_uniq<StructColumnWriterState>(row_group, row_group.columns.size());
result->child_states.reserve(child_writers.size());
for (auto &child_writer : child_writers) {
result->child_states.push_back(child_writer->InitializeWriteState(row_group));
}
return std::move(result);
}
bool StructColumnWriter::HasAnalyze() {
for (auto &child_writer : child_writers) {
if (child_writer->HasAnalyze()) {
return true;
}
}
return false;
}
void StructColumnWriter::Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
auto &state = state_p.Cast<StructColumnWriterState>();
auto &child_vectors = StructVector::GetEntries(vector);
for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
// Need to check again. It might be that just one child needs it but the rest not
if (child_writers[child_idx]->HasAnalyze()) {
child_writers[child_idx]->Analyze(*state.child_states[child_idx], &state_p, *child_vectors[child_idx],
count);
}
}
}
void StructColumnWriter::FinalizeAnalyze(ColumnWriterState &state_p) {
auto &state = state_p.Cast<StructColumnWriterState>();
for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
// Need to check again. It might be that just one child needs it but the rest not
if (child_writers[child_idx]->HasAnalyze()) {
child_writers[child_idx]->FinalizeAnalyze(*state.child_states[child_idx]);
}
}
}
void StructColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) {
auto &state = state_p.Cast<StructColumnWriterState>();
auto &validity = FlatVector::Validity(vector);
if (parent) {
// propagate empty entries from the parent
if (state.is_empty.size() < parent->is_empty.size()) {
state.is_empty.insert(state.is_empty.end(), parent->is_empty.begin() + state.is_empty.size(),
parent->is_empty.end());
}
}
HandleRepeatLevels(state_p, parent, count);
HandleDefineLevels(state_p, parent, validity, count, PARQUET_DEFINE_VALID, MaxDefine() - 1);
auto &child_vectors = StructVector::GetEntries(vector);
for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
child_writers[child_idx]->Prepare(*state.child_states[child_idx], &state_p, *child_vectors[child_idx], count,
vector_can_span_multiple_pages);
}
}
void StructColumnWriter::BeginWrite(ColumnWriterState &state_p) {
auto &state = state_p.Cast<StructColumnWriterState>();
for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
child_writers[child_idx]->BeginWrite(*state.child_states[child_idx]);
}
}
void StructColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
auto &state = state_p.Cast<StructColumnWriterState>();
auto &child_vectors = StructVector::GetEntries(vector);
for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
child_writers[child_idx]->Write(*state.child_states[child_idx], *child_vectors[child_idx], count);
}
}
void StructColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
auto &state = state_p.Cast<StructColumnWriterState>();
for (idx_t child_idx = 0; child_idx < child_writers.size(); child_idx++) {
// we add the null count of the struct to the null count of the children
state.child_states[child_idx]->null_count += state_p.null_count;
child_writers[child_idx]->FinalizeWrite(*state.child_states[child_idx]);
}
}
} // namespace duckdb


@@ -0,0 +1,5 @@
add_library_unity(duckdb_parquet_writer_variant OBJECT convert_variant.cpp)
set(PARQUET_EXTENSION_FILES
${PARQUET_EXTENSION_FILES} $<TARGET_OBJECTS:duckdb_parquet_writer_variant>
PARENT_SCOPE)

File diff suppressed because it is too large.