Files
email-tracker/external/duckdb/extension/parquet/include/column_writer.hpp
2025-10-24 19:21:19 -05:00

146 lines
4.6 KiB
C++

//===----------------------------------------------------------------------===//
// DuckDB
//
// column_writer.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb.hpp"
#include "parquet_types.h"
#include "parquet_column_schema.hpp"
namespace duckdb {
class MemoryStream;
class ParquetWriter;
class ColumnWriterPageState;
class PrimitiveColumnWriterState;
struct ChildFieldIDs;
struct ShreddingType;
class ResizeableBuffer;
class ParquetBloomFilter;
class ColumnWriterState {
public:
virtual ~ColumnWriterState();
unsafe_vector<uint16_t> definition_levels;
unsafe_vector<uint16_t> repetition_levels;
unsafe_vector<uint8_t> is_empty;
idx_t parent_null_count = 0;
idx_t null_count = 0;
public:
template <class TARGET>
TARGET &Cast() {
DynamicCastCheck<TARGET>(this);
return reinterpret_cast<TARGET &>(*this);
}
template <class TARGET>
const TARGET &Cast() const {
D_ASSERT(dynamic_cast<const TARGET *>(this));
return reinterpret_cast<const TARGET &>(*this);
}
};
class ColumnWriterPageState {
public:
virtual ~ColumnWriterPageState() {
}
public:
template <class TARGET>
TARGET &Cast() {
DynamicCastCheck<TARGET>(this);
return reinterpret_cast<TARGET &>(*this);
}
template <class TARGET>
const TARGET &Cast() const {
D_ASSERT(dynamic_cast<const TARGET *>(this));
return reinterpret_cast<const TARGET &>(*this);
}
};
class ColumnWriter {
protected:
static constexpr uint16_t PARQUET_DEFINE_VALID = UINT16_C(65535);
public:
ColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path,
bool can_have_nulls);
virtual ~ColumnWriter();
public:
const LogicalType &Type() const {
return column_schema.type;
}
const ParquetColumnSchema &Schema() const {
return column_schema;
}
inline idx_t SchemaIndex() const {
return column_schema.schema_index;
}
inline idx_t MaxDefine() const {
return column_schema.max_define;
}
idx_t MaxRepeat() const {
return column_schema.max_repeat;
}
static ParquetColumnSchema FillParquetSchema(vector<duckdb_parquet::SchemaElement> &schemas,
const LogicalType &type, const string &name, bool allow_geometry,
optional_ptr<const ChildFieldIDs> field_ids,
optional_ptr<const ShreddingType> shredding_types,
idx_t max_repeat = 0, idx_t max_define = 1,
bool can_have_nulls = true);
//! Create the column writer for a specific type recursively
static unique_ptr<ColumnWriter> CreateWriterRecursive(ClientContext &context, ParquetWriter &writer,
const vector<duckdb_parquet::SchemaElement> &parquet_schemas,
const ParquetColumnSchema &schema,
vector<string> path_in_schema);
virtual unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) = 0;
//! indicates whether the write need to analyse the data before preparing it
virtual bool HasAnalyze() {
return false;
}
virtual void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) {
throw NotImplementedException("Writer does not need analysis");
}
//! Called after all data has been passed to Analyze
virtual void FinalizeAnalyze(ColumnWriterState &state) {
throw NotImplementedException("Writer does not need analysis");
}
virtual void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) = 0;
virtual void BeginWrite(ColumnWriterState &state) = 0;
virtual void Write(ColumnWriterState &state, Vector &vector, idx_t count) = 0;
virtual void FinalizeWrite(ColumnWriterState &state) = 0;
protected:
void HandleDefineLevels(ColumnWriterState &state, ColumnWriterState *parent, const ValidityMask &validity,
const idx_t count, const uint16_t define_value, const uint16_t null_value) const;
void HandleRepeatLevels(ColumnWriterState &state_p, ColumnWriterState *parent, idx_t count) const;
void CompressPage(MemoryStream &temp_writer, size_t &compressed_size, data_ptr_t &compressed_data,
AllocatedData &compressed_buf);
public:
ParquetWriter &writer;
const ParquetColumnSchema &column_schema;
vector<string> schema_path;
bool can_have_nulls;
protected:
vector<unique_ptr<ColumnWriter>> child_writers;
};
} // namespace duckdb