//===----------------------------------------------------------------------===//
//                         DuckDB
//
// column_writer.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "duckdb.hpp"
#include "parquet_types.h"
#include "parquet_column_schema.hpp"

namespace duckdb {
class MemoryStream;
class ParquetWriter;
class ColumnWriterPageState;
class PrimitiveColumnWriterState;
struct ChildFieldIDs;
struct ShreddingType;
class ResizeableBuffer;
class ParquetBloomFilter;

class ColumnWriterState {
public:
	virtual ~ColumnWriterState();

	unsafe_vector<uint16_t> definition_levels;
	unsafe_vector<uint16_t> repetition_levels;
	unsafe_vector<bool> is_empty;
	idx_t parent_null_count = 0;
	idx_t null_count = 0;

public:
	template <class TARGET>
	TARGET &Cast() {
		DynamicCastCheck<TARGET>(this);
		return reinterpret_cast<TARGET &>(*this);
	}
	template <class TARGET>
	const TARGET &Cast() const {
		D_ASSERT(dynamic_cast<const TARGET *>(this));
		return reinterpret_cast<const TARGET &>(*this);
	}
};

class ColumnWriterPageState {
public:
	virtual ~ColumnWriterPageState() {
	}

public:
	template <class TARGET>
	TARGET &Cast() {
		DynamicCastCheck<TARGET>(this);
		return reinterpret_cast<TARGET &>(*this);
	}
	template <class TARGET>
	const TARGET &Cast() const {
		D_ASSERT(dynamic_cast<const TARGET *>(this));
		return reinterpret_cast<const TARGET &>(*this);
	}
};

class ColumnWriter {
protected:
	static constexpr uint16_t PARQUET_DEFINE_VALID = UINT16_C(65535);

public:
	ColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path,
	             bool can_have_nulls);
	virtual ~ColumnWriter();

public:
	const LogicalType &Type() const {
		return column_schema.type;
	}
	const ParquetColumnSchema &Schema() const {
		return column_schema;
	}
	inline idx_t SchemaIndex() const {
		return column_schema.schema_index;
	}
	inline idx_t MaxDefine() const {
		return column_schema.max_define;
	}
	idx_t MaxRepeat() const {
		return column_schema.max_repeat;
	}

	static ParquetColumnSchema FillParquetSchema(vector<duckdb_parquet::SchemaElement> &schemas,
	                                             const LogicalType &type, const string &name, bool allow_geometry,
	                                             optional_ptr<ChildFieldIDs> field_ids,
	                                             optional_ptr<ShreddingType> shredding_types, idx_t max_repeat = 0,
	                                             idx_t max_define = 1, bool can_have_nulls = true);
	//! Create the column writer for a specific type recursively
	static unique_ptr<ColumnWriter> CreateWriterRecursive(ClientContext &context, ParquetWriter &writer,
	                                                      const vector<duckdb_parquet::SchemaElement> &parquet_schemas,
	                                                      const ParquetColumnSchema &schema,
	                                                      vector<string> path_in_schema);

	virtual unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) = 0;

	//! Indicates whether the writer needs to analyze the data before preparing it
	virtual bool HasAnalyze() {
		return false;
	}
	virtual void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) {
		throw NotImplementedException("Writer does not need analysis");
	}
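	// Note: HasAnalyze/Analyze/FinalizeAnalyze form an optional first pass over the
	// data. A writer that returns true from HasAnalyze (for example, one that must
	// inspect all values before choosing an encoding, such as a dictionary-style
	// encoding) sees every vector once in Analyze before any pages are prepared.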
	//! Called after all data has been passed to Analyze
	virtual void FinalizeAnalyze(ColumnWriterState &state) {
		throw NotImplementedException("Writer does not need analysis");
	}

	virtual void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
	                     bool vector_can_span_multiple_pages) = 0;
	virtual void BeginWrite(ColumnWriterState &state) = 0;
	virtual void Write(ColumnWriterState &state, Vector &vector, idx_t count) = 0;
	virtual void FinalizeWrite(ColumnWriterState &state) = 0;

protected:
	void HandleDefineLevels(ColumnWriterState &state, ColumnWriterState *parent, const ValidityMask &validity,
	                        const idx_t count, const uint16_t define_value, const uint16_t null_value) const;
	void HandleRepeatLevels(ColumnWriterState &state_p, ColumnWriterState *parent, idx_t count) const;

	void CompressPage(MemoryStream &temp_writer, size_t &compressed_size, data_ptr_t &compressed_data,
	                  AllocatedData &compressed_buf);

public:
	ParquetWriter &writer;
	const ParquetColumnSchema &column_schema;
	vector<string> schema_path;
	bool can_have_nulls;

protected:
	vector<unique_ptr<ColumnWriter>> child_writers;
};

} // namespace duckdb
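
// The sequence below is a minimal sketch of how a caller is expected to drive a
// ColumnWriter, assuming a hypothetical "chunks" collection holding the column's
// DataChunks and a single top-level column; the actual call sites live inside the
// Parquet writer implementation. The optional analyze pass runs first, Prepare
// then computes definition/repetition levels, and BeginWrite/Write/FinalizeWrite
// emit the pages:
//
//	auto state = col_writer.InitializeWriteState(row_group);
//	if (col_writer.HasAnalyze()) {
//		for (auto &chunk : chunks) {
//			col_writer.Analyze(*state, nullptr, chunk.data[0], chunk.size());
//		}
//		col_writer.FinalizeAnalyze(*state);
//	}
//	for (auto &chunk : chunks) {
//		col_writer.Prepare(*state, nullptr, chunk.data[0], chunk.size(), true);
//	}
//	col_writer.BeginWrite(*state);
//	for (auto &chunk : chunks) {
//		col_writer.Write(*state, chunk.data[0], chunk.size());
//	}
//	col_writer.FinalizeWrite(*state);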