//===----------------------------------------------------------------------===//
//                         DuckDB
//
// parquet_writer.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "duckdb.hpp"
#include "duckdb/common/common.hpp"
#include "duckdb/common/optional_idx.hpp"
#include "duckdb/common/encryption_state.hpp"
#include "duckdb/common/exception.hpp"
#include "duckdb/common/mutex.hpp"
#include "duckdb/common/atomic.hpp"
#include "duckdb/common/serializer/buffered_file_writer.hpp"
#include "duckdb/common/types/column/column_data_collection.hpp"
#include "duckdb/function/copy_function.hpp"
#include "parquet_statistics.hpp"
#include "column_writer.hpp"
#include "parquet_field_id.hpp"
#include "parquet_shredding.hpp"
#include "parquet_types.h"
#include "geo_parquet.hpp"
#include "writer/parquet_write_stats.hpp"
#include "thrift/protocol/TCompactProtocol.h"
namespace duckdb {
class FileSystem;
class FileOpener;
class ParquetEncryptionConfig;
class ParquetStatsAccumulator;
class Serializer;
class Deserializer;
class ColumnWriterStatistics;
struct CopyFunctionFileStatistics;
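
//! A row group that has been encoded in memory and can be written out to the file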
struct PreparedRowGroup {
    duckdb_parquet::RowGroup row_group;
    vector<unique_ptr<ColumnWriterState>> states;
};
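
//! A buffered bloom filter, together with the row group and column it was computed for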
struct ParquetBloomFilterEntry {
    unique_ptr<ParquetBloomFilter> bloom_filter;
    idx_t row_group_idx;
    idx_t column_idx;
};
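
//! The Parquet format version to write out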
enum class ParquetVersion : uint8_t {
    V1 = 1, //! Excludes DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY, BYTE_STREAM_SPLIT
    V2 = 2, //! Includes the encodings above
};
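
//! ParquetWriter writes the contents of ColumnDataCollections to a Parquet file, row group by row group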
class ParquetWriter {
public:
    ParquetWriter(ClientContext &context, FileSystem &fs, string file_name, vector<LogicalType> types,
                  vector<string> names, duckdb_parquet::CompressionCodec::type codec, ChildFieldIDs field_ids,
                  ShreddingType shredding_types, const vector<pair<string, string>> &kv_metadata,
                  shared_ptr<ParquetEncryptionConfig> encryption_config, optional_idx dictionary_size_limit,
                  idx_t string_dictionary_page_size_limit, bool enable_bloom_filters,
                  double bloom_filter_false_positive_ratio, int64_t compression_level, bool debug_use_openssl,
                  ParquetVersion parquet_version, GeoParquetVersion geoparquet_version);
    ~ParquetWriter();

public:
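    //! Encode and compress the contents of "buffer" into an in-memory row group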
    void PrepareRowGroup(ColumnDataCollection &buffer, PreparedRowGroup &result);
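    //! Write a prepared row group out to the file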
    void FlushRowGroup(PreparedRowGroup &row_group);
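    //! Prepare a row group from "buffer" and immediately write it to the file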
    void Flush(ColumnDataCollection &buffer);
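    //! Write the footer (file metadata) and finish the file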
    void Finalize();
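    //! Map a DuckDB logical type to the corresponding Parquet physical type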
    static duckdb_parquet::Type::type DuckDBTypeToParquetType(const LogicalType &duckdb_type);
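    //! Fill in the properties of a Parquet schema element for the given DuckDB type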
    static void SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele,
                                    bool allow_geometry);
    ClientContext &GetContext() {
        return context;
    }
    duckdb_apache::thrift::protocol::TProtocol *GetProtocol() {
        return protocol.get();
    }
    duckdb_parquet::CompressionCodec::type GetCodec() {
        return codec;
    }
    duckdb_parquet::Type::type GetType(idx_t schema_idx) {
        return file_meta_data.schema[schema_idx].type;
    }
    LogicalType GetSQLType(idx_t schema_idx) const {
        return sql_types[schema_idx];
    }
    BufferedFileWriter &GetWriter() {
        return *writer;
    }
    idx_t FileSize() {
        return total_written;
    }
    optional_idx DictionarySizeLimit() const {
        return dictionary_size_limit;
    }
    idx_t StringDictionaryPageSizeLimit() const {
        return string_dictionary_page_size_limit;
    }
    bool EnableBloomFilters() const {
        return enable_bloom_filters;
    }
    double BloomFilterFalsePositiveRatio() const {
        return bloom_filter_false_positive_ratio;
    }
    int64_t CompressionLevel() const {
        return compression_level;
    }
    idx_t NumberOfRowGroups() {
        return num_row_groups;
    }
    ParquetVersion GetParquetVersion() const {
        return parquet_version;
    }
    GeoParquetVersion GetGeoParquetVersion() const {
        return geoparquet_version;
    }
    const string &GetFileName() const {
        return file_name;
    }
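    //! Serialize a Thrift object to the file, returning the number of bytes written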
    uint32_t Write(const duckdb_apache::thrift::TBase &object);
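    //! Write raw bytes to the file, returning the number of bytes written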
    uint32_t WriteData(const const_data_ptr_t buffer, const uint32_t buffer_size);
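    //! Get the GeoParquet metadata for this file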
    GeoParquetFileMetadata &GetGeoParquetData();
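    //! Returns true if the DuckDB type can be represented in Parquet, optionally filling in the Parquet type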
    static bool TryGetParquetType(const LogicalType &duckdb_type,
                                  optional_ptr<duckdb_parquet::Type::type> type = nullptr);
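    //! Buffer a bloom filter for the given column so that it can be written to the file later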
    void BufferBloomFilter(idx_t col_idx, unique_ptr<ParquetBloomFilter> bloom_filter);
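    //! Set the statistics object into which statistics on the written file are gathered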
    void SetWrittenStatistics(CopyFunctionFileStatistics &written_stats);
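    //! Record the statistics of a written column chunk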
    void FlushColumnStats(idx_t col_idx, duckdb_parquet::ColumnChunk &chunk,
                          optional_ptr<ColumnWriterStatistics> writer_stats);

private:
    void GatherWrittenStatistics();

private:
    ClientContext &context;
    string file_name;
    vector<LogicalType> sql_types;
    vector<string> column_names;
    duckdb_parquet::CompressionCodec::type codec;
    ChildFieldIDs field_ids;
    ShreddingType shredding_types;
    shared_ptr<ParquetEncryptionConfig> encryption_config;
    optional_idx dictionary_size_limit;
    idx_t string_dictionary_page_size_limit;
    bool enable_bloom_filters;
    double bloom_filter_false_positive_ratio;
    int64_t compression_level;
    bool debug_use_openssl;
    shared_ptr<EncryptionUtil> encryption_util;
    ParquetVersion parquet_version;
    GeoParquetVersion geoparquet_version;
    vector<ParquetColumnSchema> column_schemas;
    unique_ptr<BufferedFileWriter> writer;
    //! Atomics to reduce contention when rotating writes to multiple Parquet files
    atomic<idx_t> total_written;
    atomic<idx_t> num_row_groups;
    std::shared_ptr<duckdb_apache::thrift::protocol::TProtocol> protocol;
    duckdb_parquet::FileMetaData file_meta_data;
    std::mutex lock;
    vector<unique_ptr<ColumnWriter>> column_writers;
    unique_ptr<GeoParquetFileMetadata> geoparquet_data;
    vector<ParquetBloomFilterEntry> bloom_filters;
    optional_ptr<CopyFunctionFileStatistics> written_stats;
    unique_ptr<ParquetStatsAccumulator> stats_accumulator;
};

} // namespace duckdb
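
// A minimal usage sketch (illustrative only, not part of this header). It assumes a fully
// constructed ParquetWriter "writer" and a populated ColumnDataCollection "buffer"; error
// handling is omitted.
//
//     writer.Flush(buffer);   // encode "buffer" and write it out as one row group
//     writer.Finalize();      // write the footer (file metadata) and finish the file
//
// Alternatively, PrepareRowGroup can encode row groups into PreparedRowGroup objects on
// separate threads, after which FlushRowGroup serializes each one to the file.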