#include "parquet_multi_file_info.hpp" #include "duckdb/common/multi_file/multi_file_function.hpp" #include "duckdb/parser/parsed_data/create_table_function_info.hpp" #include "duckdb/common/serializer/serializer.hpp" #include "duckdb/common/serializer/deserializer.hpp" #include "parquet_crypto.hpp" #include "duckdb/function/table_function.hpp" namespace duckdb { struct ParquetReadBindData : public TableFunctionData { // These come from the initial_reader, but need to be stored in case the initial_reader is removed by a filter idx_t initial_file_cardinality; idx_t initial_file_row_groups; idx_t explicit_cardinality = 0; // can be set to inject exterior cardinality knowledge (e.g. from a data lake) unique_ptr options; ParquetOptions &GetParquetOptions() { return options->options; } const ParquetOptions &GetParquetOptions() const { return options->options; } unique_ptr Copy() const override { auto result = make_uniq(); result->initial_file_cardinality = initial_file_cardinality; result->initial_file_row_groups = initial_file_row_groups; result->explicit_cardinality = explicit_cardinality; result->options = make_uniq(options->options); return std::move(result); } }; struct ParquetReadGlobalState : public GlobalTableFunctionState { explicit ParquetReadGlobalState(optional_ptr op_p) : row_group_index(0), batch_index(0), op(op_p) { } //! Index of row group within file currently up for scanning idx_t row_group_index; //! Batch index of the next row group to be scanned idx_t batch_index; //! (Optional) pointer to physical operator performing the scan optional_ptr op; }; struct ParquetReadLocalState : public LocalTableFunctionState { ParquetReaderScanState scan_state; }; static void ParseFileRowNumberOption(MultiFileReaderBindData &bind_data, ParquetOptions &options, vector &return_types, vector &names) { if (options.file_row_number) { if (StringUtil::CIFind(names, "file_row_number") != DConstants::INVALID_INDEX) { throw BinderException( "Using file_row_number option on file with column named file_row_number is not supported"); } return_types.emplace_back(LogicalType::BIGINT); names.emplace_back("file_row_number"); } } static void BindSchema(ClientContext &context, vector &return_types, vector &names, MultiFileBindData &bind_data) { auto &parquet_bind = bind_data.bind_data->Cast(); auto &options = parquet_bind.GetParquetOptions(); D_ASSERT(!options.schema.empty()); auto &file_options = bind_data.file_options; if (file_options.union_by_name || file_options.hive_partitioning) { throw BinderException("Parquet schema cannot be combined with union_by_name=true or hive_partitioning=true"); } auto &reader_bind = bind_data.reader_bind; vector schema_col_names; vector schema_col_types; schema_col_names.reserve(options.schema.size()); schema_col_types.reserve(options.schema.size()); bool match_by_field_id; if (!options.schema.empty()) { auto &column = options.schema[0]; if (column.identifier.type().id() == LogicalTypeId::INTEGER) { match_by_field_id = true; } else { match_by_field_id = false; } } else { match_by_field_id = false; } for (idx_t i = 0; i < options.schema.size(); i++) { const auto &column = options.schema[i]; schema_col_names.push_back(column.name); schema_col_types.push_back(column.type); auto res = MultiFileColumnDefinition(column.name, column.type); res.identifier = column.identifier; #ifdef DEBUG if (match_by_field_id) { D_ASSERT(res.identifier.type().id() == LogicalTypeId::INTEGER); } else { D_ASSERT(res.identifier.type().id() == LogicalTypeId::VARCHAR); } #endif res.default_expression = make_uniq(column.default_value); reader_bind.schema.emplace_back(res); } ParseFileRowNumberOption(reader_bind, options, return_types, names); if (options.file_row_number) { MultiFileColumnDefinition res("file_row_number", LogicalType::BIGINT); res.identifier = Value::INTEGER(MultiFileReader::ORDINAL_FIELD_ID); schema_col_names.push_back(res.name); schema_col_types.push_back(res.type); reader_bind.schema.emplace_back(res); } if (match_by_field_id) { reader_bind.mapping = MultiFileColumnMappingMode::BY_FIELD_ID; } else { reader_bind.mapping = MultiFileColumnMappingMode::BY_NAME; } // perform the binding on the obtained set of names + types bind_data.multi_file_reader->BindOptions(file_options, *bind_data.file_list, schema_col_types, schema_col_names, reader_bind); names = schema_col_names; return_types = schema_col_types; D_ASSERT(names.size() == return_types.size()); } unique_ptr ParquetMultiFileInfo::CreateInterface(ClientContext &context) { return make_uniq(); } void ParquetMultiFileInfo::BindReader(ClientContext &context, vector &return_types, vector &names, MultiFileBindData &bind_data) { auto &parquet_bind = bind_data.bind_data->Cast(); auto &options = parquet_bind.GetParquetOptions(); if (!options.schema.empty()) { BindSchema(context, return_types, names, bind_data); } else { bind_data.reader_bind = bind_data.multi_file_reader->BindReader(context, return_types, names, *bind_data.file_list, bind_data, *parquet_bind.options, bind_data.file_options); } } static bool GetBooleanArgument(const string &key, const vector &option_values) { if (option_values.empty()) { return true; } Value boolean_value; string error_message; if (!option_values[0].DefaultTryCastAs(LogicalType::BOOLEAN, boolean_value, &error_message)) { throw InvalidInputException("Unable to cast \"%s\" to BOOLEAN for Parquet option \"%s\"", option_values[0].ToString(), key); } return BooleanValue::Get(boolean_value); } static bool ParquetScanPushdownExpression(ClientContext &context, const LogicalGet &get, Expression &expr) { return true; } static void VerifyParquetSchemaParameter(const Value &schema) { LogicalType::MAP(LogicalType::BLOB, LogicalType::STRUCT({{{"name", LogicalType::VARCHAR}, {"type", LogicalType::VARCHAR}, {"default_value", LogicalType::VARCHAR}}})); auto &map_type = schema.type(); if (map_type.id() != LogicalTypeId::MAP) { throw InvalidInputException("'schema' expects a value of type MAP, not %s", LogicalTypeIdToString(map_type.id())); } auto &key_type = MapType::KeyType(map_type); auto &value_type = MapType::ValueType(map_type); if (value_type.id() != LogicalTypeId::STRUCT) { throw InvalidInputException("'schema' expects a STRUCT as the value type of the map"); } auto &children = StructType::GetChildTypes(value_type); if (children.size() < 3) { throw InvalidInputException( "'schema' expects the STRUCT to have 3 children, 'name', 'type' and 'default_value"); } if (!StringUtil::CIEquals(children[0].first, "name")) { throw InvalidInputException("'schema' expects the first field of the struct to be called 'name'"); } if (children[0].second.id() != LogicalTypeId::VARCHAR) { throw InvalidInputException("'schema' expects the 'name' field to be of type VARCHAR, not %s", LogicalTypeIdToString(children[0].second.id())); } if (!StringUtil::CIEquals(children[1].first, "type")) { throw InvalidInputException("'schema' expects the second field of the struct to be called 'type'"); } if (children[1].second.id() != LogicalTypeId::VARCHAR) { throw InvalidInputException("'schema' expects the 'type' field to be of type VARCHAR, not %s", LogicalTypeIdToString(children[1].second.id())); } if (!StringUtil::CIEquals(children[2].first, "default_value")) { throw InvalidInputException("'schema' expects the third field of the struct to be called 'default_value'"); } //! NOTE: default_value can be any type if (key_type.id() != LogicalTypeId::INTEGER && key_type.id() != LogicalTypeId::VARCHAR) { throw InvalidInputException( "'schema' expects the value type of the map to be either INTEGER or VARCHAR, not %s", LogicalTypeIdToString(key_type.id())); } } static void ParquetScanSerialize(Serializer &serializer, const optional_ptr bind_data_p, const TableFunction &function) { auto &bind_data = bind_data_p->Cast(); auto &parquet_data = bind_data.bind_data->Cast(); vector files; for (auto &file : bind_data.file_list->GetAllFiles()) { files.emplace_back(file.path); } serializer.WriteProperty(100, "files", files); serializer.WriteProperty(101, "types", bind_data.types); serializer.WriteProperty(102, "names", bind_data.names); ParquetOptionsSerialization serialization(parquet_data.GetParquetOptions(), bind_data.file_options); serializer.WriteProperty(103, "parquet_options", serialization); if (serializer.ShouldSerialize(3)) { serializer.WriteProperty(104, "table_columns", bind_data.table_columns); } } static unique_ptr ParquetScanDeserialize(Deserializer &deserializer, TableFunction &function) { auto &context = deserializer.Get(); auto files = deserializer.ReadProperty>(100, "files"); auto types = deserializer.ReadProperty>(101, "types"); auto names = deserializer.ReadProperty>(102, "names"); auto serialization = deserializer.ReadProperty(103, "parquet_options"); auto table_columns = deserializer.ReadPropertyWithExplicitDefault>(104, "table_columns", vector {}); vector file_path; for (auto &path : files) { file_path.emplace_back(path); } FileGlobInput input(FileGlobOptions::FALLBACK_GLOB, "parquet"); auto multi_file_reader = MultiFileReader::Create(function); auto file_list = multi_file_reader->CreateFileList(context, Value::LIST(LogicalType::VARCHAR, file_path), input); auto parquet_options = make_uniq(std::move(serialization.parquet_options)); auto interface = make_uniq(); auto bind_data = MultiFileFunction::MultiFileBindInternal( context, std::move(multi_file_reader), std::move(file_list), types, names, std::move(serialization.file_options), std::move(parquet_options), std::move(interface)); bind_data->Cast().table_columns = std::move(table_columns); return bind_data; } static vector ParquetGetRowIdColumns(ClientContext &context, optional_ptr bind_data) { vector result; result.emplace_back(MultiFileReader::COLUMN_IDENTIFIER_FILE_INDEX); result.emplace_back(MultiFileReader::COLUMN_IDENTIFIER_FILE_ROW_NUMBER); return result; } static vector ParquetGetPartitionStats(ClientContext &context, GetPartitionStatsInput &input) { auto &bind_data = input.bind_data->Cast(); vector result; if (bind_data.file_list->GetExpandResult() == FileExpandResult::SINGLE_FILE && bind_data.initial_reader) { // we have read the metadata - get the partitions for this reader auto &reader = bind_data.initial_reader->Cast(); reader.GetPartitionStats(result); return result; } // if we are reading multiple files - we check if we have caching enabled if (!ParquetReader::MetadataCacheEnabled(context)) { // no caching - bail return result; } // caching is enabled - check if we have ALL of the metadata cached vector> caches; for (auto &file : bind_data.file_list->Files()) { auto metadata_entry = ParquetReader::GetMetadataCacheEntry(context, file); if (!metadata_entry) { // no cache entry found return result; } // check if the file has any deletes if (file.extended_info) { auto entry = file.extended_info->options.find("has_deletes"); if (entry != file.extended_info->options.end()) { if (BooleanValue::Get(entry->second)) { // the file has deletes - skip emitting partition stats // FIXME: we could emit partition stats but set count to `COUNT_APPROXIMATE` instead of // `COUNT_EXACT` return result; } } } // check if the cache is valid based ONLY on the OpenFileInfo (do not do any file system requests here) auto is_valid = metadata_entry->IsValid(file); if (is_valid != ParquetCacheValidity::VALID) { return result; } caches.push_back(std::move(metadata_entry)); } // all caches are valid! we can return the partition stats for (auto &cache : caches) { ParquetReader::GetPartitionStats(*cache->metadata, result); } return result; } TableFunctionSet ParquetScanFunction::GetFunctionSet() { MultiFileFunction table_function("parquet_scan"); table_function.named_parameters["binary_as_string"] = LogicalType::BOOLEAN; table_function.named_parameters["file_row_number"] = LogicalType::BOOLEAN; table_function.named_parameters["debug_use_openssl"] = LogicalType::BOOLEAN; table_function.named_parameters["compression"] = LogicalType::VARCHAR; table_function.named_parameters["explicit_cardinality"] = LogicalType::UBIGINT; table_function.named_parameters["schema"] = LogicalTypeId::ANY; table_function.named_parameters["encryption_config"] = LogicalTypeId::ANY; table_function.named_parameters["parquet_version"] = LogicalType::VARCHAR; table_function.named_parameters["can_have_nan"] = LogicalType::BOOLEAN; table_function.statistics = MultiFileFunction::MultiFileScanStats; table_function.serialize = ParquetScanSerialize; table_function.deserialize = ParquetScanDeserialize; table_function.get_row_id_columns = ParquetGetRowIdColumns; table_function.pushdown_expression = ParquetScanPushdownExpression; table_function.get_partition_stats = ParquetGetPartitionStats; table_function.filter_pushdown = true; table_function.filter_prune = true; table_function.late_materialization = true; return MultiFileReader::CreateFunctionSet(static_cast(table_function)); } unique_ptr ParquetMultiFileInfo::InitializeOptions(ClientContext &context, optional_ptr info) { return make_uniq(context); } bool ParquetMultiFileInfo::ParseCopyOption(ClientContext &context, const string &key, const vector &values, BaseFileReaderOptions &file_options, vector &expected_names, vector &expected_types) { auto &parquet_options = file_options.Cast(); auto &options = parquet_options.options; if (key == "compression" || key == "codec" || key == "row_group_size") { // CODEC/COMPRESSION and ROW_GROUP_SIZE options have no effect on parquet read. // These options are determined from the file. return true; } if (key == "binary_as_string") { options.binary_as_string = GetBooleanArgument(key, values); return true; } if (key == "file_row_number") { options.file_row_number = GetBooleanArgument(key, values); return true; } if (key == "debug_use_openssl") { options.debug_use_openssl = GetBooleanArgument(key, values); return true; } if (key == "encryption_config") { if (values.size() != 1) { throw BinderException("Parquet encryption_config cannot be empty!"); } options.encryption_config = ParquetEncryptionConfig::Create(context, values[0]); return true; } if (key == "can_have_nan") { if (values.size() != 1) { throw BinderException("Parquet can_have_nan cannot be empty!"); } options.can_have_nan = GetBooleanArgument(key, values); return true; } return false; } bool ParquetMultiFileInfo::ParseOption(ClientContext &context, const string &original_key, const Value &val, MultiFileOptions &file_options, BaseFileReaderOptions &base_options) { auto &parquet_options = base_options.Cast(); auto &options = parquet_options.options; auto key = StringUtil::Lower(original_key); if (val.IsNull()) { throw BinderException("Cannot use NULL as argument to %s", original_key); } if (key == "compression") { // COMPRESSION has no effect on parquet read. // These options are determined from the file. return true; } if (key == "binary_as_string") { options.binary_as_string = BooleanValue::Get(val); return true; } if (key == "variant_legacy_encoding") { options.variant_legacy_encoding = BooleanValue::Get(val); return true; } if (key == "file_row_number") { options.file_row_number = BooleanValue::Get(val); return true; } if (key == "debug_use_openssl") { options.debug_use_openssl = BooleanValue::Get(val); return true; } if (key == "can_have_nan") { options.can_have_nan = BooleanValue::Get(val); return true; } if (key == "schema") { // Argument is a map that defines the schema const auto &schema_value = val; VerifyParquetSchemaParameter(schema_value); const auto column_values = ListValue::GetChildren(schema_value); if (column_values.empty()) { throw BinderException("Parquet schema cannot be empty"); } options.schema.reserve(column_values.size()); for (idx_t i = 0; i < column_values.size(); i++) { options.schema.emplace_back(ParquetColumnDefinition::FromSchemaValue(context, column_values[i])); } file_options.auto_detect_hive_partitioning = false; return true; } if (key == "explicit_cardinality") { options.explicit_cardinality = UBigIntValue::Get(val); return true; } if (key == "encryption_config") { options.encryption_config = ParquetEncryptionConfig::Create(context, val); return true; } return false; } unique_ptr ParquetMultiFileInfo::InitializeBindData(MultiFileBindData &multi_file_data, unique_ptr options_p) { auto result = make_uniq(); // Set the explicit cardinality if requested result->options = unique_ptr_cast(std::move(options_p)); auto &parquet_options = result->GetParquetOptions(); if (parquet_options.explicit_cardinality) { auto file_count = multi_file_data.file_list->GetTotalFileCount(); result->explicit_cardinality = parquet_options.explicit_cardinality; result->initial_file_cardinality = result->explicit_cardinality / (file_count ? file_count : 1); } return std::move(result); } void ParquetMultiFileInfo::GetBindInfo(const TableFunctionData &bind_data_p, BindInfo &info) { auto &bind_data = bind_data_p.Cast(); auto &parquet_options = bind_data.GetParquetOptions(); info.type = ScanType::PARQUET; info.InsertOption("binary_as_string", Value::BOOLEAN(parquet_options.binary_as_string)); info.InsertOption("file_row_number", Value::BOOLEAN(parquet_options.file_row_number)); info.InsertOption("debug_use_openssl", Value::BOOLEAN(parquet_options.debug_use_openssl)); } optional_idx ParquetMultiFileInfo::MaxThreads(const MultiFileBindData &bind_data_p, const MultiFileGlobalState &global_state, FileExpandResult expand_result) { if (expand_result == FileExpandResult::MULTIPLE_FILES) { // always launch max threads if we are reading multiple files return optional_idx(); } auto &bind_data = bind_data_p.bind_data->Cast(); return MaxValue(bind_data.initial_file_row_groups, static_cast(1)); } void ParquetMultiFileInfo::FinalizeBindData(MultiFileBindData &multi_file_data) { auto &bind_data = multi_file_data.bind_data->Cast(); if (multi_file_data.initial_reader) { auto &initial_reader = multi_file_data.initial_reader->Cast(); bind_data.initial_file_cardinality = initial_reader.NumRows(); bind_data.initial_file_row_groups = initial_reader.NumRowGroups(); bind_data.options->options = initial_reader.parquet_options; } } unique_ptr ParquetMultiFileInfo::GetCardinality(const MultiFileBindData &bind_data_p, idx_t file_count) { auto &bind_data = bind_data_p.bind_data->Cast(); if (bind_data.explicit_cardinality) { return make_uniq(bind_data.explicit_cardinality); } return make_uniq(MaxValue(bind_data.initial_file_cardinality, (idx_t)1) * file_count); } unique_ptr ParquetReader::GetStatistics(ClientContext &context, const string &name) { return ReadStatistics(name); } double ParquetReader::GetProgressInFile(ClientContext &context) { auto read_rows = rows_read.load(); return 100.0 * (static_cast(read_rows) / static_cast(NumRows())); } void ParquetMultiFileInfo::GetVirtualColumns(ClientContext &, MultiFileBindData &, virtual_column_map_t &result) { result.insert(make_pair(MultiFileReader::COLUMN_IDENTIFIER_FILE_ROW_NUMBER, TableColumn("file_row_number", LogicalType::BIGINT))); } shared_ptr ParquetMultiFileInfo::CreateReader(ClientContext &context, GlobalTableFunctionState &, BaseUnionData &union_data_p, const MultiFileBindData &bind_data_p) { auto &union_data = union_data_p.Cast(); return make_shared_ptr(context, union_data.file, union_data.options, union_data.metadata); } shared_ptr ParquetMultiFileInfo::CreateReader(ClientContext &context, GlobalTableFunctionState &, const OpenFileInfo &file, idx_t file_idx, const MultiFileBindData &multi_bind_data) { auto &bind_data = multi_bind_data.bind_data->Cast(); return make_shared_ptr(context, file, bind_data.GetParquetOptions()); } shared_ptr ParquetMultiFileInfo::CreateReader(ClientContext &context, const OpenFileInfo &file, BaseFileReaderOptions &options_p, const MultiFileOptions &) { auto &options = options_p.Cast(); return make_shared_ptr(context, file, options.options); } shared_ptr ParquetReader::GetUnionData(idx_t file_idx) { auto result = make_uniq(file); for (auto &column : columns) { result->names.push_back(column.name); result->types.push_back(column.type); } if (file_idx == 0) { result->options = parquet_options; result->metadata = metadata; result->reader = shared_from_this(); } else { result->options = std::move(parquet_options); result->metadata = std::move(metadata); result->root_schema = std::move(root_schema); } return std::move(result); } unique_ptr ParquetMultiFileInfo::InitializeGlobalState(ClientContext &, MultiFileBindData &, MultiFileGlobalState &global_state) { return make_uniq(global_state.op); } unique_ptr ParquetMultiFileInfo::InitializeLocalState(ExecutionContext &, GlobalTableFunctionState &) { return make_uniq(); } bool ParquetReader::TryInitializeScan(ClientContext &context, GlobalTableFunctionState &gstate_p, LocalTableFunctionState &lstate_p) { auto &gstate = gstate_p.Cast(); auto &lstate = lstate_p.Cast(); if (gstate.row_group_index >= NumRowGroups()) { // scanned all row groups in this file return false; } // The current reader has rowgroups left to be scanned vector group_indexes {gstate.row_group_index}; InitializeScan(context, lstate.scan_state, group_indexes); gstate.row_group_index++; return true; } void ParquetReader::FinishFile(ClientContext &context, GlobalTableFunctionState &gstate_p) { auto &gstate = gstate_p.Cast(); gstate.row_group_index = 0; } void ParquetReader::Scan(ClientContext &context, GlobalTableFunctionState &gstate_p, LocalTableFunctionState &local_state_p, DataChunk &chunk) { auto &gstate = gstate_p.Cast(); auto &local_state = local_state_p.Cast(); local_state.scan_state.op = gstate.op; Scan(context, local_state.scan_state, chunk); } unique_ptr ParquetMultiFileInfo::Copy() { return make_uniq(); } FileGlobInput ParquetMultiFileInfo::GetGlobInput() { return FileGlobInput(FileGlobOptions::FALLBACK_GLOB, "parquet"); } } // namespace duckdb