Files
email-tracker/external/duckdb/extension/parquet/parquet_field_id.cpp
2025-10-24 19:21:19 -05:00

181 lines
6.6 KiB
C++

#include "parquet_field_id.hpp"
#include "duckdb/common/exception/binder_exception.hpp"
namespace duckdb {
// Namespace-scope definition of the static constexpr member FieldID::DUCKDB_FIELD_ID. It carries no initializer
// here, so the value must come from the in-class declaration (presumably in parquet_field_id.hpp - not visible
// from this file). Before C++17 such a definition is required whenever the member is odr-used; from C++17 on the
// member is implicitly inline and this line is redundant but harmless.
constexpr const char *FieldID::DUCKDB_FIELD_ID;
// Constructs an empty collection of child field IDs: "ids" is initialized with a freshly allocated, empty
// case-insensitive map from name to FieldID, so it can be dereferenced right after construction.
ChildFieldIDs::ChildFieldIDs() : ids(make_uniq<case_insensitive_map_t<FieldID>>()) {
}
// Produces a deep copy of this collection: every (name, FieldID) entry is duplicated through FieldID::Copy, so
// the returned object shares no map storage with the original.
ChildFieldIDs ChildFieldIDs::Copy() const {
	ChildFieldIDs duplicate;
	const auto &source_entries = *ids;
	for (const auto &entry : source_entries) {
		const auto &name = entry.first;
		const auto &child = entry.second;
		duplicate.ids->emplace(name, child.Copy());
	}
	return duplicate;
}
// Constructs a FieldID that carries no explicit id: "set" is false. Only "set" is initialized here; whatever the
// other members hold afterwards depends on their declarations, which are not visible in this file.
FieldID::FieldID() : set(false) {
}
// Constructs a FieldID holding the explicit id "field_id_p": "set" becomes true and "field_id" stores the value.
FieldID::FieldID(int32_t field_id_p) : set(true), field_id(field_id_p) {
}
// Produces a deep copy of this FieldID. The copy is built with the explicit id when one is set, or
// default-constructed otherwise, and in both cases receives a deep copy of the child field IDs.
FieldID FieldID::Copy() const {
	if (!set) {
		FieldID unset_copy;
		unset_copy.child_field_ids = child_field_ids.Copy();
		return unset_copy;
	}
	FieldID set_copy(field_id);
	set_copy.child_field_ids = child_field_ids.Copy();
	return set_copy;
}
// Builds a case-insensitive lookup from child name to child type for a nested type:
//   LIST   -> "element"
//   MAP    -> "key" and "value"
//   STRUCT -> one entry per struct child
// Throws a BinderException when a struct child is named exactly FieldID::DUCKDB_FIELD_ID, and an
// InternalException when "type" is not LIST, MAP or STRUCT.
static case_insensitive_map_t<LogicalType> GetChildNameToTypeMap(const LogicalType &type) {
	case_insensitive_map_t<LogicalType> children_by_name;
	const auto type_id = type.id();
	if (type_id == LogicalTypeId::LIST) {
		children_by_name.emplace("element", ListType::GetChildType(type));
		return children_by_name;
	}
	if (type_id == LogicalTypeId::MAP) {
		children_by_name.emplace("key", MapType::KeyType(type));
		children_by_name.emplace("value", MapType::ValueType(type));
		return children_by_name;
	}
	if (type_id != LogicalTypeId::STRUCT) { // LCOV_EXCL_START
		throw InternalException("Unexpected type in GetChildNameToTypeMap");
	} // LCOV_EXCL_STOP
	for (auto &struct_child : StructType::GetChildTypes(type)) {
		// The reserved key would be indistinguishable from the field id entry inside a FIELD_IDS struct
		if (struct_child.first == FieldID::DUCKDB_FIELD_ID) {
			throw BinderException("Cannot have column named \"%s\" with FIELD_IDS", FieldID::DUCKDB_FIELD_ID);
		}
		children_by_name.emplace(struct_child);
	}
	return children_by_name;
}
// Appends the child names and child types of a nested type to the two output vectors, in declaration order:
//   LIST   -> "element"
//   MAP    -> "key", then "value"
//   STRUCT -> every struct child, in order
// Entries are appended; the vectors are not cleared first. Throws an InternalException when "type" is not LIST,
// MAP or STRUCT.
static void GetChildNamesAndTypes(const LogicalType &type, vector<string> &child_names,
                                  vector<LogicalType> &child_types) {
	const auto type_id = type.id();
	if (type_id == LogicalTypeId::LIST) {
		child_names.emplace_back("element");
		child_types.emplace_back(ListType::GetChildType(type));
		return;
	}
	if (type_id == LogicalTypeId::MAP) {
		child_names.emplace_back("key");
		child_types.emplace_back(MapType::KeyType(type));
		child_names.emplace_back("value");
		child_types.emplace_back(MapType::ValueType(type));
		return;
	}
	if (type_id != LogicalTypeId::STRUCT) { // LCOV_EXCL_START
		throw InternalException("Unexpected type in GetChildNamesAndTypes");
	} // LCOV_EXCL_STOP
	for (auto &struct_child : StructType::GetChildTypes(type)) {
		child_names.emplace_back(struct_child.first);
		child_types.emplace_back(struct_child.second);
	}
}
// Assigns consecutive field ids to the given columns, depth-first: a column receives the current value of
// "field_id" (which is then incremented), and if the column is a LIST, MAP or STRUCT its children are numbered
// recursively before the next sibling column.
//   field_ids - receives one entry per column name; nested children go into that entry's child_field_ids
//   field_id  - running counter, advanced in place so that callers see the next free id
//   names     - column names; must have the same length as sql_types
//   sql_types - column types, parallel to names
void FieldID::GenerateFieldIDs(ChildFieldIDs &field_ids, idx_t &field_id, const vector<string> &names,
                               const vector<LogicalType> &sql_types) {
	D_ASSERT(names.size() == sql_types.size());
	const idx_t column_count = names.size();
	for (idx_t i = 0; i < column_count; i++) {
		const auto assigned_id = UnsafeNumericCast<int32_t>(field_id++);
		auto insert_result = field_ids.ids->insert(make_pair(names[i], FieldID(assigned_id)));
		D_ASSERT(insert_result.second);

		const auto &type = sql_types[i];
		bool has_children = false;
		switch (type.id()) {
		case LogicalTypeId::LIST:
		case LogicalTypeId::MAP:
		case LogicalTypeId::STRUCT:
			has_children = true;
			break;
		default:
			break;
		}
		if (!has_children) {
			continue;
		}

		// Cannot use GetChildNameToTypeMap here because we lose order, and we want to generate depth-first
		vector<string> nested_names;
		vector<LogicalType> nested_types;
		GetChildNamesAndTypes(type, nested_names, nested_types);
		auto &nested_ids = insert_result.first->second.child_field_ids;
		GenerateFieldIDs(nested_ids, field_id, nested_names, nested_types);
	}
}
// Translates a user-supplied FIELD_IDS struct value into "field_ids", recursing into nested specifications.
//
// Each key of "field_ids_value" names a column (matched case-insensitively against "name_to_type_map"). Its value
// is either
//   - a plain value, which is cast to INTEGER and used as the column's field id, or
//   - a struct, in which the reserved key FieldID::DUCKDB_FIELD_ID (if present) holds the column's own field id
//     and every other key describes a nested child of the column.
//
//   field_ids_value  - the FIELD_IDS value for this nesting level; must be a STRUCT
//   field_ids        - receives one entry per specified column
//   unique_field_ids - ids seen so far across the whole specification; used to reject duplicates
//   name_to_type_map - the columns (or nested children) that may be referenced at this level
//
// Throws a BinderException when the value is not a STRUCT, when a key does not match any known column, when a
// field id is NULL or duplicated, or when a nested specification is given for a column that is not a LIST, MAP
// or STRUCT.
void FieldID::GetFieldIDs(const Value &field_ids_value, ChildFieldIDs &field_ids,
                          unordered_set<uint32_t> &unique_field_ids,
                          const case_insensitive_map_t<LogicalType> &name_to_type_map) {
	const auto &struct_type = field_ids_value.type();
	if (struct_type.id() != LogicalTypeId::STRUCT) {
		throw BinderException(
		    "Expected FIELD_IDS to be a STRUCT, e.g., {col1: 42, col2: {%s: 43, nested_col: 44}, col3: 44}",
		    FieldID::DUCKDB_FIELD_ID);
	}
	const auto &struct_children = StructValue::GetChildren(field_ids_value);
	D_ASSERT(StructType::GetChildTypes(struct_type).size() == struct_children.size());
	for (idx_t i = 0; i < struct_children.size(); i++) {
		const auto &col_name = StringUtil::Lower(StructType::GetChildName(struct_type, i));
		if (col_name == FieldID::DUCKDB_FIELD_ID) {
			// The reserved key holds the parent's own field id, which the parent level already consumed
			continue;
		}

		auto it = name_to_type_map.find(col_name);
		if (it == name_to_type_map.end()) {
			string names;
			for (const auto &name : name_to_type_map) {
				if (!names.empty()) {
					names += ", ";
				}
				names += name.first;
			}
			throw BinderException(
			    "Column name \"%s\" specified in FIELD_IDS not found. Consider using WRITE_PARTITION_COLUMNS if this "
			    "column is a partition column. Available column names: [%s]",
			    col_name, names);
		}
		D_ASSERT(field_ids.ids->find(col_name) == field_ids.ids->end()); // Caught by STRUCT - deduplicates keys

		const auto &child_value = struct_children[i];
		const auto &child_type = child_value.type();

		optional_ptr<const Value> field_id_value;
		optional_ptr<const Value> child_field_ids_value;

		if (child_type.id() == LogicalTypeId::STRUCT) {
			const auto &nested_children = StructValue::GetChildren(child_value);
			D_ASSERT(StructType::GetChildTypes(child_type).size() == nested_children.size());
			for (idx_t nested_i = 0; nested_i < nested_children.size(); nested_i++) {
				// Lowercase before comparing, exactly like the skip check at the top of this loop. With a
				// case-sensitive comparison a key spelled e.g. "__DUCKDB_FIELD_ID" would be classified as a nested
				// column here and then skipped as the reserved key by the recursive call, silently dropping the id.
				const auto &field_id_or_nested_col =
				    StringUtil::Lower(StructType::GetChildName(child_type, nested_i));
				if (field_id_or_nested_col == FieldID::DUCKDB_FIELD_ID) {
					field_id_value = &nested_children[nested_i];
				} else {
					// Any other key means the whole struct must be processed recursively for this column
					child_field_ids_value = &child_value;
				}
			}
		} else {
			field_id_value = &child_value;
		}

		FieldID field_id;
		if (field_id_value) {
			Value field_id_integer_value = field_id_value->DefaultCastAs(LogicalType::INTEGER);
			if (field_id_integer_value.IsNull()) {
				// A NULL carries no integer payload to read, so reject it instead of storing an arbitrary id
				throw BinderException("Field id for column \"%s\" in FIELD_IDS cannot be NULL", col_name);
			}
			// Stored as uint32_t only for the uniqueness set; the cast back to int32_t below round-trips the value
			const uint32_t field_id_int = IntegerValue::Get(field_id_integer_value);
			if (!unique_field_ids.insert(field_id_int).second) {
				throw BinderException("Duplicate field_id %s found in FIELD_IDS", field_id_integer_value.ToString());
			}
			field_id = FieldID(UnsafeNumericCast<int32_t>(field_id_int));
		}
		auto inserted = field_ids.ids->insert(make_pair(col_name, std::move(field_id)));
		D_ASSERT(inserted.second);

		if (child_field_ids_value) {
			const auto &col_type = it->second;
			if (col_type.id() != LogicalTypeId::LIST && col_type.id() != LogicalTypeId::MAP &&
			    col_type.id() != LogicalTypeId::STRUCT) {
				throw BinderException("Column \"%s\" with type \"%s\" cannot have a nested FIELD_IDS specification",
				                      col_name, LogicalTypeIdToString(col_type.id()));
			}
			GetFieldIDs(*child_field_ids_value, inserted.first->second.child_field_ids, unique_field_ids,
			            GetChildNameToTypeMap(col_type));
		}
	}
}
} // namespace duckdb