should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

View File

@@ -0,0 +1,25 @@
# Sources of the JSON extension's scalar/table/copy functions, built as a
# unity-build OBJECT library (objects are linked into the parent target).
add_library_unity(
  duckdb_json_functions
  OBJECT
  copy_json.cpp
  json_array_length.cpp
  json_contains.cpp
  json_create.cpp
  json_exists.cpp
  json_extract.cpp
  json_keys.cpp
  json_merge_patch.cpp
  json_pretty.cpp
  json_structure.cpp
  json_table_in_out.cpp
  json_transform.cpp
  json_type.cpp
  json_valid.cpp
  json_value.cpp
  json_serialize_plan.cpp
  json_serialize_sql.cpp
  read_json.cpp
  read_json_objects.cpp)
# Append this library's object files to the accumulated list in the parent scope.
set(ALL_OBJECT_FILES
    ${ALL_OBJECT_FILES} $<TARGET_OBJECTS:duckdb_json_functions>
    PARENT_SCOPE)

View File

@@ -0,0 +1,133 @@
#include "duckdb/function/copy_function.hpp"
#include "duckdb/parser/expression/constant_expression.hpp"
#include "duckdb/parser/expression/function_expression.hpp"
#include "duckdb/parser/expression/positional_reference_expression.hpp"
#include "duckdb/parser/query_node/select_node.hpp"
#include "duckdb/parser/tableref/subqueryref.hpp"
#include "duckdb/planner/binder.hpp"
#include "duckdb/common/helper.hpp"
#include "json_functions.hpp"
#include "json_scan.hpp"
#include "json_transform.hpp"
#include "json_multi_file_info.hpp"
namespace duckdb {

// Raise a consistent binder error for COPY (FORMAT JSON) options that expect exactly one argument.
static void ThrowJSONCopyParameterException(const string &loption) {
	throw BinderException("COPY (FORMAT JSON) parameter %s expects a single argument.", loption);
}

// Plan COPY ... TO ... (FORMAT JSON) by rewriting it into a COPY that uses the CSV writer:
// each row is struct_pack'ed and converted with to_json, then written with quoting/escaping
// disabled, '\n' as the delimiter, and no header.
static BoundStatement CopyToJSONPlan(Binder &binder, CopyStatement &stmt) {
	// Base COPY options that are forwarded unchanged to the underlying CSV writer.
	static const unordered_set<string> SUPPORTED_BASE_OPTIONS {
	    "compression", "encoding", "use_tmp_file", "overwrite_or_ignore", "overwrite", "append", "filename_pattern",
	    "file_extension", "per_thread_output", "file_size_bytes",
	    // "partition_by", unsupported
	    "return_files", "preserve_order", "return_stats", "write_partition_columns", "write_empty_file",
	    "hive_file_pattern"};
	// Work on a copy so the original statement stays intact for the type-resolving bind below.
	auto stmt_copy = stmt.Copy();
	auto &copy = stmt_copy->Cast<CopyStatement>();
	auto &copied_info = *copy.info;
	// Parse the options, creating options for the CSV writer while doing so
	string date_format;
	string timestamp_format;
	// We insert the JSON file extension here so it works properly with PER_THREAD_OUTPUT/FILE_SIZE_BYTES etc.
	case_insensitive_map_t<vector<Value>> csv_copy_options {{"file_extension", {"json"}}};
	for (const auto &kv : copied_info.options) {
		const auto &loption = StringUtil::Lower(kv.first);
		if (loption == "dateformat" || loption == "date_format") {
			if (kv.second.size() != 1) {
				ThrowJSONCopyParameterException(loption);
			}
			date_format = StringValue::Get(kv.second.back());
		} else if (loption == "timestampformat" || loption == "timestamp_format") {
			if (kv.second.size() != 1) {
				ThrowJSONCopyParameterException(loption);
			}
			timestamp_format = StringValue::Get(kv.second.back());
		} else if (loption == "array") {
			if (kv.second.size() > 1) {
				ThrowJSONCopyParameterException(loption);
			}
			// ARRAY defaults to true when given without an argument; it wraps all rows in a
			// single JSON array via the CSV writer's prefix/suffix/new_line options.
			if (kv.second.empty() || BooleanValue::Get(kv.second.back().DefaultCastAs(LogicalTypeId::BOOLEAN))) {
				csv_copy_options["prefix"] = {"[\n\t"};
				csv_copy_options["suffix"] = {"\n]\n"};
				csv_copy_options["new_line"] = {",\n\t"};
			}
		} else if (SUPPORTED_BASE_OPTIONS.find(loption) != SUPPORTED_BASE_OPTIONS.end()) {
			// We support these base options
			csv_copy_options.insert(kv);
		} else {
			throw BinderException("Unknown option for COPY ... TO ... (FORMAT JSON): \"%s\".", loption);
		}
	}
	// Bind the select statement of the original to resolve the types
	auto dummy_binder = Binder::CreateBinder(binder.context, &binder);
	auto bound_original = dummy_binder->Bind(*stmt.info->select_statement);
	// Create new SelectNode with the original SelectNode as a subquery in the FROM clause
	auto select_stmt = make_uniq<SelectStatement>();
	select_stmt->node = std::move(copied_info.select_statement);
	auto subquery_ref = make_uniq<SubqueryRef>(std::move(select_stmt));
	copied_info.select_statement = make_uniq_base<QueryNode, SelectNode>();
	auto &select_node = copied_info.select_statement->Cast<SelectNode>();
	select_node.from_table = std::move(subquery_ref);
	// Create new select list
	vector<unique_ptr<ParsedExpression>> select_list;
	select_list.reserve(bound_original.types.size());
	// strftime if the user specified a format (loop also gives columns a name, needed for struct_pack)
	// TODO: deal with date/timestamp within nested types
	vector<unique_ptr<ParsedExpression>> strftime_children;
	for (idx_t col_idx = 0; col_idx < bound_original.types.size(); col_idx++) {
		// 1-based positional reference into the subquery's columns
		auto column = make_uniq_base<ParsedExpression, PositionalReferenceExpression>(col_idx + 1);
		strftime_children = vector<unique_ptr<ParsedExpression>>();
		const auto &type = bound_original.types[col_idx];
		const auto &name = bound_original.names[col_idx];
		if (!date_format.empty() && type == LogicalTypeId::DATE) {
			strftime_children.emplace_back(std::move(column));
			strftime_children.emplace_back(make_uniq<ConstantExpression>(date_format));
			column = make_uniq<FunctionExpression>("strftime", std::move(strftime_children));
		} else if (!timestamp_format.empty() && type == LogicalTypeId::TIMESTAMP) {
			strftime_children.emplace_back(std::move(column));
			strftime_children.emplace_back(make_uniq<ConstantExpression>(timestamp_format));
			column = make_uniq<FunctionExpression>("strftime", std::move(strftime_children));
		}
		// The alias preserves the original column name so struct_pack emits the right keys
		column->SetAlias(name);
		select_list.emplace_back(std::move(column));
	}
	// Now create the struct_pack/to_json to create a JSON object per row
	vector<unique_ptr<ParsedExpression>> struct_pack_child;
	struct_pack_child.emplace_back(make_uniq<FunctionExpression>("struct_pack", std::move(select_list)));
	select_node.select_list.emplace_back(make_uniq<FunctionExpression>("to_json", std::move(struct_pack_child)));
	// Now we can just use the CSV writer
	copied_info.format = "csv";
	copied_info.options = std::move(csv_copy_options);
	// Disable quoting/escaping and write one JSON value per line, without a header
	copied_info.options["quote"] = {""};
	copied_info.options["escape"] = {""};
	copied_info.options["delimiter"] = {"\n"};
	copied_info.options["header"] = {{0}};
	return binder.Bind(*stmt_copy);
}

// Build the COPY function for FORMAT JSON: writing goes through CopyToJSONPlan above,
// reading re-uses the read_json table function via the multi-file machinery.
CopyFunction JSONFunctions::GetJSONCopyFunction() {
	CopyFunction function("json");
	function.extension = "json";
	function.plan = CopyToJSONPlan;
	function.copy_from_bind = MultiFileFunction<JSONMultiFileInfo>::MultiFileBindCopy;
	function.copy_from_function = JSONFunctions::GetReadJSONTableFunction(make_shared_ptr<JSONScanInfo>(
	    JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT, JSONRecordType::RECORDS, false));
	return function;
}

} // namespace duckdb

View File

@@ -0,0 +1,38 @@
#include "json_executors.hpp"
namespace duckdb {

// Extract the element count of a JSON value (yyjson_arr_size returns 0 for non-arrays).
// The unused parameters match the signature expected by the JSONExecutors templates.
static inline uint64_t GetArrayLength(yyjson_val *val, yyjson_alc *, Vector &, ValidityMask &, idx_t) {
	return yyjson_arr_size(val);
}

// json_array_length(json): length of the top-level value.
static void UnaryArrayLengthFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::UnaryExecute<uint64_t>(args, state, result, GetArrayLength);
}

// json_array_length(json, path): length of the value at a single path.
static void BinaryArrayLengthFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::BinaryExecute<uint64_t>(args, state, result, GetArrayLength);
}

// json_array_length(json, [paths]): a LIST of lengths, one per path.
static void ManyArrayLengthFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::ExecuteMany<uint64_t>(args, state, result, GetArrayLength);
}

// Register the three overloads (no path, single VARCHAR path, LIST of paths) for one input type.
static void GetArrayLengthFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	set.AddFunction(ScalarFunction({input_type}, LogicalType::UBIGINT, UnaryArrayLengthFunction, nullptr, nullptr,
	                               nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::UBIGINT, BinaryArrayLengthFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
	                               LogicalType::LIST(LogicalType::UBIGINT), ManyArrayLengthFunction,
	                               JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}

// Build the json_array_length function set for both VARCHAR and JSON inputs.
ScalarFunctionSet JSONFunctions::GetArrayLengthFunction() {
	ScalarFunctionSet set("json_array_length");
	GetArrayLengthFunctionsInternal(set, LogicalType::VARCHAR);
	GetArrayLengthFunctionsInternal(set, LogicalType::JSON());
	return set;
}

} // namespace duckdb

View File

@@ -0,0 +1,155 @@
#include "json_executors.hpp"
namespace duckdb {

// Forward declarations: fuzzy equality and containment are mutually recursive.
static inline bool JSONContains(yyjson_val *haystack, yyjson_val *needle);
static inline bool JSONFuzzyEquals(yyjson_val *haystack, yyjson_val *needle);

// Arrays fuzzily match when every element of "needle" fuzzily equals SOME element of "haystack"
// (order-insensitive; multiple needle elements may match the same haystack element).
static inline bool JSONArrayFuzzyEquals(yyjson_val *haystack, yyjson_val *needle) {
	D_ASSERT(yyjson_get_tag(haystack) == (YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE) &&
	         yyjson_get_tag(needle) == (YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE));
	size_t needle_idx, needle_max, haystack_idx, haystack_max;
	yyjson_val *needle_child, *haystack_child;
	yyjson_arr_foreach(needle, needle_idx, needle_max, needle_child) {
		bool found = false;
		yyjson_arr_foreach(haystack, haystack_idx, haystack_max, haystack_child) {
			if (JSONFuzzyEquals(haystack_child, needle_child)) {
				found = true;
				break;
			}
		}
		if (!found) {
			return false;
		}
	}
	return true;
}

// Objects fuzzily match when every key of "needle" exists in "haystack" with a fuzzily-equal value.
static inline bool JSONObjectFuzzyEquals(yyjson_val *haystack, yyjson_val *needle) {
	D_ASSERT(yyjson_get_tag(haystack) == (YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE) &&
	         yyjson_get_tag(needle) == (YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE));
	size_t idx, max;
	yyjson_val *key, *needle_child;
	yyjson_obj_foreach(needle, idx, max, key, needle_child) {
		auto haystack_child = yyjson_obj_getn(haystack, unsafe_yyjson_get_str(key), unsafe_yyjson_get_len(key));
		if (!haystack_child || !JSONFuzzyEquals(haystack_child, needle_child)) {
			return false;
		}
	}
	return true;
}

// "Fuzzy" equality: exact equality, or - for same-typed arrays/objects - the needle is
// structurally contained in the haystack at this level.
static inline bool JSONFuzzyEquals(yyjson_val *haystack, yyjson_val *needle) {
	D_ASSERT(haystack && needle);
	// Strict equality
	if (unsafe_yyjson_equals(haystack, needle)) {
		return true;
	}
	// NOTE(review): this reads the tag from "needle" but is named haystack_tag; the very next
	// check requires both tags to be equal, so behavior is unaffected - only the name misleads.
	auto haystack_tag = yyjson_get_tag(needle);
	if (haystack_tag != yyjson_get_tag(haystack)) {
		return false;
	}
	// Fuzzy equality (contained in)
	switch (haystack_tag) {
	case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
		return JSONArrayFuzzyEquals(haystack, needle);
	case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
		return JSONObjectFuzzyEquals(haystack, needle);
	default:
		return false;
	}
}

// True when any element of the haystack array contains the needle (recursively).
static inline bool JSONArrayContains(yyjson_val *haystack_array, yyjson_val *needle) {
	D_ASSERT(yyjson_get_tag(haystack_array) == (YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE));
	size_t idx, max;
	yyjson_val *child_haystack;
	yyjson_arr_foreach(haystack_array, idx, max, child_haystack) {
		if (JSONContains(child_haystack, needle)) {
			return true;
		}
	}
	return false;
}

// True when any VALUE of the haystack object contains the needle (recursively); keys are not matched.
static inline bool JSONObjectContains(yyjson_val *haystack_object, yyjson_val *needle) {
	D_ASSERT(yyjson_get_tag(haystack_object) == (YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE));
	size_t idx, max;
	// "key" is required by the foreach macro's signature but intentionally unused here
	yyjson_val *key, *child_haystack;
	yyjson_obj_foreach(haystack_object, idx, max, key, child_haystack) {
		if (JSONContains(child_haystack, needle)) {
			return true;
		}
	}
	return false;
}

// Top-level containment: the needle fuzzily matches here, or is contained in some nested value.
static inline bool JSONContains(yyjson_val *haystack, yyjson_val *needle) {
	if (JSONFuzzyEquals(haystack, needle)) {
		return true;
	}
	switch (yyjson_get_tag(haystack)) {
	case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
		return JSONArrayContains(haystack, needle);
	case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
		return JSONObjectContains(haystack, needle);
	default:
		return false;
	}
}

// Vectorized json_contains(haystack, needle). Fast path: a constant needle is parsed only once.
static void JSONContainsFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	D_ASSERT(args.data.size() == 2);
	auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = lstate.json_allocator->GetYYAlc();
	auto &haystacks = args.data[0];
	auto &needles = args.data[1];
	if (needles.GetVectorType() == VectorType::CONSTANT_VECTOR) {
		// A constant NULL needle yields a constant NULL result
		if (ConstantVector::IsNull(needles)) {
			result.SetVectorType(VectorType::CONSTANT_VECTOR);
			ConstantVector::SetNull(result, true);
			return;
		}
		auto &needle_str = *ConstantVector::GetData<string_t>(needles);
		auto needle_doc = JSONCommon::ReadDocument(needle_str, JSONCommon::READ_FLAG, alc);
		UnaryExecutor::Execute<string_t, bool>(haystacks, result, args.size(), [&](string_t haystack_str) {
			auto haystack_doc = JSONCommon::ReadDocument(haystack_str, JSONCommon::READ_FLAG, alc);
			return JSONContains(haystack_doc->root, needle_doc->root);
		});
	} else {
		// Non-constant needle: parse both documents per row
		BinaryExecutor::Execute<string_t, string_t, bool>(
		    haystacks, needles, result, args.size(), [&](string_t haystack_str, string_t needle_str) {
			    auto needle_doc = JSONCommon::ReadDocument(needle_str, JSONCommon::READ_FLAG, alc);
			    auto haystack_doc = JSONCommon::ReadDocument(haystack_str, JSONCommon::READ_FLAG, alc);
			    return JSONContains(haystack_doc->root, needle_doc->root);
		    });
	}
}

// Register one (lhs, rhs) -> BOOLEAN overload of json_contains.
static void GetContainsFunctionInternal(ScalarFunctionSet &set, const LogicalType &lhs, const LogicalType &rhs) {
	set.AddFunction(ScalarFunction({lhs, rhs}, LogicalType::BOOLEAN, JSONContainsFunction, nullptr, nullptr, nullptr,
	                               JSONFunctionLocalState::Init));
}

// Build the json_contains function set for all VARCHAR/JSON argument combinations.
ScalarFunctionSet JSONFunctions::GetContainsFunction() {
	ScalarFunctionSet set("json_contains");
	GetContainsFunctionInternal(set, LogicalType::VARCHAR, LogicalType::VARCHAR);
	GetContainsFunctionInternal(set, LogicalType::VARCHAR, LogicalType::JSON());
	GetContainsFunctionInternal(set, LogicalType::JSON(), LogicalType::VARCHAR);
	GetContainsFunctionInternal(set, LogicalType::JSON(), LogicalType::JSON());
	// TODO: implement json_contains that accepts path argument as well
	return set;
}

} // namespace duckdb

View File

@@ -0,0 +1,831 @@
#include "duckdb/function/cast/cast_function_set.hpp"
#include "duckdb/function/cast/default_casts.hpp"
#include "duckdb/planner/expression/bound_parameter_expression.hpp"
#include "json_common.hpp"
#include "json_functions.hpp"
namespace duckdb {
using StructNames = unordered_map<string, unique_ptr<Vector>>;
// Bind data for the JSON creation functions. Caches constant Vectors holding the names of
// struct/union members so they do not have to be re-created for every DataChunk at runtime.
struct JSONCreateFunctionData : public FunctionData {
public:
	explicit JSONCreateFunctionData(unordered_map<string, unique_ptr<Vector>> const_struct_names)
	    : const_struct_names(std::move(const_struct_names)) {
	}
	unique_ptr<FunctionData> Copy() const override {
		// Have to do this because we can't implicitly copy Vector
		unordered_map<string, unique_ptr<Vector>> map_copy;
		for (const auto &kv : const_struct_names) {
			// The vectors are const vectors of the key value
			map_copy[kv.first] = make_uniq<Vector>(Value(kv.first));
		}
		return make_uniq<JSONCreateFunctionData>(std::move(map_copy));
	}
	bool Equals(const FunctionData &other_p) const override {
		// The cached name vectors are derived purely from the bound argument types,
		// so any two instances are interchangeable.
		return true;
	}

public:
	// Const struct name vectors live here so they don't have to be re-initialized for every DataChunk
	StructNames const_struct_names;
};
// Map an argument type to the type we bind it as so its values can be emitted as JSON.
// Types with a direct JSON representation pass through unchanged; nested types recurse
// (collecting constant name Vectors for struct/union members into const_struct_names);
// everything else is cast to VARCHAR.
static LogicalType GetJSONType(StructNames &const_struct_names, const LogicalType &type) {
	if (type.IsJSONType()) {
		return type;
	}
	switch (type.id()) {
	// These types can go directly into JSON
	case LogicalTypeId::SQLNULL:
	case LogicalTypeId::BOOLEAN:
	case LogicalTypeId::TINYINT:
	case LogicalTypeId::SMALLINT:
	case LogicalTypeId::INTEGER:
	case LogicalTypeId::BIGINT:
	case LogicalTypeId::HUGEINT:
	case LogicalTypeId::UHUGEINT:
	case LogicalTypeId::UTINYINT:
	case LogicalTypeId::USMALLINT:
	case LogicalTypeId::UINTEGER:
	case LogicalTypeId::UBIGINT:
	case LogicalTypeId::FLOAT:
	case LogicalTypeId::DOUBLE:
	case LogicalTypeId::BIT:
	case LogicalTypeId::BLOB:
	case LogicalTypeId::VARCHAR:
	case LogicalTypeId::AGGREGATE_STATE:
	case LogicalTypeId::ENUM:
	case LogicalTypeId::DATE:
	case LogicalTypeId::INTERVAL:
	case LogicalTypeId::TIME:
	case LogicalTypeId::TIME_TZ:
	case LogicalTypeId::TIMESTAMP:
	case LogicalTypeId::TIMESTAMP_TZ:
	case LogicalTypeId::TIMESTAMP_NS:
	case LogicalTypeId::TIMESTAMP_MS:
	case LogicalTypeId::TIMESTAMP_SEC:
	case LogicalTypeId::UUID:
	case LogicalTypeId::BIGNUM:
	case LogicalTypeId::DECIMAL:
		return type;
	case LogicalTypeId::LIST:
		return LogicalType::LIST(GetJSONType(const_struct_names, ListType::GetChildType(type)));
	case LogicalTypeId::ARRAY:
		return LogicalType::ARRAY(GetJSONType(const_struct_names, ArrayType::GetChildType(type)),
		                          ArrayType::GetSize(type));
	// Struct and MAP are treated as JSON values
	case LogicalTypeId::STRUCT: {
		child_list_t<LogicalType> child_types;
		for (const auto &child_type : StructType::GetChildTypes(type)) {
			// Cache a constant Vector with the member name for use at execution time
			const_struct_names[child_type.first] = make_uniq<Vector>(Value(child_type.first));
			child_types.emplace_back(child_type.first, GetJSONType(const_struct_names, child_type.second));
		}
		return LogicalType::STRUCT(child_types);
	}
	case LogicalTypeId::MAP: {
		// Map keys become VARCHAR since JSON object keys are strings
		return LogicalType::MAP(LogicalType::VARCHAR, GetJSONType(const_struct_names, MapType::ValueType(type)));
	}
	case LogicalTypeId::UNION: {
		child_list_t<LogicalType> member_types;
		for (idx_t member_idx = 0; member_idx < UnionType::GetMemberCount(type); member_idx++) {
			auto &member_name = UnionType::GetMemberName(type, member_idx);
			auto &member_type = UnionType::GetMemberType(type, member_idx);
			// Cache a constant Vector with the member name, as for structs above
			const_struct_names[member_name] = make_uniq<Vector>(Value(member_name));
			member_types.emplace_back(member_name, GetJSONType(const_struct_names, member_type));
		}
		return LogicalType::UNION(member_types);
	}
	// All other types are cast to VARCHAR
	default:
		return LogicalTypeId::VARCHAR;
	}
}
// Shared bind logic for the JSON creation functions. Resolves each argument to a
// JSON-compatible type (see GetJSONType), collecting constant name Vectors for nested
// struct/union members. When "object" is set (json_object), even-indexed arguments are
// keys and must be VARCHAR.
static unique_ptr<FunctionData> JSONCreateBindParams(ScalarFunction &bound_function,
                                                     vector<unique_ptr<Expression>> &arguments, bool object) {
	unordered_map<string, unique_ptr<Vector>> const_struct_names;
	for (idx_t i = 0; i < arguments.size(); i++) {
		auto &type = arguments[i]->return_type;
		if (arguments[i]->HasParameter()) {
			// Prepared-statement parameter without a resolved type yet
			throw ParameterNotResolvedException();
		} else if (object && i % 2 == 0) {
			if (type != LogicalType::VARCHAR) {
				throw BinderException("json_object() keys must be VARCHAR, add an explicit cast to argument \"%s\"",
				                      arguments[i]->GetName());
			}
			bound_function.arguments.push_back(LogicalType::VARCHAR);
		} else {
			// Value, cast to types that we can put in JSON
			bound_function.arguments.push_back(GetJSONType(const_struct_names, type));
		}
	}
	return make_uniq<JSONCreateFunctionData>(std::move(const_struct_names));
}
// Bind json_object(k1, v1, k2, v2, ...): arguments must come in key/value pairs.
static unique_ptr<FunctionData> JSONObjectBind(ClientContext &context, ScalarFunction &bound_function,
                                               vector<unique_ptr<Expression>> &arguments) {
	const auto argument_count = arguments.size();
	const bool paired = argument_count % 2 == 0;
	if (!paired) {
		throw BinderException("json_object() requires an even number of arguments");
	}
	return JSONCreateBindParams(bound_function, arguments, true);
}
// Bind json_array(...): any number of values, all resolved to JSON-compatible types.
static unique_ptr<FunctionData> JSONArrayBind(ClientContext &context, ScalarFunction &bound_function,
                                              vector<unique_ptr<Expression>> &arguments) {
	return JSONCreateBindParams(bound_function, arguments, false);
}
// Bind to_json(value): a single argument of any JSON-convertible type.
static unique_ptr<FunctionData> ToJSONBind(ClientContext &context, ScalarFunction &bound_function,
                                           vector<unique_ptr<Expression>> &arguments) {
	const auto argument_count = arguments.size();
	if (argument_count != 1) {
		throw BinderException("to_json() takes exactly one argument");
	}
	return JSONCreateBindParams(bound_function, arguments, false);
}
// Bind array_to_json(list): a single argument that must be a LIST (or SQL NULL).
static unique_ptr<FunctionData> ArrayToJSONBind(ClientContext &context, ScalarFunction &bound_function,
                                                vector<unique_ptr<Expression>> &arguments) {
	if (arguments.size() != 1) {
		throw BinderException("array_to_json() takes exactly one argument");
	}
	// An unresolved prepared-statement parameter cannot be type-checked yet
	if (arguments[0]->HasParameter()) {
		throw ParameterNotResolvedException();
	}
	const auto arg_id = arguments[0]->return_type.id();
	const bool is_list_like = arg_id == LogicalTypeId::LIST || arg_id == LogicalTypeId::SQLNULL;
	if (!is_list_like) {
		throw BinderException("array_to_json() argument type must be LIST");
	}
	return JSONCreateBindParams(bound_function, arguments, false);
}
// Bind row_to_json(struct): a single argument that must be a STRUCT (or SQL NULL).
static unique_ptr<FunctionData> RowToJSONBind(ClientContext &context, ScalarFunction &bound_function,
                                              vector<unique_ptr<Expression>> &arguments) {
	if (arguments.size() != 1) {
		throw BinderException("row_to_json() takes exactly one argument");
	}
	// An unresolved prepared-statement parameter cannot be type-checked yet
	if (arguments[0]->HasParameter()) {
		throw ParameterNotResolvedException();
	}
	// Use the cached type id for both checks (previously the STRUCT check re-evaluated
	// arguments[0]->return_type.id(), inconsistent with the sibling ArrayToJSONBind)
	const auto arg_id = arguments[0]->return_type.id();
	if (arg_id != LogicalTypeId::STRUCT && arg_id != LogicalTypeId::SQLNULL) {
		throw BinderException("row_to_json() argument type must be STRUCT");
	}
	return JSONCreateBindParams(bound_function, arguments, false);
}
// CreateJSONValue<INPUT, TARGET>::Operation converts one DuckDB value into a yyjson mutable
// value. TARGET selects the JSON representation: bool, uint64_t, int64_t, double, or string_t
// (strings; hugeint/uhugeint render as raw numeric text). The primary template is a
// compile-time guard for unsupported combinations.
template <class INPUT_TYPE, class RESULT_TYPE>
struct CreateJSONValue {
	static inline RESULT_TYPE Operation(const INPUT_TYPE &input) {
		throw NotImplementedException("Unsupported type for CreateJSONValue");
	}
};
// Booleans -> JSON true/false
template <class INPUT_TYPE>
struct CreateJSONValue<INPUT_TYPE, bool> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const INPUT_TYPE &input) {
		return yyjson_mut_bool(doc, input);
	}
};
// Unsigned integers -> JSON number
template <class INPUT_TYPE>
struct CreateJSONValue<INPUT_TYPE, uint64_t> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const INPUT_TYPE &input) {
		return yyjson_mut_uint(doc, input);
	}
};
// Signed integers -> JSON number
template <class INPUT_TYPE>
struct CreateJSONValue<INPUT_TYPE, int64_t> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const INPUT_TYPE &input) {
		return yyjson_mut_sint(doc, input);
	}
};
// Floating point -> JSON number
template <class INPUT_TYPE>
struct CreateJSONValue<INPUT_TYPE, double> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const INPUT_TYPE &input) {
		return yyjson_mut_real(doc, input);
	}
};
// Strings -> JSON string (copied into the document's allocator)
template <>
struct CreateJSONValue<string_t, string_t> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const string_t &input) {
		return yyjson_mut_strncpy(doc, input.GetData(), input.GetSize());
	}
};
// HUGEINT exceeds 64-bit JSON numbers: emit its decimal text as a raw (unquoted) value
template <>
struct CreateJSONValue<hugeint_t, string_t> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const hugeint_t &input) {
		const auto input_string = input.ToString();
		return yyjson_mut_rawncpy(doc, input_string.c_str(), input_string.length());
	}
};
// UHUGEINT likewise: raw decimal text
template <>
struct CreateJSONValue<uhugeint_t, string_t> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const uhugeint_t &input) {
		const auto input_string = input.ToString();
		return yyjson_mut_rawncpy(doc, input_string.c_str(), input_string.length());
	}
};
// Parse an existing JSON string and deep-copy its tree into the target document.
template <class T>
inline yyjson_mut_val *CreateJSONValueFromJSON(yyjson_mut_doc *doc, const T &value) {
	return nullptr; // This function should only be called with string_t as template
}
template <>
inline yyjson_mut_val *CreateJSONValueFromJSON(yyjson_mut_doc *doc, const string_t &value) {
	auto value_doc = JSONCommon::ReadDocument(value, JSONCommon::READ_FLAG, &doc->alc);
	auto result = yyjson_val_mut_copy(doc, value_doc->root);
	return result;
}
// Forward declaration so we can recurse for nested types
static void CreateValues(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
idx_t count);
// Add (key_v[i], vals[i]) pairs to the JSON objects in objs. Rows whose key is NULL are
// skipped entirely (no pair is added for them).
static void AddKeyValuePairs(yyjson_mut_doc *doc, yyjson_mut_val *objs[], Vector &key_v, yyjson_mut_val *vals[],
                             idx_t count) {
	UnifiedVectorFormat key_data;
	key_v.ToUnifiedFormat(count, key_data);
	auto keys = UnifiedVectorFormat::GetData<string_t>(key_data);
	for (idx_t i = 0; i < count; i++) {
		auto key_idx = key_data.sel->get_index(i);
		if (!key_data.validity.RowIsValid(key_idx)) {
			continue;
		}
		auto key = CreateJSONValue<string_t, string_t>::Operation(doc, keys[key_idx]);
		yyjson_mut_obj_add(objs[i], key, vals[i]);
	}
}
// Convert value_v into JSON values (into vals), then attach them to the objects in objs
// under the keys from key_v.
static void CreateKeyValuePairs(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *objs[],
                                yyjson_mut_val *vals[], Vector &key_v, Vector &value_v, idx_t count) {
	CreateValues(names, doc, vals, value_v, count);
	AddKeyValuePairs(doc, objs, key_v, vals, count);
}
// Fill every output slot with a JSON null (used for SQLNULL-typed inputs).
static void CreateValuesNull(yyjson_mut_doc *doc, yyjson_mut_val *vals[], idx_t count) {
	for (idx_t row_idx = 0; row_idx < count; ++row_idx) {
		vals[row_idx] = yyjson_mut_null(doc);
	}
}
// Convert a flat vector of INPUT_TYPE into JSON values of TARGET_TYPE representation.
// NULL rows become JSON null; values of a JSON-typed vector are parsed and deep-copied
// instead of being re-encoded as strings.
template <class INPUT_TYPE, class TARGET_TYPE>
static void TemplatedCreateValues(yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v, idx_t count) {
	UnifiedVectorFormat value_data;
	value_v.ToUnifiedFormat(count, value_data);
	auto values = UnifiedVectorFormat::GetData<INPUT_TYPE>(value_data);
	const auto type_is_json = value_v.GetType().IsJSONType();
	for (idx_t i = 0; i < count; i++) {
		idx_t val_idx = value_data.sel->get_index(i);
		if (!value_data.validity.RowIsValid(val_idx)) {
			vals[i] = yyjson_mut_null(doc);
		} else if (type_is_json) {
			vals[i] = CreateJSONValueFromJSON(doc, values[val_idx]);
		} else {
			vals[i] = CreateJSONValue<INPUT_TYPE, TARGET_TYPE>::Operation(doc, values[val_idx]);
		}
		// Every row must produce a value so the callers can index vals unconditionally
		D_ASSERT(vals[i] != nullptr);
	}
}
// Copy string values into the document as RAW JSON (emitted verbatim, without quoting);
// NULL rows become JSON null. The caller is responsible for the strings being valid JSON text.
static void CreateRawValues(yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v, idx_t count) {
	UnifiedVectorFormat value_data;
	value_v.ToUnifiedFormat(count, value_data);
	auto values = UnifiedVectorFormat::GetData<string_t>(value_data);
	for (idx_t i = 0; i < count; i++) {
		idx_t val_idx = value_data.sel->get_index(i);
		if (!value_data.validity.RowIsValid(val_idx)) {
			vals[i] = yyjson_mut_null(doc);
		} else {
			const auto &str = values[val_idx];
			vals[i] = yyjson_mut_rawncpy(doc, str.GetData(), str.GetSize());
		}
		D_ASSERT(vals[i] != nullptr);
	}
}
// Convert a STRUCT vector into JSON objects: one object per row, one key/value pair per
// struct member (member names come from the cached constant Vectors in "names").
static void CreateValuesStruct(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
                               idx_t count) {
	// Structs become values, therefore we initialize vals to JSON values
	for (idx_t i = 0; i < count; i++) {
		vals[i] = yyjson_mut_obj(doc);
	}
	// Initialize re-usable array for the nested values
	auto nested_vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, count);
	// Add the key/value pairs to the values
	auto &entries = StructVector::GetEntries(value_v);
	for (idx_t entry_i = 0; entry_i < entries.size(); entry_i++) {
		auto &struct_key_v = *names.at(StructType::GetChildName(value_v.GetType(), entry_i));
		auto &struct_val_v = *entries[entry_i];
		CreateKeyValuePairs(names, doc, vals, nested_vals, struct_key_v, struct_val_v, count);
	}
	// Whole struct can be NULL; overwrite those rows with JSON null after the fact
	UnifiedVectorFormat struct_data;
	value_v.ToUnifiedFormat(count, struct_data);
	for (idx_t i = 0; i < count; i++) {
		idx_t idx = struct_data.sel->get_index(i);
		if (!struct_data.validity.RowIsValid(idx)) {
			vals[i] = yyjson_mut_null(doc);
		}
	}
}
// Convert a MAP vector into JSON objects. Keys are cast to VARCHAR (JSON object keys are
// strings); entries whose key converts to NULL are skipped; a NULL map becomes JSON null.
static void CreateValuesMap(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
                            idx_t count) {
	// Create nested keys: cast all keys to VARCHAR first
	auto &map_key_v = MapVector::GetKeys(value_v);
	auto map_key_count = ListVector::GetListSize(value_v);
	Vector map_keys_string(LogicalType::VARCHAR, map_key_count);
	VectorOperations::DefaultCast(map_key_v, map_keys_string, map_key_count);
	auto nested_keys = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, map_key_count);
	TemplatedCreateValues<string_t, string_t>(doc, nested_keys, map_keys_string, map_key_count);
	// Create nested values
	auto &map_val_v = MapVector::GetValues(value_v);
	auto map_val_count = ListVector::GetListSize(value_v);
	auto nested_vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, map_val_count);
	CreateValues(names, doc, nested_vals, map_val_v, map_val_count);
	// Add the key/value pairs to the values
	UnifiedVectorFormat map_data;
	value_v.ToUnifiedFormat(count, map_data);
	auto map_key_list_entries = UnifiedVectorFormat::GetData<list_entry_t>(map_data);
	for (idx_t i = 0; i < count; i++) {
		idx_t idx = map_data.sel->get_index(i);
		if (!map_data.validity.RowIsValid(idx)) {
			// Whole map can be NULL
			vals[i] = yyjson_mut_null(doc);
		} else {
			vals[i] = yyjson_mut_obj(doc);
			const auto &key_list_entry = map_key_list_entries[idx];
			for (idx_t child_i = key_list_entry.offset; child_i < key_list_entry.offset + key_list_entry.length;
			     child_i++) {
				// Skip entries whose key converted to NULL (JSON objects cannot have null keys)
				if (!unsafe_yyjson_is_null(nested_keys[child_i])) {
					yyjson_mut_obj_add(vals[i], nested_keys[child_i], nested_vals[child_i]);
				}
			}
		}
	}
}
// Convert a UNION vector into JSON objects: each row becomes {"member_name": value} for the
// member selected by the row's tag; a NULL union becomes JSON null.
static void CreateValuesUnion(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
                              idx_t count) {
	// Unions become objects, therefore we initialize vals to JSON objects (null when the row is NULL)
	UnifiedVectorFormat value_data;
	value_v.ToUnifiedFormat(count, value_data);
	if (value_data.validity.AllValid()) {
		for (idx_t i = 0; i < count; i++) {
			vals[i] = yyjson_mut_obj(doc);
		}
	} else {
		for (idx_t i = 0; i < count; i++) {
			auto index = value_data.sel->get_index(i);
			if (!value_data.validity.RowIsValid(index)) {
				// Make the entry NULL if the Union value is NULL
				vals[i] = yyjson_mut_null(doc);
			} else {
				vals[i] = yyjson_mut_obj(doc);
			}
		}
	}
	// Initialize re-usable array for the nested values
	auto nested_vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, count);
	auto &tag_v = UnionVector::GetTags(value_v);
	UnifiedVectorFormat tag_data;
	tag_v.ToUnifiedFormat(count, tag_data);
	// Add the key/value pairs to the values, one member at a time
	for (idx_t member_idx = 0; member_idx < UnionType::GetMemberCount(value_v.GetType()); member_idx++) {
		auto &member_val_v = UnionVector::GetMember(value_v, member_idx);
		auto &member_key_v = *names.at(UnionType::GetMemberName(value_v.GetType(), member_idx));
		// This implementation is not optimal since we convert the entire member vector,
		// and then skip the rows not matching the tag afterwards.
		CreateValues(names, doc, nested_vals, member_val_v, count);
		// This is an inlined copy of AddKeyValuePairs but we also skip null tags
		// and the rows where the member is not matching the tag
		UnifiedVectorFormat key_data;
		member_key_v.ToUnifiedFormat(count, key_data);
		auto keys = UnifiedVectorFormat::GetData<string_t>(key_data);
		for (idx_t i = 0; i < count; i++) {
			auto value_index = value_data.sel->get_index(i);
			if (!value_data.validity.RowIsValid(value_index)) {
				// This entry is just NULL in its entirety
				continue;
			}
			auto tag_idx = tag_data.sel->get_index(i);
			if (!tag_data.validity.RowIsValid(tag_idx)) {
				continue;
			}
			// Only emit the member the row's tag actually selects
			auto tag = (UnifiedVectorFormat::GetData<uint8_t>(tag_data))[tag_idx];
			if (tag != member_idx) {
				continue;
			}
			auto key_idx = key_data.sel->get_index(i);
			if (!key_data.validity.RowIsValid(key_idx)) {
				continue;
			}
			auto key = CreateJSONValue<string_t, string_t>::Operation(doc, keys[key_idx]);
			yyjson_mut_obj_add(vals[i], key, nested_vals[i]);
		}
	}
}
// Convert a LIST vector into JSON arrays: children are converted in one pass, then each
// row's list_entry selects its slice; a NULL list becomes JSON null.
static void CreateValuesList(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
                             idx_t count) {
	// Initialize array for the nested values
	auto &child_v = ListVector::GetEntry(value_v);
	auto child_count = ListVector::GetListSize(value_v);
	auto nested_vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, child_count);
	// Fill nested_vals with list values
	CreateValues(names, doc, nested_vals, child_v, child_count);
	// Now we add the values to the appropriate JSON arrays
	UnifiedVectorFormat list_data;
	value_v.ToUnifiedFormat(count, list_data);
	auto list_entries = UnifiedVectorFormat::GetData<list_entry_t>(list_data);
	for (idx_t i = 0; i < count; i++) {
		idx_t idx = list_data.sel->get_index(i);
		if (!list_data.validity.RowIsValid(idx)) {
			vals[i] = yyjson_mut_null(doc);
		} else {
			vals[i] = yyjson_mut_arr(doc);
			const auto &entry = list_entries[idx];
			for (idx_t child_i = entry.offset; child_i < entry.offset + entry.length; child_i++) {
				yyjson_mut_arr_append(vals[i], nested_vals[child_i]);
			}
		}
	}
}
// Convert a fixed-size ARRAY vector into JSON arrays. The vector is flattened first so
// child elements can be addressed as row * array_size; a NULL array becomes JSON null.
static void CreateValuesArray(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
                              idx_t count) {
	value_v.Flatten(count);
	// Initialize array for the nested values
	auto &child_v = ArrayVector::GetEntry(value_v);
	auto array_size = ArrayType::GetSize(value_v.GetType());
	auto child_count = count * array_size;
	auto nested_vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, child_count);
	// Fill nested_vals with list values
	CreateValues(names, doc, nested_vals, child_v, child_count);
	// Now we add the values to the appropriate JSON arrays
	UnifiedVectorFormat list_data;
	value_v.ToUnifiedFormat(count, list_data);
	for (idx_t i = 0; i < count; i++) {
		idx_t idx = list_data.sel->get_index(i);
		if (!list_data.validity.RowIsValid(idx)) {
			vals[i] = yyjson_mut_null(doc);
		} else {
			vals[i] = yyjson_mut_arr(doc);
			// Fixed-size arrays have no list_entry; offsets are implicit from the row index
			auto offset = idx * array_size;
			for (idx_t child_i = offset; child_i < offset + array_size; child_i++) {
				yyjson_mut_arr_append(vals[i], nested_vals[child_i]);
			}
		}
	}
}
//! Recursively converts "count" values from "value_v" into mutable yyjson values in "vals",
//! dispatching on the vector's logical type. "names" caches constant struct-key vectors for nested types.
static void CreateValues(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
                         idx_t count) {
	const auto &type = value_v.GetType();
	switch (type.id()) {
	case LogicalTypeId::SQLNULL:
		CreateValuesNull(doc, vals, count);
		break;
	case LogicalTypeId::BOOLEAN:
		TemplatedCreateValues<bool, bool>(doc, vals, value_v, count);
		break;
	// Signed integers are widened to int64 before conversion
	case LogicalTypeId::TINYINT:
		TemplatedCreateValues<int8_t, int64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::SMALLINT:
		TemplatedCreateValues<int16_t, int64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::INTEGER:
		TemplatedCreateValues<int32_t, int64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::BIGINT:
		TemplatedCreateValues<int64_t, int64_t>(doc, vals, value_v, count);
		break;
	// 128-bit integers don't fit the 64-bit JSON number targets; they go through a string representation
	case LogicalTypeId::HUGEINT:
		TemplatedCreateValues<hugeint_t, string_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::UHUGEINT:
		TemplatedCreateValues<uhugeint_t, string_t>(doc, vals, value_v, count);
		break;
	// Unsigned integers are widened to uint64 before conversion
	case LogicalTypeId::UTINYINT:
		TemplatedCreateValues<uint8_t, uint64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::USMALLINT:
		TemplatedCreateValues<uint16_t, uint64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::UINTEGER:
		TemplatedCreateValues<uint32_t, uint64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::UBIGINT:
		TemplatedCreateValues<uint64_t, uint64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::FLOAT:
		TemplatedCreateValues<float, double>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::DOUBLE:
		TemplatedCreateValues<double, double>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::VARCHAR:
		TemplatedCreateValues<string_t, string_t>(doc, vals, value_v, count);
		break;
	// Nested types recurse via the dedicated helpers
	case LogicalTypeId::STRUCT:
		CreateValuesStruct(names, doc, vals, value_v, count);
		break;
	case LogicalTypeId::MAP:
		CreateValuesMap(names, doc, vals, value_v, count);
		break;
	case LogicalTypeId::LIST:
		CreateValuesList(names, doc, vals, value_v, count);
		break;
	case LogicalTypeId::UNION:
		CreateValuesUnion(names, doc, vals, value_v, count);
		break;
	case LogicalTypeId::ARRAY:
		CreateValuesArray(names, doc, vals, value_v, count);
		break;
	// Types without a native JSON representation are serialized via their VARCHAR cast
	case LogicalTypeId::BIT:
	case LogicalTypeId::BLOB:
	case LogicalTypeId::AGGREGATE_STATE:
	case LogicalTypeId::ENUM:
	case LogicalTypeId::DATE:
	case LogicalTypeId::INTERVAL:
	case LogicalTypeId::TIME:
	case LogicalTypeId::TIME_NS:
	case LogicalTypeId::TIME_TZ:
	case LogicalTypeId::TIMESTAMP:
	case LogicalTypeId::TIMESTAMP_TZ:
	case LogicalTypeId::TIMESTAMP_NS:
	case LogicalTypeId::TIMESTAMP_MS:
	case LogicalTypeId::TIMESTAMP_SEC:
	case LogicalTypeId::UUID: {
		Vector string_vector(LogicalTypeId::VARCHAR, count);
		VectorOperations::DefaultCast(value_v, string_vector, count);
		TemplatedCreateValues<string_t, string_t>(doc, vals, string_vector, count);
		break;
	}
	// BIGNUM is written as raw (unparsed) JSON text of its VARCHAR cast
	case LogicalTypeId::BIGNUM: {
		Vector string_vector(LogicalTypeId::VARCHAR, count);
		VectorOperations::DefaultCast(value_v, string_vector, count);
		CreateRawValues(doc, vals, string_vector, count);
		break;
	}
	case LogicalTypeId::DECIMAL: {
		// Wide decimals (> 15 digits) exceed double precision, so write their text form as a raw value;
		// narrower decimals fit a double exactly enough to use the numeric path
		if (DecimalType::GetWidth(type) > 15) {
			Vector string_vector(LogicalTypeId::VARCHAR, count);
			VectorOperations::DefaultCast(value_v, string_vector, count);
			CreateRawValues(doc, vals, string_vector, count);
		} else {
			Vector double_vector(LogicalType::DOUBLE, count);
			VectorOperations::DefaultCast(value_v, double_vector, count);
			TemplatedCreateValues<double, double>(doc, vals, double_vector, count);
		}
		break;
	}
	// Internal/unbound types should never reach execution
	case LogicalTypeId::INVALID:
	case LogicalTypeId::UNKNOWN:
	case LogicalTypeId::ANY:
	case LogicalTypeId::USER:
	case LogicalTypeId::TEMPLATE:
	case LogicalTypeId::VARIANT:
	case LogicalTypeId::CHAR:
	case LogicalTypeId::STRING_LITERAL:
	case LogicalTypeId::INTEGER_LITERAL:
	case LogicalTypeId::POINTER:
	case LogicalTypeId::VALIDITY:
	case LogicalTypeId::TABLE:
	case LogicalTypeId::LAMBDA:
	case LogicalTypeId::GEOMETRY: // TODO! Add support for GEOMETRY
		throw InternalException("Unsupported type arrived at JSON create function");
	}
}
//! Executes json_object(k1, v1, k2, v2, ...): builds one JSON object per row from key/value pairs
static void ObjectFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	const auto &bound_expr = state.expr.Cast<BoundFunctionExpression>();
	const auto &info = bound_expr.bind_info->Cast<JSONCreateFunctionData>();
	auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = lstate.json_allocator->GetYYAlc();

	const idx_t row_count = args.size();
	auto doc = JSONCommon::CreateDocument(alc);

	// One mutable JSON object per row
	auto row_objs = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, row_count);
	for (idx_t row = 0; row < row_count; row++) {
		row_objs[row] = yyjson_mut_obj(doc);
	}

	// Scratch value array, reused for every key/value pair
	auto scratch_vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, row_count);
	const idx_t pair_count = args.data.size() / 2;
	for (idx_t pair = 0; pair < pair_count; pair++) {
		auto &key_vec = args.data[2 * pair];
		auto &val_vec = args.data[2 * pair + 1];
		CreateKeyValuePairs(info.const_struct_names, doc, row_objs, scratch_vals, key_vec, val_vec, row_count);
	}

	// Serialize each object into the result vector
	auto result_strings = FlatVector::GetData<string_t>(result);
	for (idx_t row = 0; row < row_count; row++) {
		result_strings[row] = JSONCommon::WriteVal<yyjson_mut_val>(row_objs[row], alc);
	}
	if (args.AllConstant()) {
		result.SetVectorType(VectorType::CONSTANT_VECTOR);
	}
	// Keep the allocator's buffers alive as long as the result references them
	JSONAllocator::AddBuffer(result, alc);
}
//! Executes json_array(v1, v2, ...): builds one JSON array per row from all arguments
static void ArrayFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	const auto &bound_expr = state.expr.Cast<BoundFunctionExpression>();
	const auto &info = bound_expr.bind_info->Cast<JSONCreateFunctionData>();
	auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = lstate.json_allocator->GetYYAlc();

	const idx_t row_count = args.size();
	auto doc = JSONCommon::CreateDocument(alc);

	// One mutable JSON array per row
	auto row_arrs = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, row_count);
	for (idx_t row = 0; row < row_count; row++) {
		row_arrs[row] = yyjson_mut_arr(doc);
	}

	// Scratch value array, reused for each argument; values are appended column by column
	auto scratch_vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, row_count);
	for (auto &arg_vector : args.data) {
		CreateValues(info.const_struct_names, doc, scratch_vals, arg_vector, row_count);
		for (idx_t row = 0; row < row_count; row++) {
			yyjson_mut_arr_append(row_arrs[row], scratch_vals[row]);
		}
	}

	// Serialize each array into the result vector
	auto result_strings = FlatVector::GetData<string_t>(result);
	for (idx_t row = 0; row < row_count; row++) {
		result_strings[row] = JSONCommon::WriteVal<yyjson_mut_val>(row_arrs[row], alc);
	}
	if (args.AllConstant()) {
		result.SetVectorType(VectorType::CONSTANT_VECTOR);
	}
	// Keep the allocator's buffers alive as long as the result references them
	JSONAllocator::AddBuffer(result, alc);
}
static void ToJSONFunctionInternal(const StructNames &names, Vector &input, const idx_t count, Vector &result,
yyjson_alc *alc) {
// Initialize array for values
auto doc = JSONCommon::CreateDocument(alc);
auto vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, count);
CreateValues(names, doc, vals, input, count);
// Write JSON values to string
auto objects = FlatVector::GetData<string_t>(result);
auto &result_validity = FlatVector::Validity(result);
UnifiedVectorFormat input_data;
input.ToUnifiedFormat(count, input_data);
for (idx_t i = 0; i < count; i++) {
idx_t idx = input_data.sel->get_index(i);
if (input_data.validity.RowIsValid(idx)) {
objects[i] = JSONCommon::WriteVal<yyjson_mut_val>(vals[i], alc);
} else {
result_validity.SetInvalid(i);
}
}
if (input.GetVectorType() == VectorType::CONSTANT_VECTOR || count == 1) {
result.SetVectorType(VectorType::CONSTANT_VECTOR);
}
JSONAllocator::AddBuffer(result, alc);
}
//! Entry point shared by to_json/array_to_json/row_to_json: converts the single argument to JSON text
static void ToJSONFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	const auto &bound_expr = state.expr.Cast<BoundFunctionExpression>();
	const auto &info = bound_expr.bind_info->Cast<JSONCreateFunctionData>();
	auto &local_state = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = local_state.json_allocator->GetYYAlc();
	ToJSONFunctionInternal(info.const_struct_names, args.data[0], args.size(), result, alc);
}
//! json_object(k1, v1, ...): one JSON object per row.
//! SPECIAL_HANDLING: NULL arguments reach the function instead of short-circuiting the whole row to NULL.
ScalarFunctionSet JSONFunctions::GetObjectFunction() {
	ScalarFunction fun("json_object", {}, LogicalType::JSON(), ObjectFunction, JSONObjectBind, nullptr, nullptr,
	                   JSONFunctionLocalState::Init);
	fun.varargs = LogicalType::ANY;
	fun.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
	return ScalarFunctionSet(fun);
}

//! json_array(v1, v2, ...): one JSON array per row, same NULL handling as json_object
ScalarFunctionSet JSONFunctions::GetArrayFunction() {
	ScalarFunction fun("json_array", {}, LogicalType::JSON(), ArrayFunction, JSONArrayBind, nullptr, nullptr,
	                   JSONFunctionLocalState::Init);
	fun.varargs = LogicalType::ANY;
	fun.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
	return ScalarFunctionSet(fun);
}

//! to_json(any): converts a single value of any type to JSON
ScalarFunctionSet JSONFunctions::GetToJSONFunction() {
	ScalarFunction fun("to_json", {}, LogicalType::JSON(), ToJSONFunction, ToJSONBind, nullptr, nullptr,
	                   JSONFunctionLocalState::Init);
	fun.varargs = LogicalType::ANY;
	return ScalarFunctionSet(fun);
}

//! array_to_json: same executor as to_json, with its own bind (ArrayToJSONBind)
ScalarFunctionSet JSONFunctions::GetArrayToJSONFunction() {
	ScalarFunction fun("array_to_json", {}, LogicalType::JSON(), ToJSONFunction, ArrayToJSONBind, nullptr, nullptr,
	                   JSONFunctionLocalState::Init);
	fun.varargs = LogicalType::ANY;
	return ScalarFunctionSet(fun);
}

//! row_to_json: same executor as to_json, with its own bind (RowToJSONBind)
ScalarFunctionSet JSONFunctions::GetRowToJSONFunction() {
	ScalarFunction fun("row_to_json", {}, LogicalType::JSON(), ToJSONFunction, RowToJSONBind, nullptr, nullptr,
	                   JSONFunctionLocalState::Init);
	fun.varargs = LogicalType::ANY;
	return ScalarFunctionSet(fun);
}
//! Bind data for casts to JSON: caches constant struct-key name vectors used while serializing nested types
struct NestedToJSONCastData : public BoundCastData {
public:
	NestedToJSONCastData() {
	}

	unique_ptr<BoundCastData> Copy() const override {
		auto copy = make_uniq<NestedToJSONCastData>();
		for (auto &entry : const_struct_names) {
			// Re-create each cached name vector from its (constant) first value
			copy->const_struct_names.emplace(entry.first, make_uniq<Vector>(entry.second->GetValue(0)));
		}
		return std::move(copy);
	}

public:
	StructNames const_struct_names;
};
//! Cast implementation: serializes any source vector to JSON strings; never fails
static bool AnyToJSONCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
	auto &local_state = parameters.local_state->Cast<JSONFunctionLocalState>();
	local_state.json_allocator->Reset();
	auto alc = local_state.json_allocator->GetYYAlc();
	const auto &struct_names = parameters.cast_data->Cast<NestedToJSONCastData>().const_struct_names;
	ToJSONFunctionInternal(struct_names, source, count, result, alc);
	return true;
}
//! Binds an anything->JSON cast by collecting the struct key names present in the source type
BoundCastInfo AnyToJSONCastBind(BindCastInput &input, const LogicalType &source, const LogicalType &target) {
	auto data = make_uniq<NestedToJSONCastData>();
	GetJSONType(data->const_struct_names, source);
	return BoundCastInfo(AnyToJSONCast, std::move(data), JSONFunctionLocalState::InitCastLocalState);
}
//! Registers an "anything -> JSON" implicit cast for every logical type
void JSONFunctions::RegisterJSONCreateCastFunctions(ExtensionLoader &loader) {
	// Anything can be cast to JSON
	for (const auto &type : LogicalType::AllTypes()) {
		LogicalType source_type;
		switch (type.id()) {
		// Nested types are registered with ANY children so a single cast entry covers all instantiations
		case LogicalTypeId::STRUCT:
			source_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
			break;
		case LogicalTypeId::LIST:
			source_type = LogicalType::LIST(LogicalType::ANY);
			break;
		case LogicalTypeId::MAP:
			source_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
			break;
		case LogicalTypeId::UNION:
			source_type = LogicalType::UNION({{"any", LogicalType::ANY}});
			break;
		case LogicalTypeId::ARRAY:
			source_type = LogicalType::ARRAY(LogicalType::ANY, optional_idx());
			break;
		case LogicalTypeId::VARCHAR:
			// We skip this one here as it's handled in json_functions.cpp
			continue;
		default:
			source_type = type;
		}
		// We prefer going to JSON over going to VARCHAR if a function can do either:
		// cost is (cost-to-VARCHAR - 1), clamped to be non-negative
		const auto source_to_json_cost = MaxValue<int64_t>(
		    CastFunctionSet::ImplicitCastCost(loader.GetDatabaseInstance(), source_type, LogicalType::VARCHAR) - 1, 0);
		loader.RegisterCastFunction(source_type, LogicalType::JSON(), AnyToJSONCastBind, source_to_json_cost);
	}
}
} // namespace duckdb

View File

@@ -0,0 +1,32 @@
#include "json_executors.hpp"
namespace duckdb {
//! Returns whether a value exists at the queried path; a null pointer means the path did not match.
//! The comparison is explicit rather than relying on implicit pointer-to-bool conversion.
static inline bool JSONExists(yyjson_val *val, yyjson_alc *, Vector &, ValidityMask &, idx_t) {
	return val != nullptr;
}
//! json_exists(json, path)
static void BinaryExistsFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::BinaryExecute<bool, false>(args, state, result, JSONExists);
}

//! json_exists(json, [path, ...]) - one boolean per path
static void ManyExistsFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::ExecuteMany<bool, false>(args, state, result, JSONExists);
}

//! Registers the json_exists overloads (single path, list of paths) for one input type
static void GetExistsFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::BOOLEAN, BinaryExistsFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
	                               LogicalType::LIST(LogicalType::BOOLEAN), ManyExistsFunction,
	                               JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetExistsFunction() {
	// Registered for both VARCHAR and JSON inputs
	ScalarFunctionSet set("json_exists");
	GetExistsFunctionsInternal(set, LogicalType::VARCHAR);
	GetExistsFunctionsInternal(set, LogicalType::JSON());
	return set;
}
} // namespace duckdb

View File

@@ -0,0 +1,74 @@
#include "json_executors.hpp"
namespace duckdb {
//! Extracts the value at the queried path, serialized back to JSON text
static inline string_t ExtractFromVal(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &, idx_t) {
	return JSONCommon::WriteVal<yyjson_val>(val, alc);
}

//! Extracts the value at the queried path as a plain string:
//! JSON null becomes SQL NULL, JSON strings are returned without quotes, anything else is serialized as JSON text
static inline string_t ExtractStringFromVal(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &mask, idx_t idx) {
	switch (yyjson_get_tag(val)) {
	case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
		mask.SetInvalid(idx);
		return string_t {};
	case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NOESC:
	case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
		// String payload is referenced directly from the parsed document (no copy)
		return string_t(unsafe_yyjson_get_str(val), unsafe_yyjson_get_len(val));
	default:
		return JSONCommon::WriteVal<yyjson_val>(val, alc);
	}
}
//! json_extract(json, path) -> JSON
static void ExtractFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::BinaryExecute<string_t>(args, state, result, ExtractFromVal);
}

//! json_extract(json, [path, ...]) -> JSON[]
static void ExtractManyFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::ExecuteMany<string_t>(args, state, result, ExtractFromVal);
}

//! json_extract_string(json, path) -> VARCHAR
static void ExtractStringFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::BinaryExecute<string_t>(args, state, result, ExtractStringFromVal);
}

//! json_extract_string(json, [path, ...]) -> VARCHAR[]
static void ExtractStringManyFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::ExecuteMany<string_t>(args, state, result, ExtractStringFromVal);
}

//! Registers the json_extract overloads (BIGINT index, VARCHAR path, list of paths) for one input type
static void GetExtractFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	set.AddFunction(ScalarFunction({input_type, LogicalType::BIGINT}, LogicalType::JSON(), ExtractFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::JSON(), ExtractFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
	                               LogicalType::LIST(LogicalType::JSON()), ExtractManyFunction,
	                               JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetExtractFunction() {
	// Generic extract function
	ScalarFunctionSet set("json_extract");
	GetExtractFunctionsInternal(set, LogicalType::VARCHAR);
	GetExtractFunctionsInternal(set, LogicalType::JSON());
	return set;
}

//! Registers the json_extract_string overloads for one input type (same shapes as json_extract)
static void GetExtractStringFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	set.AddFunction(ScalarFunction({input_type, LogicalType::BIGINT}, LogicalType::VARCHAR, ExtractStringFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::VARCHAR, ExtractStringFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
	                               LogicalType::LIST(LogicalType::VARCHAR), ExtractStringManyFunction,
	                               JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetExtractStringFunction() {
	// String extract function
	ScalarFunctionSet set("json_extract_string");
	GetExtractStringFunctionsInternal(set, LogicalType::VARCHAR);
	GetExtractStringFunctionsInternal(set, LogicalType::JSON());
	return set;
}
} // namespace duckdb

View File

@@ -0,0 +1,59 @@
#include "json_executors.hpp"
namespace duckdb {
//! Appends the keys of a JSON object to the list child vector of "result" and returns the
//! list_entry_t (offset/length) for this row. Non-object inputs yield an empty list
//! (yyjson_obj_size is 0 for them — per the yyjson API).
static inline list_entry_t GetJSONKeys(yyjson_val *val, yyjson_alc *, Vector &result, ValidityMask &, idx_t) {
	auto num_keys = yyjson_obj_size(val);
	auto current_size = ListVector::GetListSize(result);
	auto new_size = current_size + num_keys;
	// Grow list if needed
	if (ListVector::GetListCapacity(result) < new_size) {
		ListVector::Reserve(result, new_size);
	}
	// Write the strings to the child vector
	// NOTE(review): key strings reference the parsed document rather than being copied;
	// presumably the document outlives the result for this batch — confirm against the executor
	auto keys = FlatVector::GetData<string_t>(ListVector::GetEntry(result));
	size_t idx, max;
	yyjson_val *key, *child_val;
	yyjson_obj_foreach(val, idx, max, key, child_val) {
		keys[current_size + idx] = string_t(unsafe_yyjson_get_str(key), unsafe_yyjson_get_len(key));
	}
	// Update size
	ListVector::SetListSize(result, current_size + num_keys);
	return {current_size, num_keys};
}
//! json_keys(json)
static void UnaryJSONKeysFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::UnaryExecute<list_entry_t>(args, state, result, GetJSONKeys);
}

//! json_keys(json, path)
static void BinaryJSONKeysFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::BinaryExecute<list_entry_t>(args, state, result, GetJSONKeys);
}

//! json_keys(json, [path, ...]) - one key list per path
static void ManyJSONKeysFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::ExecuteMany<list_entry_t>(args, state, result, GetJSONKeys);
}

//! Registers the json_keys overloads (no path, single path, list of paths) for one input type
static void GetJSONKeysFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	set.AddFunction(ScalarFunction({input_type}, LogicalType::LIST(LogicalType::VARCHAR), UnaryJSONKeysFunction,
	                               nullptr, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::LIST(LogicalType::VARCHAR),
	                               BinaryJSONKeysFunction, JSONReadFunctionData::Bind, nullptr, nullptr,
	                               JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
	                               LogicalType::LIST(LogicalType::LIST(LogicalType::VARCHAR)), ManyJSONKeysFunction,
	                               JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetKeysFunction() {
	ScalarFunctionSet set("json_keys");
	GetJSONKeysFunctionsInternal(set, LogicalType::VARCHAR);
	GetJSONKeysFunctionsInternal(set, LogicalType::JSON());
	return set;
}
} // namespace duckdb

View File

@@ -0,0 +1,92 @@
#include "json_common.hpp"
#include "json_functions.hpp"
namespace duckdb {
//! Applies a single merge-patch step: two objects are merged key by key;
//! if either side is not an object, the patch replaces the original entirely
static inline yyjson_mut_val *MergePatch(yyjson_mut_doc *doc, yyjson_mut_val *orig, yyjson_mut_val *patch) {
	constexpr auto OBJ_TAG = YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE;
	if (yyjson_mut_get_tag(orig) == OBJ_TAG && yyjson_mut_get_tag(patch) == OBJ_TAG) {
		// Both are objects, do the merge
		return yyjson_mut_merge_patch(doc, orig, patch);
	}
	// If either is not an object, we just return the second argument
	return patch;
}
//! Parses each input document and copies its root into the mutable document "doc";
//! NULL inputs yield a nullptr entry in "objs"
static inline void ReadObjects(yyjson_mut_doc *doc, Vector &input, yyjson_mut_val *objs[], const idx_t count) {
	UnifiedVectorFormat input_data;
	input.ToUnifiedFormat(count, input_data);
	auto strings = UnifiedVectorFormat::GetData<string_t>(input_data);
	// Read the documents
	for (idx_t i = 0; i < count; i++) {
		const auto idx = input_data.sel->get_index(i);
		if (!input_data.validity.RowIsValid(idx)) {
			objs[i] = nullptr;
			continue;
		}
		// Parse as an immutable document, then deep-copy the root into the shared mutable document
		auto immutable_doc = JSONCommon::ReadDocument(strings[idx], JSONCommon::READ_FLAG, &doc->alc);
		objs[i] = yyjson_val_mut_copy(doc, immutable_doc->root);
	}
}
//! Follows MySQL behaviour
//! json_merge_patch(j1, j2, ...): folds each argument left-to-right into the accumulated result.
//! A NULL argument discards everything accumulated so far; a later non-NULL argument then
//! becomes the new starting point, so the result is NULL only if the tail of arguments ends NULL.
static void MergePatchFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = lstate.json_allocator->GetYYAlc();
	auto doc = JSONCommon::CreateDocument(alc);
	const auto count = args.size();
	// Read the first json arg
	auto origs = JSONCommon::AllocateArray<yyjson_mut_val *>(alc, count);
	ReadObjects(doc, args.data[0], origs, count);
	// Read the next json args one by one and merge them into the first json arg
	auto patches = JSONCommon::AllocateArray<yyjson_mut_val *>(alc, count);
	for (idx_t arg_idx = 1; arg_idx < args.data.size(); arg_idx++) {
		ReadObjects(doc, args.data[arg_idx], patches, count);
		for (idx_t i = 0; i < count; i++) {
			if (patches[i] == nullptr) {
				// Next json arg is NULL, obj becomes NULL
				origs[i] = nullptr;
			} else if (origs[i] == nullptr) {
				// Current obj is NULL, obj becomes next json arg
				origs[i] = patches[i];
			} else {
				// Neither is NULL, merge them
				origs[i] = MergePatch(doc, origs[i], patches[i]);
			}
		}
	}
	// Write to result vector (a nullptr accumulated value maps to SQL NULL)
	auto result_data = FlatVector::GetData<string_t>(result);
	auto &result_validity = FlatVector::Validity(result);
	for (idx_t i = 0; i < count; i++) {
		if (origs[i] == nullptr) {
			result_validity.SetInvalid(i);
		} else {
			result_data[i] = JSONCommon::WriteVal<yyjson_mut_val>(origs[i], alc);
		}
	}
	if (args.AllConstant()) {
		result.SetVectorType(VectorType::CONSTANT_VECTOR);
	}
	JSONAllocator::AddBuffer(result, alc);
}
ScalarFunctionSet JSONFunctions::GetMergePatchFunction() {
	// Variadic: extra JSON arguments are folded in left-to-right by MergePatchFunction.
	// SPECIAL_HANDLING: NULL inputs must reach the function (they reset the accumulated result)
	ScalarFunction fun("json_merge_patch", {LogicalType::JSON(), LogicalType::JSON()}, LogicalType::JSON(),
	                   MergePatchFunction, nullptr, nullptr, nullptr, JSONFunctionLocalState::Init);
	fun.varargs = LogicalType::JSON();
	fun.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
	return ScalarFunctionSet(fun);
}
} // namespace duckdb

View File

@@ -0,0 +1,32 @@
#include "json_executors.hpp"
namespace duckdb {
//! Pretty Print a given JSON Document
string_t PrettyPrint(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &, idx_t) {
	D_ASSERT(alc);
	size_t len_size_t;
	auto data = yyjson_val_write_opts(val, JSONCommon::WRITE_PRETTY_FLAG, alc, &len_size_t, nullptr);
	if (data == nullptr) {
		// yyjson returns nullptr on write failure (e.g., invalid UTF-8); previously this built a
		// string_t from the null pointer and an uninitialized length
		throw InvalidInputException("Failed to pretty-print JSON, perhaps it contains invalid UTF-8 characters?");
	}
	idx_t len = len_size_t;
	return string_t(data, len);
}
//! Executes json_pretty(json): pretty-prints every input document
static void PrettyPrintFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	// Inline the type check into the assertion so release builds (where D_ASSERT compiles away)
	// don't carry an unused local
	D_ASSERT(args.data[0].GetType() == LogicalType::VARCHAR || args.data[0].GetType() == LogicalType::JSON());
	JSONExecutors::UnaryExecute<string_t>(args, state, result, PrettyPrint);
}
//! Registers json_pretty for one input type
static void GetPrettyPrintFunctionInternal(ScalarFunctionSet &set, const LogicalType &json) {
	set.AddFunction(ScalarFunction("json_pretty", {json}, LogicalType::VARCHAR, PrettyPrintFunction, nullptr, nullptr,
	                               nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetPrettyPrintFunction() {
	// Only registered for the JSON type (unlike e.g. json_exists, no VARCHAR overload is added here —
	// presumably VARCHAR inputs rely on the implicit VARCHAR -> JSON cast; confirm if adding overloads)
	ScalarFunctionSet set("json_pretty");
	GetPrettyPrintFunctionInternal(set, LogicalType::JSON());
	return set;
}
} // namespace duckdb

View File

@@ -0,0 +1,223 @@
#include "duckdb/execution/column_binding_resolver.hpp"
#include "duckdb/execution/expression_executor.hpp"
#include "duckdb/main/connection.hpp"
#include "duckdb/main/database.hpp"
#include "duckdb/optimizer/optimizer.hpp"
#include "duckdb/parser/parsed_data/create_pragma_function_info.hpp"
#include "duckdb/parser/parser.hpp"
#include "duckdb/planner/planner.hpp"
#include "json_common.hpp"
#include "json_deserializer.hpp"
#include "json_functions.hpp"
#include "json_serializer.hpp"
namespace duckdb {
//-----------------------------------------------------------------------------
// json_serialize_plan
//-----------------------------------------------------------------------------
struct JsonSerializePlanBindData : public FunctionData {
bool skip_if_null = false;
bool skip_if_empty = false;
bool skip_if_default = false;
bool format = false;
bool optimize = false;
JsonSerializePlanBindData(bool skip_if_null_p, bool skip_if_empty_p, bool skip_if_default_p, bool format_p,
bool optimize_p)
: skip_if_null(skip_if_null_p), skip_if_empty(skip_if_empty_p), skip_if_default(skip_if_default_p),
format(format_p), optimize(optimize_p) {
}
public:
unique_ptr<FunctionData> Copy() const override {
return make_uniq<JsonSerializePlanBindData>(skip_if_null, skip_if_empty, skip_if_default, format, optimize);
}
bool Equals(const FunctionData &other_p) const override {
return true;
}
};
static unique_ptr<FunctionData> JsonSerializePlanBind(ClientContext &context, ScalarFunction &bound_function,
vector<unique_ptr<Expression>> &arguments) {
if (arguments.empty()) {
throw BinderException("json_serialize_plan takes at least one argument");
}
if (arguments[0]->return_type != LogicalType::VARCHAR) {
throw InvalidTypeException("json_serialize_plan first argument must be a VARCHAR");
}
// Optional arguments
bool skip_if_null = false;
bool skip_if_empty = false;
bool skip_if_default = false;
bool format = false;
bool optimize = false;
for (idx_t i = 1; i < arguments.size(); i++) {
auto &arg = arguments[i];
if (arg->HasParameter()) {
throw ParameterNotResolvedException();
}
if (!arg->IsFoldable()) {
throw BinderException("json_serialize_plan: arguments must be constant");
}
auto &alias = arg->GetAlias();
if (alias == "skip_null") {
if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
throw BinderException("json_serialize_plan: 'skip_null' argument must be a boolean");
}
skip_if_null = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
} else if (alias == "skip_empty") {
if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
throw BinderException("json_serialize_plan: 'skip_empty' argument must be a boolean");
}
skip_if_empty = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
} else if (alias == "skip_default") {
if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
throw BinderException("json_serialize_plan: 'skip_default' argument must be a boolean");
}
skip_if_default = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
} else if (alias == "format") {
if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
throw BinderException("json_serialize_plan: 'format' argument must be a boolean");
}
format = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
} else if (alias == "optimize") {
if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
throw BinderException("json_serialize_plan: 'optimize' argument must be a boolean");
}
optimize = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
} else {
throw BinderException(StringUtil::Format("json_serialize_plan: Unknown argument '%s'", alias));
}
}
return make_uniq<JsonSerializePlanBindData>(skip_if_null, skip_if_empty, skip_if_default, format, optimize);
}
//! Recursively checks whether every operator in the plan tree supports serialization.
//! Children are visited first, so the deepest unsupported operator's name is reported.
static bool OperatorSupportsSerialization(LogicalOperator &op, string &operator_name) {
	for (auto &child : op.children) {
		if (!OperatorSupportsSerialization(*child, operator_name)) {
			return false;
		}
	}
	if (op.SupportSerialization()) {
		return true;
	}
	operator_name = EnumUtil::ToString(op.type);
	return false;
}
//! Executes json_serialize_plan(sql): parses, plans (and optionally optimizes) each statement in the
//! input SQL, serializing the logical plans to JSON. Errors are captured into a JSON error object
//! instead of aborting the query.
static void JsonSerializePlanFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto &local_state = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = local_state.json_allocator->GetYYAlc();
	auto &inputs = args.data[0];
	auto &func_expr = state.expr.Cast<BoundFunctionExpression>();
	const auto &info = func_expr.bind_info->Cast<JsonSerializePlanBindData>();
	// Planning requires a client context
	if (!state.HasContext()) {
		throw InvalidInputException("json_serialize_plan: No client context available");
	}
	auto &context = state.GetContext();
	UnaryExecutor::Execute<string_t, string_t>(inputs, result, args.size(), [&](string_t input) {
		auto doc = JSONCommon::CreateDocument(alc);
		auto result_obj = yyjson_mut_obj(doc);
		yyjson_mut_doc_set_root(doc, result_obj);
		try {
			Parser parser;
			parser.ParseQuery(input.GetString());
			auto plans_arr = yyjson_mut_arr(doc);
			// One plan per parsed statement
			for (auto &statement : parser.statements) {
				auto stmt = std::move(statement);
				Planner planner(context);
				planner.CreatePlan(std::move(stmt));
				auto plan = std::move(planner.plan);
				// Optionally run the optimizer (only for plans that allow it)
				if (info.optimize && plan->RequireOptimizer()) {
					Optimizer optimizer(*planner.binder, context);
					plan = optimizer.Optimize(std::move(plan));
				}
				// Resolve column bindings and operator types so the serialized plan is self-contained
				ColumnBindingResolver resolver;
				resolver.Verify(*plan);
				resolver.VisitOperator(*plan);
				plan->ResolveOperatorTypes();
				string operator_name;
				if (!OperatorSupportsSerialization(*plan, operator_name)) {
					throw InvalidInputException("Operator '%s' does not support serialization", operator_name);
				}
				auto plan_json =
				    JsonSerializer::Serialize(*plan, doc, info.skip_if_null, info.skip_if_empty, info.skip_if_default);
				yyjson_mut_arr_append(plans_arr, plan_json);
			}
			// Success shape: { "error": false, "plans": [...] }
			yyjson_mut_obj_add_false(doc, result_obj, "error");
			yyjson_mut_obj_add_val(doc, result_obj, "plans", plans_arr);
			size_t len_size_t;
			auto data = yyjson_mut_val_write_opts(result_obj,
			                                      info.format ? JSONCommon::WRITE_PRETTY_FLAG : JSONCommon::WRITE_FLAG,
			                                      alc, &len_size_t, nullptr);
			idx_t len = len_size_t;
			if (data == nullptr) {
				throw SerializationException(
				    "Failed to serialize json, perhaps the query contains invalid utf8 characters?");
			}
			return StringVector::AddString(result, data, len);
		} catch (std::exception &ex) {
			// Failure shape: { "error": true, "error_type": ..., "error_message": ..., <extra info> }
			ErrorData error(ex);
			yyjson_mut_obj_add_true(doc, result_obj, "error");
			// error type and message
			yyjson_mut_obj_add_strcpy(doc, result_obj, "error_type",
			                          StringUtil::Lower(Exception::ExceptionTypeToString(error.Type())).c_str());
			yyjson_mut_obj_add_strcpy(doc, result_obj, "error_message", error.RawMessage().c_str());
			// add extra info
			for (auto &entry : error.ExtraInfo()) {
				yyjson_mut_obj_add_strcpy(doc, result_obj, entry.first.c_str(), entry.second.c_str());
			}
			size_t len_size_t;
			auto data = yyjson_mut_val_write_opts(result_obj,
			                                      info.format ? JSONCommon::WRITE_PRETTY_FLAG : JSONCommon::WRITE_FLAG,
			                                      alc, &len_size_t, nullptr);
			idx_t len = len_size_t;
			// NOTE(review): unlike the success path, "data" is not nullptr-checked here;
			// presumably the error object is always writable — confirm before relying on it
			return StringVector::AddString(result, data, len);
		}
	});
}
ScalarFunctionSet JSONFunctions::GetSerializePlanFunction() {
	// Overloads accept the SQL string plus up to four optional boolean flags; the flags are
	// interpreted by name (alias) in JsonSerializePlanBind, not by position
	ScalarFunctionSet set("json_serialize_plan");
	set.AddFunction(ScalarFunction({LogicalType::VARCHAR}, LogicalType::JSON(), JsonSerializePlanFunction,
	                               JsonSerializePlanBind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BOOLEAN}, LogicalType::JSON(),
	                               JsonSerializePlanFunction, JsonSerializePlanBind, nullptr, nullptr,
	                               JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BOOLEAN, LogicalType::BOOLEAN},
	                               LogicalType::JSON(), JsonSerializePlanFunction, JsonSerializePlanBind, nullptr,
	                               nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction(
	    {LogicalType::VARCHAR, LogicalType::BOOLEAN, LogicalType::BOOLEAN, LogicalType::BOOLEAN}, LogicalType::JSON(),
	    JsonSerializePlanFunction, JsonSerializePlanBind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction(
	    {LogicalType::VARCHAR, LogicalType::BOOLEAN, LogicalType::BOOLEAN, LogicalType::BOOLEAN, LogicalType::BOOLEAN},
	    LogicalType::JSON(), JsonSerializePlanFunction, JsonSerializePlanBind, nullptr, nullptr,
	    JSONFunctionLocalState::Init));
	return set;
}
} // namespace duckdb

View File

@@ -0,0 +1,323 @@
#include "duckdb/execution/expression_executor.hpp"
#include "duckdb/main/connection.hpp"
#include "duckdb/main/database.hpp"
#include "duckdb/parser/parsed_data/create_pragma_function_info.hpp"
#include "duckdb/parser/parser.hpp"
#include "json_deserializer.hpp"
#include "json_functions.hpp"
#include "json_serializer.hpp"
namespace duckdb {
struct JsonSerializeBindData : public FunctionData {
bool skip_if_null = false;
bool skip_if_empty = false;
bool skip_if_default = false;
bool format = false;
JsonSerializeBindData(bool skip_if_null_p, bool skip_if_empty_p, bool skip_if_default_p, bool format_p)
: skip_if_null(skip_if_null_p), skip_if_empty(skip_if_empty_p), skip_if_default(skip_if_default_p),
format(format_p) {
}
public:
unique_ptr<FunctionData> Copy() const override {
return make_uniq<JsonSerializeBindData>(skip_if_null, skip_if_empty, skip_if_default, format);
}
bool Equals(const FunctionData &other_p) const override {
return true;
}
};
//! Bind json_serialize_sql: the first argument must be a VARCHAR query string; the remaining
//! arguments are constant named boolean flags ('skip_null', 'skip_empty', 'skip_default', 'format').
static unique_ptr<FunctionData> JsonSerializeBind(ClientContext &context, ScalarFunction &bound_function,
                                                  vector<unique_ptr<Expression>> &arguments) {
	if (arguments.empty()) {
		throw BinderException("json_serialize_sql takes at least one argument");
	}
	if (arguments[0]->return_type != LogicalType::VARCHAR) {
		throw InvalidTypeException("json_serialize_sql first argument must be a VARCHAR");
	}

	// Optional flags, all defaulting to false
	bool skip_if_null = false;
	bool skip_if_empty = false;
	bool skip_if_default = false;
	bool format = false;

	// Evaluates a constant boolean flag argument, throwing if it is not a boolean
	const auto get_flag = [&](const unique_ptr<Expression> &arg, const char *flag_name) {
		if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
			throw BinderException(
			    StringUtil::Format("json_serialize_sql: '%s' argument must be a boolean", flag_name));
		}
		return BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
	};

	for (idx_t arg_idx = 1; arg_idx < arguments.size(); arg_idx++) {
		auto &arg = arguments[arg_idx];
		if (arg->HasParameter()) {
			throw ParameterNotResolvedException();
		}
		if (!arg->IsFoldable()) {
			throw BinderException("json_serialize_sql: arguments must be constant");
		}
		auto &alias = arg->GetAlias();
		if (alias == "skip_null") {
			skip_if_null = get_flag(arg, "skip_null");
		} else if (alias == "skip_empty") {
			skip_if_empty = get_flag(arg, "skip_empty");
		} else if (alias == "format") {
			format = get_flag(arg, "format");
		} else if (alias == "skip_default") {
			skip_if_default = get_flag(arg, "skip_default");
		} else {
			throw BinderException(StringUtil::Format("json_serialize_sql: Unknown argument '%s'", alias));
		}
	}
	return make_uniq<JsonSerializeBindData>(skip_if_null, skip_if_empty, skip_if_default, format);
}
//! Execute json_serialize_sql: parse each input string as SQL and serialize the resulting
//! SELECT statements to a JSON object {"error": false, "statements": [...]}.
//! On any failure the error is captured and returned as JSON instead of being thrown:
//! {"error": true, "error_type": ..., "error_message": ..., <extra info>}.
static void JsonSerializeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto &local_state = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = local_state.json_allocator->GetYYAlc();
	auto &inputs = args.data[0];

	// Flags were resolved at bind time
	auto &func_expr = state.expr.Cast<BoundFunctionExpression>();
	const auto &info = func_expr.bind_info->Cast<JsonSerializeBindData>();

	UnaryExecutor::Execute<string_t, string_t>(inputs, result, args.size(), [&](string_t input) {
		// Fresh mutable document per row; the root object is shared by both success and error paths
		auto doc = JSONCommon::CreateDocument(alc);
		auto result_obj = yyjson_mut_obj(doc);
		yyjson_mut_doc_set_root(doc, result_obj);

		try {
			auto parser = Parser();
			parser.ParseQuery(input.GetString());

			auto statements_arr = yyjson_mut_arr(doc);
			for (auto &statement : parser.statements) {
				// Only SELECT statements are supported by the JSON serializer
				if (statement->type != StatementType::SELECT_STATEMENT) {
					throw NotImplementedException("Only SELECT statements can be serialized to json!");
				}
				auto &select = statement->Cast<SelectStatement>();
				auto json =
				    JsonSerializer::Serialize(select, doc, info.skip_if_null, info.skip_if_empty, info.skip_if_default);
				yyjson_mut_arr_append(statements_arr, json);
			}

			yyjson_mut_obj_add_false(doc, result_obj, "error");
			yyjson_mut_obj_add_val(doc, result_obj, "statements", statements_arr);

			size_t len_size_t;
			auto data = yyjson_mut_val_write_opts(result_obj,
			                                      info.format ? JSONCommon::WRITE_PRETTY_FLAG : JSONCommon::WRITE_FLAG,
			                                      alc, &len_size_t, nullptr);
			idx_t len = len_size_t;
			// yyjson returns nullptr on write failure (e.g. invalid UTF-8 in the input)
			if (data == nullptr) {
				throw SerializationException(
				    "Failed to serialize json, perhaps the query contains invalid utf8 characters?");
			}
			return StringVector::AddString(result, data, len);

		} catch (std::exception &ex) {
			// Report the error as a JSON payload rather than propagating the exception
			ErrorData error(ex);
			yyjson_mut_obj_add_true(doc, result_obj, "error");
			yyjson_mut_obj_add_strcpy(doc, result_obj, "error_type",
			                          StringUtil::Lower(Exception::ExceptionTypeToString(error.Type())).c_str());
			yyjson_mut_obj_add_strcpy(doc, result_obj, "error_message", error.RawMessage().c_str());
			// add extra info
			for (auto &entry : error.ExtraInfo()) {
				yyjson_mut_obj_add_strcpy(doc, result_obj, entry.first.c_str(), entry.second.c_str());
			}

			size_t len_size_t;
			auto data = yyjson_mut_val_write_opts(result_obj,
			                                      info.format ? JSONCommon::WRITE_PRETTY_FLAG : JSONCommon::WRITE_FLAG,
			                                      alc, &len_size_t, nullptr);
			idx_t len = len_size_t;
			return StringVector::AddString(result, data, len);
		}
	});
}
ScalarFunctionSet JSONFunctions::GetSerializeSqlFunction() {
	ScalarFunctionSet set("json_serialize_sql");
	// Register overloads taking the query string plus zero to four boolean flag arguments
	vector<LogicalType> argument_types {LogicalType::VARCHAR};
	for (idx_t flag_count = 0; flag_count <= 4; flag_count++) {
		set.AddFunction(ScalarFunction(argument_types, LogicalType::JSON(), JsonSerializeFunction, JsonSerializeBind,
		                               nullptr, nullptr, JSONFunctionLocalState::Init));
		argument_types.emplace_back(LogicalType::BOOLEAN);
	}
	return set;
}
//----------------------------------------------------------------------
// JSON DESERIALIZE
//----------------------------------------------------------------------
//! Parse a JSON payload produced by json_serialize_sql back into SELECT statements.
//! Expects {"error": false, "statements": [...]}; if the payload encodes a serialization
//! error ({"error": true, ...}), that error is re-raised as a ParserException.
static vector<unique_ptr<SelectStatement>> DeserializeSelectStatement(string_t input, yyjson_alc *alc) {
	auto doc = yyjson_doc_ptr(JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc));
	if (!doc) {
		throw ParserException("Could not parse json");
	}
	auto root = doc->root;
	// Propagate an error captured at serialization time
	auto err = yyjson_obj_get(root, "error");
	if (err && yyjson_is_true(err)) {
		auto err_type = yyjson_obj_get(root, "error_type");
		auto err_msg = yyjson_obj_get(root, "error_message");
		if (err_type && err_msg) {
			throw ParserException("Error parsing json: %s: %s", yyjson_get_str(err_type), yyjson_get_str(err_msg));
		}
		throw ParserException(
		    "Error parsing json, expected error property to contain 'error_type' and 'error_message'");
	}
	auto statements = yyjson_obj_get(root, "statements");
	if (!statements || !yyjson_is_arr(statements)) {
		throw ParserException("Error parsing json: no statements array");
	}
	auto size = yyjson_arr_size(statements);
	if (size == 0) {
		throw ParserException("Error parsing json: no statements");
	}
	// Deserialize each statement in the array
	vector<unique_ptr<SelectStatement>> result;
	idx_t idx;
	idx_t max;
	yyjson_val *stmt_json;
	yyjson_arr_foreach(statements, idx, max, stmt_json) {
		JsonDeserializer deserializer(stmt_json, doc);
		auto stmt = SelectStatement::Deserialize(deserializer);
		if (!stmt->node) {
			throw ParserException("Error parsing json: no select node found in json");
		}
		result.push_back(std::move(stmt));
	}
	return result;
}
//----------------------------------------------------------------------
// JSON DESERIALIZE SQL FUNCTION
//----------------------------------------------------------------------
//! Execute json_deserialize_sql: turn a serialized JSON plan back into its SQL text.
static void JsonDeserializeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto &local_state = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = local_state.json_allocator->GetYYAlc();
	auto &inputs = args.data[0];

	UnaryExecutor::Execute<string_t, string_t>(inputs, result, args.size(), [&](string_t input) {
		auto stmts = DeserializeSelectStatement(input, alc);
		// Render every statement back to SQL, separated by "; "
		string sql;
		bool first = true;
		for (auto &stmt : stmts) {
			if (!first) {
				sql += "; ";
			}
			sql += stmt->ToString();
			first = false;
		}
		return StringVector::AddString(result, sql);
	});
}
ScalarFunctionSet JSONFunctions::GetDeserializeSqlFunction() {
	ScalarFunctionSet set("json_deserialize_sql");
	// Single overload: JSON in, SQL text out
	ScalarFunction fun({LogicalType::JSON()}, LogicalType::VARCHAR, JsonDeserializeFunction, nullptr, nullptr, nullptr,
	                   JSONFunctionLocalState::Init);
	set.AddFunction(fun);
	return set;
}
//----------------------------------------------------------------------
// JSON EXECUTE SERIALIZED SQL (PRAGMA)
//----------------------------------------------------------------------
//! PRAGMA json_execute_serialized_sql: deserialize a single serialized SELECT statement
//! and return its SQL text, which the pragma machinery then executes.
static string ExecuteJsonSerializedSqlPragmaFunction(ClientContext &context, const FunctionParameters &parameters) {
	JSONFunctionLocalState local_state(context);
	auto alc = local_state.json_allocator->GetYYAlc();

	const auto serialized = parameters.values[0].GetValueUnsafe<string_t>();
	auto statements = DeserializeSelectStatement(serialized, alc);
	if (statements.size() != 1) {
		throw BinderException("json_execute_serialized_sql pragma expects exactly one statement");
	}
	return statements[0]->ToString();
}
PragmaFunctionSet JSONFunctions::GetExecuteJsonSerializedSqlPragmaFunction() {
	// Pragma variant: takes the serialized plan as a single VARCHAR parameter
	auto fun = PragmaFunction::PragmaCall("json_execute_serialized_sql", ExecuteJsonSerializedSqlPragmaFunction,
	                                      {LogicalType::VARCHAR});
	return PragmaFunctionSet(fun);
}
//----------------------------------------------------------------------
// JSON EXECUTE SERIALIZED SQL (TABLE FUNCTION)
//----------------------------------------------------------------------
struct ExecuteSqlTableFunction {
struct BindData : public TableFunctionData {
shared_ptr<Relation> plan;
unique_ptr<QueryResult> result;
unique_ptr<Connection> con;
};
static unique_ptr<FunctionData> Bind(ClientContext &context, TableFunctionBindInput &input,
vector<LogicalType> &return_types, vector<string> &names) {
JSONFunctionLocalState local_state(context);
auto alc = local_state.json_allocator->GetYYAlc();
auto result = make_uniq<BindData>();
result->con = make_uniq<Connection>(*context.db);
if (input.inputs[0].IsNull()) {
throw BinderException("json_execute_serialized_sql cannot execute NULL plan");
}
auto serialized = input.inputs[0].GetValueUnsafe<string>();
auto stmts = DeserializeSelectStatement(serialized, alc);
if (stmts.size() != 1) {
throw BinderException("json_execute_serialized_sql expects exactly one statement");
}
result->plan = result->con->RelationFromQuery(std::move(stmts[0]));
for (auto &col : result->plan->Columns()) {
return_types.emplace_back(col.Type());
names.emplace_back(col.Name());
}
return std::move(result);
}
static void Function(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
auto &data = (BindData &)*data_p.bind_data;
if (!data.result) {
data.result = data.plan->Execute();
}
auto result_chunk = data.result->Fetch();
if (!result_chunk) {
return;
}
output.Move(*result_chunk);
}
};
TableFunctionSet JSONFunctions::GetExecuteJsonSerializedSqlFunction() {
	// Table-function variant: the serialized plan is passed as a VARCHAR argument
	TableFunction fun("json_execute_serialized_sql", {LogicalType::VARCHAR}, ExecuteSqlTableFunction::Function,
	                  ExecuteSqlTableFunction::Bind);
	TableFunctionSet set(fun);
	return set;
}
} // namespace duckdb

View File

@@ -0,0 +1,809 @@
#include "json_structure.hpp"
#include "duckdb/common/enum_util.hpp"
#include "duckdb/common/extra_type_info.hpp"
#include "json_executors.hpp"
#include "json_scan.hpp"
#include "json_transform.hpp"
namespace duckdb {
//! Whether a sniffed type participates in numeric widening during structure merging
static bool IsNumeric(LogicalTypeId type) {
	switch (type) {
	case LogicalTypeId::DOUBLE:
	case LogicalTypeId::UBIGINT:
	case LogicalTypeId::BIGINT:
		return true;
	default:
		return false;
	}
}
//! Widen two distinct numeric types: any combination involving DOUBLE yields DOUBLE,
//! otherwise the remaining (U)BIGINT mix collapses to BIGINT.
static LogicalTypeId MaxNumericType(const LogicalTypeId &a, const LogicalTypeId &b) {
	D_ASSERT(a != b);
	const bool has_double = a == LogicalTypeId::DOUBLE || b == LogicalTypeId::DOUBLE;
	return has_double ? LogicalTypeId::DOUBLE : LogicalTypeId::BIGINT;
}
//! Root node: starts with zero observed values
JSONStructureNode::JSONStructureNode() : count(0), null_count(0) {
}
//! Keyed node: copies the key into an owned string so it outlives the yyjson document it came from
JSONStructureNode::JSONStructureNode(const char *key_ptr, const size_t key_len) : JSONStructureNode() {
	key = make_uniq<string>(key_ptr, key_len);
}
//! Construct from a yyjson key/value pair and immediately extract the value's structure
JSONStructureNode::JSONStructureNode(yyjson_val *key_p, yyjson_val *val_p, const bool ignore_errors)
    : JSONStructureNode(unsafe_yyjson_get_str(key_p), unsafe_yyjson_get_len(key_p)) {
	JSONStructure::ExtractStructure(val_p, *this, ignore_errors);
}
//! Member-wise swap used to implement the move constructor and move assignment below
static void SwapJSONStructureNode(JSONStructureNode &a, JSONStructureNode &b) noexcept {
	std::swap(a.key, b.key);
	std::swap(a.initialized, b.initialized);
	std::swap(a.descriptions, b.descriptions);
	std::swap(a.count, b.count);
	std::swap(a.null_count, b.null_count);
}
//! Move construction: swap with a default-constructed node
JSONStructureNode::JSONStructureNode(JSONStructureNode &&other) noexcept {
	SwapJSONStructureNode(*this, other);
}
//! Move assignment via swap; 'other' takes this node's old state and releases it on destruction
JSONStructureNode &JSONStructureNode::operator=(JSONStructureNode &&other) noexcept {
	SwapJSONStructureNode(*this, other);
	return *this;
}
//! Find the description matching 'type', merging numeric types where possible, or create a new one.
//! NULL never adds a description of its own alongside other types: a lone NULL description is
//! overridden by the first concrete type, and NULL observed after that is folded into the last one.
JSONStructureDescription &JSONStructureNode::GetOrCreateDescription(const LogicalTypeId type) {
	if (descriptions.empty()) {
		// Empty, just put this type in there
		descriptions.emplace_back(type);
		return descriptions.back();
	}
	if (descriptions.size() == 1 && descriptions[0].type == LogicalTypeId::SQLNULL) {
		// Only a NULL in there, override
		descriptions[0].type = type;
		return descriptions[0];
	}
	if (type == LogicalTypeId::SQLNULL) {
		// 'descriptions' is non-empty, so let's not add NULL
		return descriptions.back();
	}
	// Check if type is already in there or if we can merge numerics
	const auto is_numeric = IsNumeric(type);
	for (auto &description : descriptions) {
		if (type == description.type) {
			return description;
		}
		if (is_numeric && IsNumeric(description.type)) {
			// Two different numeric types merge into the wider one
			description.type = MaxNumericType(type, description.type);
			return description;
		}
	}
	// Type was not there, create a new description
	descriptions.emplace_back(type);
	return descriptions.back();
}
bool JSONStructureNode::ContainsVarchar() const {
if (descriptions.size() != 1) {
// We can't refine types if we have more than 1 description (yet), defaults to JSON type for now
return false;
}
auto &description = descriptions[0];
if (description.type == LogicalTypeId::VARCHAR) {
return true;
}
for (auto &child : description.children) {
if (child.ContainsVarchar()) {
return true;
}
}
return false;
}
//! Seed VARCHAR nodes (up to 'max_depth') with candidate types that string values may be
//! narrowed to. Candidates are tried from the back of the vector, so the list is ordered
//! from least to most preferred.
void JSONStructureNode::InitializeCandidateTypes(const idx_t max_depth, const bool convert_strings_to_integers,
                                                 const idx_t depth) {
	if (depth >= max_depth) {
		return;
	}
	if (descriptions.size() != 1) {
		// We can't refine types if we have more than 1 description (yet), defaults to JSON type for now
		return;
	}
	auto &description = descriptions[0];
	if (description.type == LogicalTypeId::VARCHAR && !initialized) {
		// We loop through the candidate types and format templates from back to front
		if (convert_strings_to_integers) {
			// BIGINT is only a candidate when string-to-integer conversion is enabled
			description.candidate_types = {LogicalTypeId::UUID, LogicalTypeId::BIGINT, LogicalTypeId::TIMESTAMP,
			                               LogicalTypeId::DATE, LogicalTypeId::TIME};
		} else {
			description.candidate_types = {LogicalTypeId::UUID, LogicalTypeId::TIMESTAMP, LogicalTypeId::DATE,
			                               LogicalTypeId::TIME};
		}
		initialized = true;
	} else {
		// Not a refinable leaf: recurse into the children instead
		for (auto &child : description.children) {
			child.InitializeCandidateTypes(max_depth, convert_strings_to_integers, depth + 1);
		}
	}
}
//! Dispatch candidate-type refinement over the sampled values based on this node's type.
//! Nodes with multiple descriptions (inconsistent types) default to JSON and are skipped,
//! as are subtrees that contain no VARCHAR to refine.
void JSONStructureNode::RefineCandidateTypes(yyjson_val *vals[], const idx_t val_count, Vector &string_vector,
                                             ArenaAllocator &allocator, MutableDateFormatMap &date_format_map) {
	if (descriptions.size() != 1 || !ContainsVarchar()) {
		return;
	}
	switch (descriptions[0].type) {
	case LogicalTypeId::LIST:
		return RefineCandidateTypesArray(vals, val_count, string_vector, allocator, date_format_map);
	case LogicalTypeId::STRUCT:
		return RefineCandidateTypesObject(vals, val_count, string_vector, allocator, date_format_map);
	case LogicalTypeId::VARCHAR:
		return RefineCandidateTypesString(vals, val_count, string_vector, date_format_map);
	default:
		return;
	}
}
//! Refine a LIST node: flatten the elements of all sampled arrays into one contiguous,
//! arena-allocated pointer array and refine the single child node over them.
void JSONStructureNode::RefineCandidateTypesArray(yyjson_val *vals[], const idx_t val_count, Vector &string_vector,
                                                  ArenaAllocator &allocator, MutableDateFormatMap &date_format_map) {
	D_ASSERT(descriptions.size() == 1 && descriptions[0].type == LogicalTypeId::LIST);
	auto &desc = descriptions[0];
	D_ASSERT(desc.children.size() == 1);
	auto &child = desc.children[0];

	// First pass: count total elements so we can allocate the flattened array in one go
	idx_t total_list_size = 0;
	for (idx_t i = 0; i < val_count; i++) {
		if (vals[i] && !unsafe_yyjson_is_null(vals[i])) {
			D_ASSERT(yyjson_is_arr(vals[i]));
			total_list_size += unsafe_yyjson_get_len(vals[i]);
		}
	}

	// Second pass: copy the element pointers into the arena-backed buffer
	idx_t offset = 0;
	auto child_vals =
	    reinterpret_cast<yyjson_val **>(allocator.AllocateAligned(total_list_size * sizeof(yyjson_val *)));

	size_t idx, max;
	yyjson_val *child_val;
	for (idx_t i = 0; i < val_count; i++) {
		if (vals[i] && !unsafe_yyjson_is_null(vals[i])) {
			yyjson_arr_foreach(vals[i], idx, max, child_val) {
				child_vals[offset++] = child_val;
			}
		}
	}
	child.RefineCandidateTypes(child_vals, total_list_size, string_vector, allocator, date_format_map);
}
//! Refine a STRUCT node: regroup the sampled objects column-wise — one arena-allocated
//! pointer array per known child key — then refine each child over its own column.
//! Keys absent from an object (and NULL/missing objects) yield nullptr entries so every
//! child array stays aligned with the sample index.
void JSONStructureNode::RefineCandidateTypesObject(yyjson_val *vals[], const idx_t val_count, Vector &string_vector,
                                                   ArenaAllocator &allocator, MutableDateFormatMap &date_format_map) {
	D_ASSERT(descriptions.size() == 1 && descriptions[0].type == LogicalTypeId::STRUCT);
	auto &desc = descriptions[0];

	const idx_t child_count = desc.children.size();
	// One val_count-sized pointer array per child key
	vector<yyjson_val **> child_vals;
	child_vals.reserve(child_count);
	for (idx_t child_idx = 0; child_idx < child_count; child_idx++) {
		child_vals.emplace_back(
		    reinterpret_cast<yyjson_val **>(allocator.AllocateAligned(val_count * sizeof(yyjson_val *))));
	}

	// Scratch bitmap tracking which keys the current object supplied
	const auto found_keys = reinterpret_cast<bool *>(allocator.AllocateAligned(sizeof(bool) * child_count));
	const auto &key_map = desc.key_map;

	size_t idx, max;
	yyjson_val *child_key, *child_val;
	for (idx_t i = 0; i < val_count; i++) {
		if (vals[i] && !unsafe_yyjson_is_null(vals[i])) {
			idx_t found_key_count = 0;
			memset(found_keys, false, child_count);

			D_ASSERT(yyjson_is_obj(vals[i]));
			yyjson_obj_foreach(vals[i], idx, max, child_key, child_val) {
				D_ASSERT(yyjson_is_str(child_key));
				const auto key_ptr = unsafe_yyjson_get_str(child_key);
				const auto key_len = unsafe_yyjson_get_len(child_key);
				// Every key must already be known from structure extraction
				auto it = key_map.find({key_ptr, key_len});
				D_ASSERT(it != key_map.end());
				const auto child_idx = it->second;
				child_vals[child_idx][i] = child_val;
				// Count each key once, even if the object repeats it
				found_key_count += !found_keys[child_idx];
				found_keys[child_idx] = true;
			}
			if (found_key_count != child_count) {
				// Set child val to nullptr so recursion doesn't break
				for (idx_t child_idx = 0; child_idx < child_count; child_idx++) {
					if (!found_keys[child_idx]) {
						child_vals[child_idx][i] = nullptr;
					}
				}
			}
		} else {
			// NULL or missing object: blank out the whole row
			for (idx_t child_idx = 0; child_idx < child_count; child_idx++) {
				child_vals[child_idx][i] = nullptr;
			}
		}
	}

	for (idx_t child_idx = 0; child_idx < child_count; child_idx++) {
		desc.children[child_idx].RefineCandidateTypes(child_vals[child_idx], val_count, string_vector, allocator,
		                                              date_format_map);
	}
}
//! Refine a VARCHAR leaf: materialize the sampled strings into 'string_vector',
//! then eliminate candidate types the strings cannot be cast to.
void JSONStructureNode::RefineCandidateTypesString(yyjson_val *vals[], const idx_t val_count, Vector &string_vector,
                                                   MutableDateFormatMap &date_format_map) {
	D_ASSERT(descriptions.size() == 1 && descriptions[0].type == LogicalTypeId::VARCHAR);
	if (descriptions[0].candidate_types.empty()) {
		return;
	}
	// Default-constructed options; shared across calls since GetStringVector only reads them
	static JSONTransformOptions OPTIONS;
	JSONTransform::GetStringVector(vals, val_count, LogicalType::SQLNULL, string_vector, OPTIONS);
	EliminateCandidateTypes(val_count, string_vector, date_format_map);
}
//! Pop candidate types from the back of the list until one is found to which every
//! sampled string can be cast (via date/timestamp formats where available, otherwise
//! via a default try-cast). Stops at the first fitting candidate.
void JSONStructureNode::EliminateCandidateTypes(const idx_t vec_count, Vector &string_vector,
                                                MutableDateFormatMap &date_format_map) {
	D_ASSERT(descriptions.size() == 1 && descriptions[0].type == LogicalTypeId::VARCHAR);
	auto &candidate_types = descriptions[0].candidate_types;
	while (!candidate_types.empty()) {
		const auto type = candidate_types.back();
		Vector result_vector(type, vec_count);
		bool candidate_fits;
		if (date_format_map.HasFormats(type)) {
			// Date/timestamp: try the configured format templates
			candidate_fits = EliminateCandidateFormats(vec_count, string_vector, result_vector, date_format_map);
		} else {
			string error_message;
			candidate_fits =
			    VectorOperations::DefaultTryCast(string_vector, result_vector, vec_count, &error_message, true);
		}
		if (candidate_fits) {
			return;
		}
		candidate_types.pop_back();
	}
}
//! Returns true iff every valid (non-NULL) string in the vector parses under 'format'.
template <class OP, class T>
bool TryParse(Vector &string_vector, StrpTimeFormat &format, const idx_t count) {
	const auto strings = FlatVector::GetData<string_t>(string_vector);
	const auto &validity = FlatVector::Validity(string_vector);
	const bool all_valid = validity.AllValid();

	T result;
	string error_message;
	for (idx_t row_idx = 0; row_idx < count; row_idx++) {
		if (!all_valid && !validity.RowIsValid(row_idx)) {
			continue; // NULLs trivially satisfy any format
		}
		if (!OP::template Operation<T>(format, strings[row_idx], result, error_message)) {
			return false;
		}
	}
	return true;
}
//! Try the registered date/timestamp formats (from last to first) against the sampled
//! strings. On the first format that parses everything, drop the formats after it and
//! return true; return false if no format fits.
bool JSONStructureNode::EliminateCandidateFormats(const idx_t vec_count, Vector &string_vector,
                                                  const Vector &result_vector, MutableDateFormatMap &date_format_map) {
	D_ASSERT(descriptions.size() == 1 && descriptions[0].type == LogicalTypeId::VARCHAR);
	const auto type = result_vector.GetType().id();
	auto i = date_format_map.NumberOfFormats(type);
	for (; i != 0; i--) {
		StrpTimeFormat format;
		// Format may have been removed concurrently; skip if no longer present
		if (!date_format_map.GetFormatAtIndex(type, i - 1, format)) {
			continue;
		}
		bool success;
		switch (type) {
		case LogicalTypeId::DATE:
			success = TryParse<TryParseDate, date_t>(string_vector, format, vec_count);
			break;
		case LogicalTypeId::TIMESTAMP:
			success = TryParse<TryParseTimeStamp, timestamp_t>(string_vector, format, vec_count);
			break;
		default:
			throw InternalException("No date/timestamp formats for %s", EnumUtil::ToString(type));
		}
		if (success) {
			// Keep only formats up to and including the one that worked
			date_format_map.ShrinkFormatsToSize(type, i);
			return true;
		}
	}
	return false;
}
//! Description of one observed logical type under a structure node
JSONStructureDescription::JSONStructureDescription(const LogicalTypeId type_p) : type(type_p) {
}
//! Member-wise swap used to implement the move constructor and move assignment below
static void SwapJSONStructureDescription(JSONStructureDescription &a, JSONStructureDescription &b) noexcept {
	std::swap(a.type, b.type);
	std::swap(a.key_map, b.key_map);
	std::swap(a.children, b.children);
	std::swap(a.candidate_types, b.candidate_types);
}
//! Move construction: swap with a default-constructed description
JSONStructureDescription::JSONStructureDescription(JSONStructureDescription &&other) noexcept {
	SwapJSONStructureDescription(*this, other);
}
//! Move assignment via swap; 'other' takes this description's old state
JSONStructureDescription &JSONStructureDescription::operator=(JSONStructureDescription &&other) noexcept {
	SwapJSONStructureDescription(*this, other);
	return *this;
}
//! Get the single element node of a LIST description, creating it on first use
JSONStructureNode &JSONStructureDescription::GetOrCreateChild() {
	D_ASSERT(type == LogicalTypeId::LIST);
	// Lists have exactly one child node describing all elements
	if (children.empty()) {
		children.emplace_back();
	}
	D_ASSERT(children.size() == 1);
	return children[0];
}
//! Find the child node for an object key, creating it if absent.
//! The key_map stores a view into the child's own heap-allocated key string
//! (a unique_ptr<string>), so the view stays valid even when 'children' reallocates.
JSONStructureNode &JSONStructureDescription::GetOrCreateChild(const char *key_ptr, const size_t key_size) {
	// Check if there is already a child with the same key
	const JSONKey temp_key {key_ptr, key_size};
	const auto it = key_map.find(temp_key);
	if (it != key_map.end()) {
		return children[it->second]; // Found it
	}

	// Didn't find, create a new child
	children.emplace_back(key_ptr, key_size);
	const auto &persistent_key_string = *children.back().key;
	JSONKey new_key {persistent_key_string.c_str(), persistent_key_string.length()};
	key_map.emplace(new_key, children.size() - 1);
	return children.back();
}
//! Get-or-create the child for a yyjson key and fold the value's structure into it
JSONStructureNode &JSONStructureDescription::GetOrCreateChild(yyjson_val *key, yyjson_val *val,
                                                              const bool ignore_errors) {
	D_ASSERT(yyjson_is_str(key));
	const auto key_ptr = unsafe_yyjson_get_str(key);
	const auto key_len = unsafe_yyjson_get_len(key);
	auto &child = GetOrCreateChild(key_ptr, key_len);
	JSONStructure::ExtractStructure(val, child, ignore_errors);
	return child;
}
//! Fold a JSON array into the node: all elements share one child node under a LIST description
static void ExtractStructureArray(yyjson_val *arr, JSONStructureNode &node, const bool ignore_errors) {
	D_ASSERT(yyjson_is_arr(arr));
	auto &description = node.GetOrCreateDescription(LogicalTypeId::LIST);
	auto &child = description.GetOrCreateChild();

	size_t idx, max;
	yyjson_val *val;
	yyjson_arr_foreach(arr, idx, max, val) {
		JSONStructure::ExtractStructure(val, child, ignore_errors);
	}
}
//! Fold a JSON object into the node: one child per key under a STRUCT description.
//! Unless errors are ignored, duplicate keys — exact or differing only in case — raise
//! a format error (case-insensitive duplicates would collide as struct field names).
static void ExtractStructureObject(yyjson_val *obj, JSONStructureNode &node, const bool ignore_errors) {
	D_ASSERT(yyjson_is_obj(obj));
	auto &description = node.GetOrCreateDescription(LogicalTypeId::STRUCT);

	// Keep track of keys so we can detect duplicates
	unordered_set<string> obj_keys;
	case_insensitive_set_t ci_obj_keys;

	size_t idx, max;
	yyjson_val *key, *val;
	yyjson_obj_foreach(obj, idx, max, key, val) {
		const string obj_key(unsafe_yyjson_get_str(key), unsafe_yyjson_get_len(key));
		auto insert_result = obj_keys.insert(obj_key);
		if (!ignore_errors && !insert_result.second) { // Exact match
			JSONCommon::ThrowValFormatError("Duplicate key \"" + obj_key + "\" in object %s", obj);
		}
		insert_result = ci_obj_keys.insert(obj_key);
		if (!ignore_errors && !insert_result.second) { // Case-insensitive match
			// *insert_result.first is the previously-seen key that clashes
			JSONCommon::ThrowValFormatError("Duplicate key (different case) \"" + obj_key + "\" and \"" +
			                                    *insert_result.first + "\" in object %s",
			                                obj);
		}
		description.GetOrCreateChild(key, val, ignore_errors);
	}
}
//! Fold a scalar JSON value into the node by recording its logical type
static void ExtractStructureVal(yyjson_val *val, JSONStructureNode &node) {
	D_ASSERT(!yyjson_is_arr(val) && !yyjson_is_obj(val));
	node.GetOrCreateDescription(JSONCommon::ValTypeToLogicalTypeId(val));
}
//! Fold one JSON value into the structure node, updating the value/null counters and
//! dispatching to the array/object/scalar handlers based on the yyjson type tag.
void JSONStructure::ExtractStructure(yyjson_val *val, JSONStructureNode &node, const bool ignore_errors) {
	node.count++;
	const auto tag = yyjson_get_tag(val);
	if (tag == (YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE)) {
		node.null_count++;
	}

	switch (tag) {
	case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
		return ExtractStructureArray(val, node, ignore_errors);
	case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
		return ExtractStructureObject(val, node, ignore_errors);
	default:
		// Scalars and NULL
		return ExtractStructureVal(val, node);
	}
}
//! Convenience wrapper: extract the structure of a single value into a fresh node
JSONStructureNode ExtractStructureInternal(yyjson_val *val, const bool ignore_errors) {
	JSONStructureNode node;
	JSONStructure::ExtractStructure(val, node, ignore_errors);
	return node;
}
//! Forward declaration for recursion
static yyjson_mut_val *ConvertStructure(const JSONStructureNode &node, yyjson_mut_doc *doc);
//! Render a LIST node as a single-element JSON array holding the child's structure
static yyjson_mut_val *ConvertStructureArray(const JSONStructureNode &node, yyjson_mut_doc *doc) {
	D_ASSERT(node.descriptions.size() == 1 && node.descriptions[0].type == LogicalTypeId::LIST);
	const auto &desc = node.descriptions[0];
	D_ASSERT(desc.children.size() == 1);

	const auto arr = yyjson_mut_arr(doc);
	yyjson_mut_arr_append(arr, ConvertStructure(desc.children[0], doc));
	return arr;
}
//! Render a STRUCT node as a JSON object mapping each key to its child's structure;
//! an empty struct ("{}") is rendered as the JSON type name instead.
static yyjson_mut_val *ConvertStructureObject(const JSONStructureNode &node, yyjson_mut_doc *doc) {
	D_ASSERT(node.descriptions.size() == 1 && node.descriptions[0].type == LogicalTypeId::STRUCT);
	auto &desc = node.descriptions[0];
	if (desc.children.empty()) {
		// Empty struct - let's do JSON instead
		return yyjson_mut_str(doc, LogicalType::JSON_TYPE_NAME);
	}

	const auto obj = yyjson_mut_obj(doc);
	for (auto &child : desc.children) {
		D_ASSERT(child.key);
		yyjson_mut_obj_add(obj, yyjson_mut_strn(doc, child.key->c_str(), child.key->length()),
		                   ConvertStructure(child, doc));
	}
	return obj;
}
//! Render a structure node as a mutable JSON value: "NULL" for unobserved nodes,
//! the JSON type name for inconsistent nodes, arrays/objects recursively, and the
//! logical type name for scalars.
static yyjson_mut_val *ConvertStructure(const JSONStructureNode &node, yyjson_mut_doc *doc) {
	if (node.descriptions.empty()) {
		return yyjson_mut_str(doc, JSONCommon::TYPE_STRING_NULL);
	}
	if (node.descriptions.size() != 1) { // Inconsistent types, so we resort to JSON
		return yyjson_mut_str(doc, LogicalType::JSON_TYPE_NAME);
	}
	auto &desc = node.descriptions[0];
	D_ASSERT(desc.type != LogicalTypeId::INVALID);
	switch (desc.type) {
	case LogicalTypeId::LIST:
		return ConvertStructureArray(node, doc);
	case LogicalTypeId::STRUCT:
		return ConvertStructureObject(node, doc);
	default:
		return yyjson_mut_str(doc, EnumUtil::ToChars(desc.type));
	}
}
//! Scalar kernel for json_structure: extract the value's structure (ignoring errors)
//! and write it back out as a JSON string
static string_t JSONStructureFunction(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &, idx_t) {
	return JSONCommon::WriteVal<yyjson_mut_val>(
	    ConvertStructure(ExtractStructureInternal(val, true), yyjson_mut_doc_new(alc)), alc);
}
//! Vectorized entry point for json_structure
static void StructureFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::UnaryExecute<string_t>(args, state, result, JSONStructureFunction);
}
//! Register a json_structure overload for the given input type
static void GetStructureFunctionInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	ScalarFunction fun({input_type}, LogicalType::JSON(), StructureFunction, nullptr, nullptr, nullptr,
	                   JSONFunctionLocalState::Init);
	set.AddFunction(fun);
}
ScalarFunctionSet JSONFunctions::GetStructureFunction() {
	ScalarFunctionSet set("json_structure");
	// Accept both plain VARCHAR and JSON-typed input
	for (const auto &input_type : {LogicalType::VARCHAR, LogicalType::JSON()}) {
		GetStructureFunctionInternal(set, input_type);
	}
	return set;
}
//! Convert a LIST structure node to a LIST logical type by recursing into its single child
static LogicalType StructureToTypeArray(ClientContext &context, const JSONStructureNode &node, const idx_t max_depth,
                                        const double field_appearance_threshold, const idx_t map_inference_threshold,
                                        const idx_t depth, const LogicalType &null_type) {
	D_ASSERT(node.descriptions.size() == 1 && node.descriptions[0].type == LogicalTypeId::LIST);
	const auto &desc = node.descriptions[0];
	D_ASSERT(desc.children.size() == 1);

	return LogicalType::LIST(JSONStructure::StructureToType(context, desc.children[0], max_depth,
	                                                        field_appearance_threshold, map_inference_threshold,
	                                                        depth + 1, null_type));
}
//! Merge a LIST description into 'merged': all list children fold into the single element node
static void MergeNodeArray(JSONStructureNode &merged, const JSONStructureDescription &child_desc) {
	D_ASSERT(child_desc.type == LogicalTypeId::LIST);
	auto &merged_desc = merged.GetOrCreateDescription(LogicalTypeId::LIST);
	auto &merged_child = merged_desc.GetOrCreateChild();
	for (auto &list_child : child_desc.children) {
		JSONStructure::MergeNodes(merged_child, list_child);
	}
}
//! Merge a STRUCT description into 'merged': each keyed child folds into the matching
//! (or newly created) child of the merged struct
static void MergeNodeObject(JSONStructureNode &merged, const JSONStructureDescription &child_desc) {
	D_ASSERT(child_desc.type == LogicalTypeId::STRUCT);
	auto &merged_desc = merged.GetOrCreateDescription(LogicalTypeId::STRUCT);
	for (auto &struct_child : child_desc.children) {
		const auto &struct_child_key = *struct_child.key;
		auto &merged_child = merged_desc.GetOrCreateChild(struct_child_key.c_str(), struct_child_key.length());
		JSONStructure::MergeNodes(merged_child, struct_child);
	}
}
//! Merge a scalar description into 'merged'. For VARCHAR nodes with candidate types,
//! keep the candidates only if they agree across all merged nodes, otherwise clear
//! them so the node stays VARCHAR.
static void MergeNodeVal(JSONStructureNode &merged, const JSONStructureDescription &child_desc,
                         const bool node_initialized) {
	D_ASSERT(child_desc.type != LogicalTypeId::LIST && child_desc.type != LogicalTypeId::STRUCT);
	auto &merged_desc = merged.GetOrCreateDescription(child_desc.type);
	if (merged_desc.type != LogicalTypeId::VARCHAR || !node_initialized || merged.descriptions.size() != 1) {
		return;
	}
	if (!merged.initialized) {
		// First VARCHAR seen: adopt its candidates as-is
		merged_desc.candidate_types = child_desc.candidate_types;
	} else if (merged_desc.candidate_types.empty() != child_desc.candidate_types.empty() // emptiness differs
	           || (!merged_desc.candidate_types.empty() &&
	               merged_desc.candidate_types.back() != child_desc.candidate_types.back())) { // top candidates differ
		merged_desc.candidate_types.clear(); // Not the same, default to VARCHAR
	}
	merged.initialized = true;
}
//! Fold 'node' (counts and every type description) into 'merged', recursing through
//! lists and structs and reconciling scalar candidate types.
void JSONStructure::MergeNodes(JSONStructureNode &merged, const JSONStructureNode &node) {
	merged.count += node.count;
	merged.null_count += node.null_count;
	for (const auto &child_desc : node.descriptions) {
		switch (child_desc.type) {
		case LogicalTypeId::LIST:
			MergeNodeArray(merged, child_desc);
			break;
		case LogicalTypeId::STRUCT:
			MergeNodeObject(merged, child_desc);
			break;
		default:
			MergeNodeVal(merged, child_desc, node.initialized);
			break;
		}
	}
}
static double CalculateTypeSimilarity(const LogicalType &merged, const LogicalType &type, idx_t max_depth, idx_t depth);
//! Average similarity between a MAP's value type and each field of a STRUCT.
//! 'swapped' flips the argument order of the recursive comparison (struct-vs-map
//! instead of map-vs-struct). Returns a negative value on incompatibility.
static double CalculateMapAndStructSimilarity(const LogicalType &map_type, const LogicalType &struct_type,
                                              const bool swapped, const idx_t max_depth, const idx_t depth) {
	const auto &map_value_type = MapType::ValueType(map_type);
	const auto &struct_child_types = StructType::GetChildTypes(struct_type);
	double total_similarity = 0;
	for (const auto &struct_child_type : struct_child_types) {
		const auto similarity =
		    swapped ? CalculateTypeSimilarity(struct_child_type.second, map_value_type, max_depth, depth + 1)
		            : CalculateTypeSimilarity(map_value_type, struct_child_type.second, max_depth, depth + 1);
		if (similarity < 0) {
			// Incompatible child: propagate immediately
			return similarity;
		}
		total_similarity += similarity;
	}
	return total_similarity / static_cast<double>(struct_child_types.size());
}
//! Similarity score in [0, 1] between a merged type and one of the types it was merged
//! from, used to decide between STRUCT and MAP inference. Returns a negative value when
//! the types are incompatible. Recursion stops at 'max_depth'; NULL and JSON compare as 1.
static double CalculateTypeSimilarity(const LogicalType &merged, const LogicalType &type, const idx_t max_depth,
                                      const idx_t depth) {
	if (depth >= max_depth || merged.id() == LogicalTypeId::SQLNULL || type.id() == LogicalTypeId::SQLNULL) {
		return 1;
	}
	if (merged.IsJSONType()) {
		// Incompatible types
		return -1;
	}
	if (type.IsJSONType() || merged == type) {
		return 1;
	}

	switch (merged.id()) {
	case LogicalTypeId::STRUCT: {
		if (type.id() == LogicalTypeId::MAP) {
			// This can happen for empty structs/maps ("{}"), or in rare cases where an inconsistent struct becomes
			// consistent when merged, but does not have enough children to be considered a map.
			return CalculateMapAndStructSimilarity(type, merged, true, max_depth, depth);
		} else if (type.id() != LogicalTypeId::STRUCT) {
			return -1;
		}

		// Only structs can be merged into a struct
		D_ASSERT(type.id() == LogicalTypeId::STRUCT);
		const auto &merged_child_types = StructType::GetChildTypes(merged);
		const auto &type_child_types = StructType::GetChildTypes(type);

		// Index the merged fields by name for O(1) lookup per child
		unordered_map<string, const LogicalType &> merged_child_types_map;
		for (const auto &merged_child : merged_child_types) {
			merged_child_types_map.emplace(merged_child.first, merged_child.second);
		}

		double total_similarity = 0;
		for (const auto &type_child_type : type_child_types) {
			const auto it = merged_child_types_map.find(type_child_type.first);
			if (it == merged_child_types_map.end()) {
				// Field missing from the merged struct: incompatible
				return -1;
			}
			const auto similarity = CalculateTypeSimilarity(it->second, type_child_type.second, max_depth, depth + 1);
			if (similarity < 0) {
				return similarity;
			}
			total_similarity += similarity;
		}
		// Average over ALL merged fields, so missing fields lower the score
		return total_similarity / static_cast<double>(merged_child_types.size());
	}
	case LogicalTypeId::MAP: {
		if (type.id() == LogicalTypeId::MAP) {
			return CalculateTypeSimilarity(MapType::ValueType(merged), MapType::ValueType(type), max_depth, depth + 1);
		}

		// Only maps and structs can be merged into a map
		if (type.id() != LogicalTypeId::STRUCT) {
			return -1;
		}
		return CalculateMapAndStructSimilarity(merged, type, false, max_depth, depth);
	}
	case LogicalTypeId::LIST: {
		// Only lists can be merged into a list
		D_ASSERT(type.id() == LogicalTypeId::LIST);
		const auto &merged_child_type = ListType::GetChildType(merged);
		const auto &type_child_type = ListType::GetChildType(type);
		return CalculateTypeSimilarity(merged_child_type, type_child_type, max_depth, depth + 1);
	}
	default:
		// This is only reachable if type has been inferred using candidate_types, but candidate_types were not
		// consistent among all map values
		return 1;
	}
}
//! Returns true if the struct's fields appear too rarely (average occurrence across
//! non-NULL samples below "field_appearance_threshold"), signalling that the object
//! should be treated as a MAP rather than a STRUCT.
static bool IsStructureInconsistent(const JSONStructureDescription &desc, const idx_t sample_count,
                                    const idx_t null_count, const double field_appearance_threshold) {
	D_ASSERT(sample_count > null_count);
	const auto non_null_count = static_cast<double>(sample_count - null_count);
	double occurrence_sum = 0;
	for (const auto &child : desc.children) {
		occurrence_sum += static_cast<double>(child.count) / non_null_count;
	}
	const auto average_occurrence = occurrence_sum / static_cast<double>(desc.children.size());
	return average_occurrence < field_appearance_threshold;
}
//! Merges all children of the node's single description into one structure node and
//! converts that merged structure to a logical type (the candidate MAP value type).
static LogicalType GetMergedType(ClientContext &context, const JSONStructureNode &node, const idx_t max_depth,
                                 const double field_appearance_threshold, const idx_t map_inference_threshold,
                                 const idx_t depth, const LogicalType &null_type) {
	D_ASSERT(node.descriptions.size() == 1);
	JSONStructureNode merged;
	for (const auto &child : node.descriptions[0].children) {
		JSONStructure::MergeNodes(merged, child);
	}
	return JSONStructure::StructureToType(context, merged, max_depth, field_appearance_threshold,
	                                      map_inference_threshold, depth + 1, null_type);
}
//! Converts an object (STRUCT) structure node to a logical type. Depending on how
//! consistent the object's fields are (and on map_inference_threshold), this yields
//! either a STRUCT, a MAP, or JSON.
static LogicalType StructureToTypeObject(ClientContext &context, const JSONStructureNode &node, const idx_t max_depth,
                                         const double field_appearance_threshold, const idx_t map_inference_threshold,
                                         const idx_t depth, const LogicalType &null_type) {
	D_ASSERT(node.descriptions.size() == 1 && node.descriptions[0].type == LogicalTypeId::STRUCT);
	auto &desc = node.descriptions[0];
	if (desc.children.empty()) {
		// map_inference_threshold == DConstants::INVALID_INDEX means map inference is disabled
		if (map_inference_threshold != DConstants::INVALID_INDEX) {
			// Empty struct - let's do MAP of JSON instead
			return LogicalType::MAP(LogicalType::VARCHAR, null_type);
		} else {
			return LogicalType::JSON();
		}
	}
	// If it's an inconsistent object we also just do MAP with the best-possible, recursively-merged value type
	if (map_inference_threshold != DConstants::INVALID_INDEX &&
	    IsStructureInconsistent(desc, node.count, node.null_count, field_appearance_threshold)) {
		return LogicalType::MAP(LogicalType::VARCHAR,
		                        GetMergedType(context, node, max_depth, field_appearance_threshold,
		                                      map_inference_threshold, depth + 1, null_type));
	}
	// We have a consistent object
	child_list_t<LogicalType> child_types;
	child_types.reserve(desc.children.size());
	for (auto &child : desc.children) {
		D_ASSERT(child.key);
		child_types.emplace_back(*child.key,
		                         JSONStructure::StructureToType(context, child, max_depth, field_appearance_threshold,
		                                                        map_inference_threshold, depth + 1, null_type));
	}
	// If we have many children and all children have similar-enough types we infer map
	// (when inference is disabled the threshold is INVALID_INDEX, so this comparison is never true)
	if (desc.children.size() >= map_inference_threshold) {
		// First compute the candidate value type with SQLNULL as the null type (cheap comparison baseline)
		LogicalType map_value_type = GetMergedType(context, node, max_depth, field_appearance_threshold,
		                                           map_inference_threshold, depth + 1, LogicalTypeId::SQLNULL);
		double total_similarity = 0;
		for (const auto &child_type : child_types) {
			const auto similarity = CalculateTypeSimilarity(map_value_type, child_type.second, max_depth, depth + 1);
			if (similarity < 0) {
				// Incompatible child: poison the average so we fall through to STRUCT
				total_similarity = similarity;
				break;
			}
			total_similarity += similarity;
		}
		const auto avg_similarity = total_similarity / static_cast<double>(child_types.size());
		if (avg_similarity >= 0.8) {
			// Recompute with the caller's null type only when it differs (avoids redundant work)
			if (null_type != LogicalTypeId::SQLNULL) {
				map_value_type = GetMergedType(context, node, max_depth, field_appearance_threshold,
				                               map_inference_threshold, depth + 1, null_type);
			}
			return LogicalType::MAP(LogicalType::VARCHAR, map_value_type);
		}
	}
	return LogicalType::STRUCT(child_types);
}
//! Converts a VARCHAR structure node to its most refined candidate type
//! (e.g. a string that always parsed as a date), or plain VARCHAR if no
//! candidate types were detected.
static LogicalType StructureToTypeString(const JSONStructureNode &node) {
	D_ASSERT(node.descriptions.size() == 1 && node.descriptions[0].type == LogicalTypeId::VARCHAR);
	const auto &desc = node.descriptions[0];
	return desc.candidate_types.empty() ? LogicalType(LogicalTypeId::VARCHAR) : desc.candidate_types.back();
}
//! Converts a detected JSON structure node into a DuckDB logical type, dispatching on
//! the node's single detected type. Falls back to JSON when the depth limit is reached
//! or when the node saw inconsistent types, and to "null_type" for all-NULL nodes.
LogicalType JSONStructure::StructureToType(ClientContext &context, const JSONStructureNode &node, const idx_t max_depth,
                                           const double field_appearance_threshold, const idx_t map_inference_threshold,
                                           const idx_t depth, const LogicalType &null_type) {
	if (depth >= max_depth) {
		return LogicalType::JSON();
	}
	if (node.descriptions.empty()) {
		// No values seen (only NULLs): use the caller-provided null type
		return null_type;
	}
	if (node.descriptions.size() != 1) { // Inconsistent types, so we resort to JSON
		return LogicalType::JSON();
	}
	auto &desc = node.descriptions[0];
	D_ASSERT(desc.type != LogicalTypeId::INVALID);
	switch (desc.type) {
	case LogicalTypeId::LIST:
		return StructureToTypeArray(context, node, max_depth, field_appearance_threshold, map_inference_threshold,
		                            depth, null_type);
	case LogicalTypeId::STRUCT:
		return StructureToTypeObject(context, node, max_depth, field_appearance_threshold, map_inference_threshold,
		                             depth, null_type);
	case LogicalTypeId::VARCHAR:
		return StructureToTypeString(node);
	case LogicalTypeId::UBIGINT:
		return LogicalTypeId::BIGINT; // We prefer not to return UBIGINT in our type auto-detection
	case LogicalTypeId::SQLNULL:
		return null_type;
	default:
		return desc.type;
	}
}
} // namespace duckdb

View File

@@ -0,0 +1,414 @@
#include "json_common.hpp"
#include "json_functions.hpp"
#include "duckdb/function/table_function.hpp"
namespace duckdb {
//! Which table-in-out flavor: json_each (iterates one level) or json_tree (recurses into nested containers)
enum class JSONTableInOutType { EACH, TREE };
//! Bind for json_each/json_tree: the result schema is a fixed set of columns
//! (SQLite-compatible: key, value, type, atom, id, parent, fullkey, path).
static unique_ptr<FunctionData> JSONTableInOutBind(ClientContext &, TableFunctionBindInput &input,
                                                   vector<LogicalType> &return_types, vector<string> &names) {
	static constexpr idx_t COLUMN_COUNT = 8;
	const char *column_names[COLUMN_COUNT] = {"key", "value", "type", "atom", "id", "parent", "fullkey", "path"};
	const LogicalType column_types[COLUMN_COUNT] = {LogicalType::VARCHAR, LogicalType::JSON(), LogicalType::VARCHAR,
	                                                LogicalType::JSON(),  LogicalType::UBIGINT, LogicalType::UBIGINT,
	                                                LogicalType::VARCHAR, LogicalType::VARCHAR};
	// Add all default columns
	names.reserve(COLUMN_COUNT);
	return_types.reserve(COLUMN_COUNT);
	for (idx_t col_idx = 0; col_idx < COLUMN_COUNT; col_idx++) {
		names.emplace_back(column_names[col_idx]);
		return_types.emplace_back(column_types[col_idx]);
	}
	return nullptr;
}
//! Global state for json_each/json_tree. Records, for each output column that the query
//! actually projects, its position in the output chunk (invalid optional_idx = not projected).
struct JSONTableInOutGlobalState : GlobalTableFunctionState {
	JSONTableInOutGlobalState() {
	}
	//! Regular columns (positions follow the bind order: key, value, type, atom, id, parent, fullkey, path)
	optional_idx key_column_index;
	optional_idx value_column_index;
	optional_idx type_column_index;
	optional_idx atom_column_index;
	optional_idx id_column_index;
	optional_idx parent_column_index;
	optional_idx fullkey_column_index;
	optional_idx path_column_index;
	//! Virtual columns ("json", "root", the empty column, and rowid)
	optional_idx json_column_index;
	optional_idx root_column_index;
	// NOTE(review): "idex" looks like a typo for "index"; renaming would touch all uses of this field
	optional_idx empty_column_idex;
	optional_idx rowid_column_index;
	//! Offsets of the "json"/"root" virtual columns relative to VIRTUAL_COLUMN_START
	static constexpr idx_t JSON_COLUMN_OFFSET = 0;
	static constexpr idx_t ROOT_COLUMN_OFFSET = 1;
};
//! Maps each projected column index to its slot in the global state so the in-out
//! function only materializes the columns the query actually needs.
static unique_ptr<GlobalTableFunctionState> JSONTableInOutInitGlobal(ClientContext &, TableFunctionInitInput &input) {
	auto result = make_uniq<JSONTableInOutGlobalState>();
	for (idx_t i = 0; i < input.column_indexes.size(); i++) {
		const auto &col_idx = input.column_indexes[i];
		if (!col_idx.IsVirtualColumn()) {
			// Primary index matches the column order established in JSONTableInOutBind
			switch (col_idx.GetPrimaryIndex()) {
			case 0:
				result->key_column_index = i;
				break;
			case 1:
				result->value_column_index = i;
				break;
			case 2:
				result->type_column_index = i;
				break;
			case 3:
				result->atom_column_index = i;
				break;
			case 4:
				result->id_column_index = i;
				break;
			case 5:
				result->parent_column_index = i;
				break;
			case 6:
				result->fullkey_column_index = i;
				break;
			case 7:
				result->path_column_index = i;
				break;
			default:
				throw NotImplementedException("Column %llu for json_each/json_tree", col_idx.GetPrimaryIndex());
			}
		} else {
			// Virtual columns: "json"/"root" (declared in GetJSONTableInOutVirtualColumns), empty, and rowid
			if (col_idx.GetPrimaryIndex() == VIRTUAL_COLUMN_START + JSONTableInOutGlobalState::JSON_COLUMN_OFFSET) {
				result->json_column_index = i;
			} else if (col_idx.GetPrimaryIndex() ==
			           VIRTUAL_COLUMN_START + JSONTableInOutGlobalState::ROOT_COLUMN_OFFSET) {
				result->root_column_index = i;
			} else if (col_idx.IsEmptyColumn()) {
				result->empty_column_idex = i;
			} else if (col_idx.IsRowIdColumn()) {
				result->rowid_column_index = i;
			} else {
				throw NotImplementedException("Virtual column %llu for json_each/json_tree", col_idx.GetPrimaryIndex());
			}
		}
	}
	return std::move(result);
}
//! Stack entry for the iterative (non-recursive) traversal of a JSON container.
struct JSONTableInOutRecursionNode {
	JSONTableInOutRecursionNode(string path_p, yyjson_val *parent_val_p)
	    : path(std::move(path_p)), parent_val(parent_val_p), child_index(0) {
	}
	//! Path component of this container relative to its parent (e.g. ".key" or "[3]")
	string path;
	//! The array/object whose children are being iterated
	yyjson_val *parent_val;
	//! Index of the next child to emit (saves progress across output chunks)
	idx_t child_index;
};
struct JSONTableInOutLocalState : LocalTableFunctionState {
explicit JSONTableInOutLocalState(ClientContext &context)
: json_allocator(BufferAllocator::Get(context)), alc(json_allocator.GetYYAlc()), len(DConstants::INVALID_INDEX),
doc(nullptr), initialized(false), total_count(0) {
}
string GetPath() const {
auto result = path;
for (const auto &ri : recursion_nodes) {
result += ri.path;
}
return result;
}
void AddRecursionNode(yyjson_val *val, optional_ptr<yyjson_val> vkey, const optional_idx arr_index) {
string str;
if (vkey) {
str = "." + string(unsafe_yyjson_get_str(vkey.get()), unsafe_yyjson_get_len(vkey.get()));
} else if (arr_index.IsValid()) {
str = "[" + to_string(arr_index.GetIndex()) + "]";
}
recursion_nodes.emplace_back(str, val);
}
JSONAllocator json_allocator;
yyjson_alc *alc;
string path;
idx_t len;
yyjson_doc *doc;
bool initialized;
idx_t total_count;
vector<JSONTableInOutRecursionNode> recursion_nodes;
};
//! Creates the per-thread local state for json_each/json_tree.
static unique_ptr<LocalTableFunctionState> JSONTableInOutInitLocal(ExecutionContext &context, TableFunctionInitInput &,
                                                                   GlobalTableFunctionState *) {
	auto result = make_uniq<JSONTableInOutLocalState>(context.client);
	return std::move(result);
}
//! Wraps one (possibly unprojected) output column. When the column is not projected,
//! "enabled" is false and the members alias output column 0 merely so the references
//! can be bound - they are never written to in that case.
template <class T>
struct JSONTableInOutResultVector {
	explicit JSONTableInOutResultVector(DataChunk &output, const optional_idx &output_column_index)
	    : enabled(output_column_index.IsValid()), vector(output.data[enabled ? output_column_index.GetIndex() : 0]),
	      data(enabled ? FlatVector::GetData<T>(vector) : nullptr), validity(FlatVector::Validity(vector)) {
	}
	const bool enabled;
	Vector &vector;
	T *data;
	ValidityMask &validity;
};
//! Collects output rows for json_each/json_tree, writing only the projected columns.
struct JSONTableInOutResult {
	explicit JSONTableInOutResult(const JSONTableInOutGlobalState &gstate, DataChunk &output)
	    : count(0), key(output, gstate.key_column_index), value(output, gstate.value_column_index),
	      type(output, gstate.type_column_index), atom(output, gstate.atom_column_index),
	      id(output, gstate.id_column_index), parent(output, gstate.parent_column_index),
	      fullkey(output, gstate.fullkey_column_index), path(output, gstate.path_column_index),
	      rowid(output, gstate.rowid_column_index) {
	}
	//! Emits one output row for value "val" ("vkey" is its object key, or null for
	//! array elements and the root). Only projected columns are written.
	template <JSONTableInOutType TYPE>
	void AddRow(JSONTableInOutLocalState &lstate, optional_ptr<yyjson_val> vkey, yyjson_val *val) {
		const auto &recursion_nodes = lstate.recursion_nodes;
		// Whether "val" is an element of the array currently on top of the traversal stack
		const auto arr_el = !recursion_nodes.empty() && unsafe_yyjson_is_arr(recursion_nodes.back().parent_val);
		if (key.enabled) {
			if (vkey) { // Object field
				// Non-owning string_t pointing into the parsed document (owned by lstate's allocator)
				key.data[count] = string_t(unsafe_yyjson_get_str(vkey.get()), unsafe_yyjson_get_len(vkey.get()));
			} else if (arr_el) { // Array element
				key.data[count] = StringVector::AddString(key.vector, to_string(recursion_nodes.back().child_index));
			} else { // Other
				key.validity.SetInvalid(count);
			}
		}
		if (value.enabled) {
			// Full JSON serialization of the value
			value.data[count] = JSONCommon::WriteVal(val, lstate.alc);
		}
		if (type.enabled) {
			type.data[count] = JSONCommon::ValTypeToStringT(val);
		}
		if (atom.enabled) {
			// Scalar representation (NULL for containers - see JSONCommon::JSONValue)
			atom.data[count] = JSONCommon::JSONValue(val, lstate.alc, atom.vector, atom.validity, count);
		}
		if (id.enabled) {
			// Value's offset from the document root serves as a per-document unique id
			id.data[count] = NumericCast<idx_t>(val - lstate.doc->root);
		}
		if (parent.enabled) {
			if (TYPE == JSONTableInOutType::EACH || recursion_nodes.empty()) {
				// json_each rows and the root row have no parent
				parent.validity.SetInvalid(count);
			} else {
				parent.data[count] = NumericCast<uint64_t>(recursion_nodes.back().parent_val - lstate.doc->root);
			}
		}
		const auto path_str = lstate.GetPath();
		if (fullkey.enabled) {
			if (vkey) { // Object field
				const auto vkey_str = string(unsafe_yyjson_get_str(vkey.get()), unsafe_yyjson_get_len(vkey.get()));
				fullkey.data[count] = StringVector::AddString(fullkey.vector, path_str + "." + vkey_str);
			} else if (arr_el) { // Array element
				const auto arr_path = "[" + to_string(recursion_nodes.back().child_index) + "]";
				fullkey.data[count] = StringVector::AddString(fullkey.vector, path_str + arr_path);
			} else { // Other
				fullkey.data[count] = StringVector::AddString(fullkey.vector, path_str);
			}
		}
		if (path.enabled) {
			// Path of the containing element (without the final key/index component)
			path.data[count] = StringVector::AddString(path.vector, path_str);
		}
		if (rowid.enabled) {
			rowid.data[count] = NumericCast<int64_t>(lstate.total_count++);
		}
		count++;
	}
	//! Number of rows emitted into the current output chunk
	idx_t count;
	JSONTableInOutResultVector<string_t> key;
	JSONTableInOutResultVector<string_t> value;
	JSONTableInOutResultVector<string_t> type;
	JSONTableInOutResultVector<string_t> atom;
	JSONTableInOutResultVector<uint64_t> id;
	JSONTableInOutResultVector<uint64_t> parent;
	JSONTableInOutResultVector<string_t> fullkey;
	JSONTableInOutResultVector<string_t> path;
	JSONTableInOutResultVector<int64_t> rowid;
};
//! Parses the input document and optional path argument for the current input row,
//! emits the root row if applicable, and seeds the traversal stack for containers.
template <JSONTableInOutType TYPE>
static void InitializeLocalState(JSONTableInOutLocalState &lstate, DataChunk &input, JSONTableInOutResult &result) {
	lstate.total_count = 0;
	// Parse path, default to root if not given
	Value path_value("$");
	if (input.data.size() > 1) {
		auto &path_vector = input.data[1];
		if (ConstantVector::IsNull(path_vector)) {
			// NULL path: no output rows for this input
			return;
		}
		path_value = ConstantVector::GetData<string_t>(path_vector)[0];
	}
	if (JSONReadFunctionData::CheckPath(path_value, lstate.path, lstate.len) == JSONCommon::JSONPathType::WILDCARD) {
		throw BinderException("Wildcard JSON path not supported in json_each/json_tree");
	}
	if (lstate.path.c_str()[0] != '$') {
		throw BinderException("JSON path must start with '$' for json_each/json_tree");
	}
	// Parse document and get the value at the supplied path
	const auto &input_vector = input.data[0];
	if (ConstantVector::IsNull(input_vector)) {
		return;
	}
	const auto &input_data = FlatVector::GetData<string_t>(input_vector)[0];
	lstate.doc = JSONCommon::ReadDocument(input_data, JSONCommon::READ_FLAG, lstate.alc);
	const auto root = JSONCommon::GetUnsafe(lstate.doc->root, lstate.path.c_str(), lstate.len);
	if (!root) {
		// Path does not exist in this document: no output rows
		return;
	}
	const auto is_container = unsafe_yyjson_is_arr(root) || unsafe_yyjson_is_obj(root);
	// json_tree always emits the root row; json_each only does so for scalar roots
	if (!is_container || TYPE == JSONTableInOutType::TREE) {
		result.AddRow<TYPE>(lstate, nullptr, root);
	}
	if (is_container) {
		// Seed the traversal stack so the main loop iterates the root's children
		lstate.AddRecursionNode(root, nullptr, optional_idx());
	}
}
//! Handles one child during container iteration. Returns true to break out of the
//! enclosing foreach (either to descend depth-first into a nested container, or
//! because the output vector is full); false to continue with the next child.
//! "child_index" is the saved progress marker; "idx" is the foreach's position.
template <JSONTableInOutType TYPE>
static bool JSONTableInOutHandleValue(JSONTableInOutLocalState &lstate, JSONTableInOutResult &result,
                                      idx_t &child_index, size_t &idx, yyjson_val *child_key, yyjson_val *child_val) {
	if (idx < child_index) {
		return false; // Continue: Get back to where we left off
	}
	result.AddRow<TYPE>(lstate, child_key, child_val);
	child_index++; // We finished processing the array element
	if (TYPE == JSONTableInOutType::TREE && (unsafe_yyjson_is_arr(child_val) || unsafe_yyjson_is_obj(child_val))) {
		lstate.AddRecursionNode(child_val, child_key, idx);
		return true; // Break: We added a recursion node, go depth-first
	}
	if (result.count == STANDARD_VECTOR_SIZE) {
		return true; // Break: Vector is full
	}
	return false; // Continue: Next element
}
//! Main in-out function for json_each/json_tree. Parses the input row on first call,
//! then drains the traversal stack into output chunks across repeated invocations.
template <JSONTableInOutType TYPE>
static OperatorResultType JSONTableInOutFunction(ExecutionContext &, TableFunctionInput &data_p, DataChunk &input,
                                                 DataChunk &output) {
	auto &gstate = data_p.global_state->Cast<JSONTableInOutGlobalState>();
	auto &lstate = data_p.local_state->Cast<JSONTableInOutLocalState>();
	JSONTableInOutResult result(gstate, output);
	if (!lstate.initialized) {
		InitializeLocalState<TYPE>(lstate, input, result);
		lstate.initialized = true;
	}
	// Traverse the JSON (keeping a stack to avoid recursion and save progress across calls)
	auto &recursion_nodes = lstate.recursion_nodes;
	while (!lstate.recursion_nodes.empty() && result.count != STANDARD_VECTOR_SIZE) {
		auto &parent_val = recursion_nodes.back().parent_val;
		auto &child_index = recursion_nodes.back().child_index;
		// idx/max are assigned by the yyjson foreach macros below before any use
		size_t idx, max;
		yyjson_val *child_key, *child_val;
		switch (yyjson_get_tag(parent_val)) {
		case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
			yyjson_arr_foreach(parent_val, idx, max, child_val) {
				if (JSONTableInOutHandleValue<TYPE>(lstate, result, child_index, idx, nullptr, child_val)) {
					break;
				}
			}
			break;
		case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
			yyjson_obj_foreach(parent_val, idx, max, child_key, child_val) {
				if (JSONTableInOutHandleValue<TYPE>(lstate, result, child_index, idx, child_key, child_val)) {
					break;
				}
			}
			break;
		default:
			throw InternalException("Non-object/array JSON added to recursion in json_each/json_tree");
		}
		// idx == max means the foreach ran to completion (no early break)
		if (idx == max) {
			lstate.recursion_nodes.pop_back(); // Array/object is done, remove
		}
	}
	output.SetCardinality(result.count);
	// Set constant virtual columns ("json", "root", and "empty")
	if (gstate.json_column_index.IsValid()) {
		auto &json_vector = output.data[gstate.json_column_index.GetIndex()];
		json_vector.Reference(input.data[0]);
	}
	if (gstate.root_column_index.IsValid()) {
		auto &root_vector = output.data[gstate.root_column_index.GetIndex()];
		root_vector.SetVectorType(VectorType::CONSTANT_VECTOR);
		// Points into lstate.path, which outlives the chunk
		FlatVector::GetData<string_t>(root_vector)[0] = string_t(lstate.path.c_str(), lstate.len);
	}
	if (gstate.empty_column_idex.IsValid()) {
		auto &empty_vector = output.data[gstate.empty_column_idex.GetIndex()];
		empty_vector.SetVectorType(VectorType::CONSTANT_VECTOR);
		ConstantVector::SetNull(empty_vector, true);
	}
	if (output.size() == 0) {
		// Traversal finished for this input row: reset state and ask for the next row
		D_ASSERT(recursion_nodes.empty());
		lstate.json_allocator.Reset();
		lstate.initialized = false;
		return OperatorResultType::NEED_MORE_INPUT;
	}
	return OperatorResultType::HAVE_MORE_OUTPUT;
}
//! Declares the virtual columns of json_each/json_tree: "json" (the input document),
//! "root" (the path argument), plus the standard empty and rowid columns.
virtual_column_map_t GetJSONTableInOutVirtualColumns(ClientContext &, optional_ptr<FunctionData>) {
	virtual_column_map_t result;
	result.emplace(VIRTUAL_COLUMN_START + JSONTableInOutGlobalState::JSON_COLUMN_OFFSET,
	               TableColumn("json", LogicalType::JSON()));
	result.emplace(VIRTUAL_COLUMN_START + JSONTableInOutGlobalState::ROOT_COLUMN_OFFSET,
	               TableColumn("root", LogicalType::VARCHAR));
	result.emplace(COLUMN_IDENTIFIER_EMPTY, TableColumn("", LogicalType::BOOLEAN));
	result.emplace(COLUMN_IDENTIFIER_ROW_ID, TableColumn("rowid", LogicalType::BIGINT));
	return result;
}
//! Builds one json_each/json_tree overload for the given input type,
//! optionally with the second (path) parameter.
template <JSONTableInOutType TYPE>
TableFunction GetJSONTableInOutFunction(const LogicalType &input_type, const bool &has_path_param) {
	vector<LogicalType> arguments;
	arguments.push_back(input_type);
	if (has_path_param) {
		arguments.push_back(LogicalType::VARCHAR);
	}
	TableFunction function(arguments, nullptr, JSONTableInOutBind, JSONTableInOutInitGlobal, JSONTableInOutInitLocal);
	function.in_out_function = JSONTableInOutFunction<TYPE>;
	function.get_virtual_columns = GetJSONTableInOutVirtualColumns;
	function.projection_pushdown = true;
	return function;
}
//! json_each overloads: VARCHAR and JSON input, each with and without a path argument.
TableFunctionSet JSONFunctions::GetJSONEachFunction() {
	TableFunctionSet set("json_each");
	for (const auto &input_type : {LogicalType::VARCHAR, LogicalType::JSON()}) {
		set.AddFunction(GetJSONTableInOutFunction<JSONTableInOutType::EACH>(input_type, false));
		set.AddFunction(GetJSONTableInOutFunction<JSONTableInOutType::EACH>(input_type, true));
	}
	return set;
}
//! json_tree overloads: VARCHAR and JSON input, each with and without a path argument.
TableFunctionSet JSONFunctions::GetJSONTreeFunction() {
	TableFunctionSet set("json_tree");
	for (const auto &input_type : {LogicalType::VARCHAR, LogicalType::JSON()}) {
		set.AddFunction(GetJSONTableInOutFunction<JSONTableInOutType::TREE>(input_type, false));
		set.AddFunction(GetJSONTableInOutFunction<JSONTableInOutType::TREE>(input_type, true));
	}
	return set;
}
} // namespace duckdb

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,38 @@
#include "json_executors.hpp"
namespace duckdb {
//! Returns the JSON type name of a value ("OBJECT", "ARRAY", etc.) as a string_t.
static inline string_t GetType(yyjson_val *val, yyjson_alc *, Vector &, ValidityMask &mask, idx_t idx) {
	return JSONCommon::ValTypeToStringT(val);
}
//! json_type(json): type of the top-level value
static void UnaryTypeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::UnaryExecute<string_t>(args, state, result, GetType);
}
//! json_type(json, path): type of the value at a single path
static void BinaryTypeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::BinaryExecute<string_t>(args, state, result, GetType);
}
//! json_type(json, [paths]): types of the values at a list of paths
static void ManyTypeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::ExecuteMany<string_t>(args, state, result, GetType);
}
//! Registers the unary, binary (path), and many-path json_type overloads for one input type.
static void GetTypeFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	set.AddFunction(ScalarFunction({input_type}, LogicalType::VARCHAR, UnaryTypeFunction, nullptr, nullptr, nullptr,
	                               JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::VARCHAR, BinaryTypeFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
	                               LogicalType::LIST(LogicalType::VARCHAR), ManyTypeFunction,
	                               JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}
//! json_type overloads for both VARCHAR and JSON inputs.
ScalarFunctionSet JSONFunctions::GetTypeFunction() {
	ScalarFunctionSet set("json_type");
	for (const auto &input_type : {LogicalType::VARCHAR, LogicalType::JSON()}) {
		GetTypeFunctionsInternal(set, input_type);
	}
	return set;
}
} // namespace duckdb

View File

@@ -0,0 +1,27 @@
#include "json_executors.hpp"
namespace duckdb {
//! json_valid(input) -> BOOLEAN: true iff the input parses as valid JSON.
static void ValidFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = lstate.json_allocator->GetYYAlc();
	auto &inputs = args.data[0];
	UnaryExecutor::Execute<string_t, bool>(inputs, result, args.size(), [&](string_t input) {
		// ReadDocumentUnsafe returns nullptr on parse failure; make the pointer-to-bool
		// conversion explicit instead of relying on the deduced lambda return type
		return JSONCommon::ReadDocumentUnsafe(input, JSONCommon::READ_FLAG, alc) != nullptr;
	});
}
//! Registers the json_valid overload for one input type.
static void GetValidFunctionInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	set.AddFunction(ScalarFunction("json_valid", {input_type}, LogicalType::BOOLEAN, ValidFunction, nullptr, nullptr,
	                               nullptr, JSONFunctionLocalState::Init));
}
//! json_valid overloads for both VARCHAR and JSON inputs.
ScalarFunctionSet JSONFunctions::GetValidFunction() {
	ScalarFunctionSet set("json_valid");
	for (const auto &input_type : {LogicalType::VARCHAR, LogicalType::JSON()}) {
		GetValidFunctionInternal(set, input_type);
	}
	return set;
}
} // namespace duckdb

View File

@@ -0,0 +1,31 @@
#include "json_executors.hpp"
namespace duckdb {
//! json_value(json, path): extract the value at a single path (NULL for non-scalars)
static void ValueFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::BinaryExecute<string_t>(args, state, result, JSONCommon::JSONValue);
}
//! json_value(json, [paths]): extract the values at a list of paths
static void ValueManyFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::ExecuteMany<string_t>(args, state, result, JSONCommon::JSONValue);
}
//! Registers the BIGINT-path, VARCHAR-path, and many-path json_value overloads for one input type.
static void GetValueFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	set.AddFunction(ScalarFunction({input_type, LogicalType::BIGINT}, LogicalType::VARCHAR, ValueFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::VARCHAR, ValueFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
	                               LogicalType::LIST(LogicalType::VARCHAR), ValueManyFunction,
	                               JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}
// The value function is just like the extract function but returns NULL if the JSON is not a scalar value
ScalarFunctionSet JSONFunctions::GetValueFunction() {
	ScalarFunctionSet set("json_value");
	for (const auto &input_type : {LogicalType::VARCHAR, LogicalType::JSON()}) {
		GetValueFunctionsInternal(set, input_type);
	}
	return set;
}
} // namespace duckdb

View File

@@ -0,0 +1,288 @@
#include "duckdb/common/helper.hpp"
#include "duckdb/common/multi_file/multi_file_reader.hpp"
#include "json_functions.hpp"
#include "json_scan.hpp"
#include "json_structure.hpp"
#include "json_transform.hpp"
#include "json_multi_file_info.hpp"
#include "duckdb/parallel/task_executor.hpp"
namespace duckdb {
//! Recursively removes duplicate (case-insensitive) struct keys from an auto-detected
//! type. Throws unless ignore_errors is set, in which case later duplicates are dropped.
static inline LogicalType RemoveDuplicateStructKeys(const LogicalType &type, const bool ignore_errors) {
	switch (type.id()) {
	case LogicalTypeId::STRUCT: {
		case_insensitive_set_t seen_names;
		child_list_t<LogicalType> deduplicated_children;
		for (auto &child : StructType::GetChildTypes(type)) {
			if (!seen_names.insert(child.first).second) {
				// Duplicate key: drop it when ignoring errors, otherwise fail
				if (!ignore_errors) {
					throw NotImplementedException(
					    "Duplicate name \"%s\" in struct auto-detected in JSON, try ignore_errors=true", child.first);
				}
				continue;
			}
			deduplicated_children.emplace_back(child.first, RemoveDuplicateStructKeys(child.second, ignore_errors));
		}
		return LogicalType::STRUCT(deduplicated_children);
	}
	case LogicalTypeId::MAP:
		return LogicalType::MAP(RemoveDuplicateStructKeys(MapType::KeyType(type), ignore_errors),
		                        RemoveDuplicateStructKeys(MapType::ValueType(type), ignore_errors));
	case LogicalTypeId::LIST:
		return LogicalType::LIST(RemoveDuplicateStructKeys(ListType::GetChildType(type), ignore_errors));
	default:
		return type;
	}
}
//! Shared state for parallel JSON schema auto-detection; the atomic counters are
//! updated concurrently by the JSONSchemaTask workers.
struct AutoDetectState {
	AutoDetectState(ClientContext &context_p, MultiFileBindData &bind_data_p, const vector<OpenFileInfo> &files,
	                MutableDateFormatMap &date_format_map)
	    : context(context_p), bind_data(bind_data_p), files(files), date_format_map(date_format_map), files_scanned(0),
	      tuples_scanned(0), bytes_scanned(0), total_file_size(0) {
	}
	ClientContext &context;
	MultiFileBindData &bind_data;
	//! All files to sample from
	const vector<OpenFileInfo> &files;
	//! Date/timestamp format candidates refined during sampling
	MutableDateFormatMap &date_format_map;
	//! Aggregate sampling statistics (used for cardinality/thread estimates afterwards)
	atomic<idx_t> files_scanned;
	atomic<idx_t> tuples_scanned;
	atomic<idx_t> bytes_scanned;
	atomic<idx_t> total_file_size;
};
//! Task that samples a contiguous range of files and accumulates their detected
//! JSON structure into a per-task node (merged with the others after all tasks finish).
class JSONSchemaTask : public BaseExecutorTask {
public:
	JSONSchemaTask(TaskExecutor &executor, AutoDetectState &auto_detect_state, JSONStructureNode &node_p,
	               const idx_t file_idx_start_p, const idx_t file_idx_end_p)
	    : BaseExecutorTask(executor), auto_detect_state(auto_detect_state), node(node_p),
	      file_idx_start(file_idx_start_p), file_idx_end(file_idx_end_p),
	      allocator(BufferAllocator::Get(auto_detect_state.context)), string_vector(LogicalType::VARCHAR) {
	}
	//! Samples up to "remaining" tuples from one file, extracting and refining the JSON
	//! structure into "node". Also creates the file's union reader as a side effect.
	//! Returns the number of sample tuples still remaining.
	static idx_t ExecuteInternal(AutoDetectState &auto_detect_state, JSONStructureNode &node, const idx_t file_idx,
	                             ArenaAllocator &allocator, Vector &string_vector, idx_t remaining) {
		auto &context = auto_detect_state.context;
		auto &bind_data = auto_detect_state.bind_data;
		auto &files = auto_detect_state.files;
		auto &json_data = bind_data.bind_data->Cast<JSONScanData>();
		auto json_reader = make_shared_ptr<JSONReader>(context, json_data.options, files[file_idx].path);
		if (bind_data.union_readers[file_idx]) {
			throw InternalException("Union data already set");
		}
		auto &reader = *json_reader;
		// Store the reader so the actual scan can reuse it later
		auto union_data = make_uniq<BaseUnionData>(files[file_idx].path);
		union_data->reader = std::move(json_reader);
		bind_data.union_readers[file_idx] = std::move(union_data);
		auto &global_allocator = Allocator::Get(context);
		idx_t buffer_capacity = json_data.options.maximum_object_size * 2;
		JSONReaderScanState scan_state(context, global_allocator, buffer_capacity);
		auto &options = json_data.options;
		// Read and detect schema
		idx_t total_tuple_count = 0;
		idx_t total_read_size = 0;
		reader.Initialize(global_allocator, buffer_capacity);
		reader.InitializeScan(scan_state, JSONFileReadType::SCAN_ENTIRE_FILE);
		auto file_size = reader.GetFileHandle().GetHandle().GetFileSize();
		while (remaining != 0) {
			allocator.Reset();
			auto buffer_offset_before = scan_state.buffer_offset;
			auto read_count = reader.Scan(scan_state);
			if (read_count == 0) {
				// End of file
				break;
			}
			total_read_size += scan_state.buffer_offset - buffer_offset_before;
			total_tuple_count += read_count;
			// Only sample up to "remaining" of the values just read
			const auto next = MinValue<idx_t>(read_count, remaining);
			for (idx_t i = 0; i < next; i++) {
				const auto &val = scan_state.values[i];
				if (val) {
					JSONStructure::ExtractStructure(val, node, true);
				}
			}
			if (!node.ContainsVarchar()) { // Can't refine non-VARCHAR types
				// NOTE(review): "remaining" is not decremented on this path, so files without
				// VARCHAR values are sampled to EOF rather than stopping at sample_size - confirm intended
				continue;
			}
			node.InitializeCandidateTypes(options.max_depth, options.convert_strings_to_integers);
			node.RefineCandidateTypes(scan_state.values, next, string_vector, allocator,
			                          auto_detect_state.date_format_map);
			remaining -= next;
		}
		// Publish per-file statistics for cardinality/thread estimation
		auto_detect_state.total_file_size += file_size;
		auto_detect_state.bytes_scanned += total_read_size;
		auto_detect_state.tuples_scanned += total_tuple_count;
		++auto_detect_state.files_scanned;
		return remaining;
	}
	void ExecuteTask() override {
		auto &json_data = auto_detect_state.bind_data.bind_data->Cast<JSONScanData>();
		auto &options = json_data.options;
		// Each file in this task's range gets a fresh sample budget of options.sample_size
		for (idx_t file_idx = file_idx_start; file_idx < file_idx_end; file_idx++) {
			ExecuteInternal(auto_detect_state, node, file_idx, allocator, string_vector, options.sample_size);
		}
	}
	string TaskType() const override {
		return "JSONSchemaTask";
	}
private:
	AutoDetectState &auto_detect_state;
	//! This task's private structure node (merged into the global node by the caller)
	JSONStructureNode &node;
	//! Half-open range of files handled by this task
	const idx_t file_idx_start;
	const idx_t file_idx_end;
	//! Scratch allocator/vector reused across files for candidate-type refinement
	ArenaAllocator allocator;
	Vector string_vector;
};
//! Auto-detects the schema of the JSON files in parallel: samples the files, merges the
//! detected structures, derives the record type, and fills in return_types/names
//! (unless they were already provided, e.g. by COPY).
void JSONScan::AutoDetect(ClientContext &context, MultiFileBindData &bind_data, vector<LogicalType> &return_types,
                          vector<string> &names) {
	auto &json_data = bind_data.bind_data->Cast<JSONScanData>();
	MutableDateFormatMap date_format_map(*json_data.date_format_map);
	JSONStructureNode node;
	auto &options = json_data.options;
	auto files = bind_data.file_list->GetAllFiles();
	// With union_by_name every file must be sampled; otherwise cap at maximum_sample_files
	auto file_count = bind_data.file_options.union_by_name
	                      ? files.size()
	                      : MinValue<idx_t>(options.maximum_sample_files, files.size());
	bind_data.union_readers.resize(files.empty() ? 0 : files.size());
	AutoDetectState auto_detect_state(context, bind_data, files, date_format_map);
	const auto num_threads = NumericCast<idx_t>(TaskScheduler::GetScheduler(context).NumberOfThreads());
	const auto files_per_task = (file_count + num_threads - 1) / num_threads;
	// NOTE(review): if file_count is 0, files_per_task is 0 and this divides by zero -
	// presumably an empty file list is rejected before reaching this point; confirm
	const auto num_tasks = (file_count + files_per_task - 1) / files_per_task;
	vector<JSONStructureNode> task_nodes(num_tasks);
	// Same idea as in union_by_name.hpp
	TaskExecutor executor(context);
	for (idx_t task_idx = 0; task_idx < num_tasks; task_idx++) {
		const auto file_idx_start = task_idx * files_per_task;
		const auto file_idx_end = MinValue(file_idx_start + files_per_task, file_count);
		auto task =
		    make_uniq<JSONSchemaTask>(executor, auto_detect_state, task_nodes[task_idx], file_idx_start, file_idx_end);
		executor.ScheduleTask(std::move(task));
	}
	executor.WorkOnTasks();
	// Merge task nodes into one
	for (auto &task_node : task_nodes) {
		JSONStructure::MergeNodes(node, task_node);
	}
	// set the max threads/estimated per-file cardinality
	if (auto_detect_state.files_scanned > 0 && auto_detect_state.tuples_scanned > 0) {
		auto average_tuple_size =
		    MaxValue<idx_t>(auto_detect_state.bytes_scanned / auto_detect_state.tuples_scanned, 1);
		json_data.estimated_cardinality_per_file = auto_detect_state.total_file_size / average_tuple_size;
		if (auto_detect_state.files_scanned == 1) {
			json_data.max_threads =
			    MaxValue<idx_t>(auto_detect_state.total_file_size / json_data.options.maximum_object_size, 1);
		}
	}
	// Convert structure to logical type
	auto type = JSONStructure::StructureToType(context, node, options.max_depth, options.field_appearance_threshold,
	                                           options.map_inference_threshold);
	// Auto-detect record type
	if (json_data.options.record_type == JSONRecordType::AUTO_DETECT) {
		if (type.id() == LogicalTypeId::STRUCT) {
			json_data.options.record_type = JSONRecordType::RECORDS;
		} else {
			json_data.options.record_type = JSONRecordType::VALUES;
		}
	}
	if (!names.empty()) {
		// COPY - we already have names/types
		return;
	}
	// Auto-detect columns
	if (json_data.options.record_type == JSONRecordType::RECORDS) {
		if (type.id() == LogicalTypeId::STRUCT) {
			// Each top-level struct field becomes an output column
			const auto &child_types = StructType::GetChildTypes(type);
			return_types.reserve(child_types.size());
			names.reserve(child_types.size());
			for (auto &child_type : child_types) {
				return_types.emplace_back(RemoveDuplicateStructKeys(child_type.second, options.ignore_errors));
				names.emplace_back(child_type.first);
			}
		} else {
			throw BinderException("json_read expected records, but got non-record JSON instead."
			                      "\n Try setting records='auto' or records='false'.");
		}
	} else {
		// Non-record mode: a single "json" column of the detected type
		D_ASSERT(json_data.options.record_type == JSONRecordType::VALUES);
		return_types.emplace_back(RemoveDuplicateStructKeys(type, options.ignore_errors));
		names.emplace_back("json");
	}
}
//! Build the base read_json table function: scan defaults plus the named
//! parameters shared by all read_json variants.
TableFunction JSONFunctions::GetReadJSONTableFunction(shared_ptr<JSONScanInfo> function_info) {
	MultiFileFunction<JSONMultiFileInfo> read_json("read_json");
	JSONScan::TableFunctionDefaults(read_json);

	// Schema-related parameters
	read_json.named_parameters["columns"] = LogicalType::ANY;
	read_json.named_parameters["auto_detect"] = LogicalType::BOOLEAN;
	read_json.named_parameters["records"] = LogicalType::VARCHAR;

	// Sampling parameters
	read_json.named_parameters["sample_size"] = LogicalType::BIGINT;
	read_json.named_parameters["maximum_sample_files"] = LogicalType::BIGINT;

	// Date/timestamp parsing (both spellings are accepted)
	read_json.named_parameters["dateformat"] = LogicalType::VARCHAR;
	read_json.named_parameters["date_format"] = LogicalType::VARCHAR;
	read_json.named_parameters["timestampformat"] = LogicalType::VARCHAR;
	read_json.named_parameters["timestamp_format"] = LogicalType::VARCHAR;

	// TODO: might be able to do filter pushdown/prune ?
	read_json.function_info = std::move(function_info);
	return static_cast<TableFunction>(read_json);
}
//! Wrap the base read_json table function with the schema-inference tuning
//! parameters and expose it as a multi-file function set under the given name.
TableFunctionSet CreateJSONFunctionInfo(string name, shared_ptr<JSONScanInfo> info) {
	auto function = JSONFunctions::GetReadJSONTableFunction(std::move(info));
	function.name = std::move(name);

	// Structure-inference tuning knobs
	function.named_parameters["maximum_depth"] = LogicalType::BIGINT;
	function.named_parameters["map_inference_threshold"] = LogicalType::BIGINT;
	function.named_parameters["field_appearance_threshold"] = LogicalType::DOUBLE;
	function.named_parameters["convert_strings_to_integers"] = LogicalType::BOOLEAN;

	return MultiFileReader::CreateFunctionSet(function);
}
//! read_json: auto-detect both the JSON layout and the record type.
TableFunctionSet JSONFunctions::GetReadJSONFunction() {
	return CreateJSONFunctionInfo("read_json",
	                              make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT,
	                                                            JSONRecordType::AUTO_DETECT, true));
}
//! read_ndjson: newline-delimited JSON, record type auto-detected.
TableFunctionSet JSONFunctions::GetReadNDJSONFunction() {
	return CreateJSONFunctionInfo("read_ndjson",
	                              make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED,
	                                                            JSONRecordType::AUTO_DETECT, true));
}
//! read_json_auto: legacy alias with the same behavior as read_json.
TableFunctionSet JSONFunctions::GetReadJSONAutoFunction() {
	return CreateJSONFunctionInfo("read_json_auto",
	                              make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT,
	                                                            JSONRecordType::AUTO_DETECT, true));
}
//! read_ndjson_auto: legacy alias with the same behavior as read_ndjson.
TableFunctionSet JSONFunctions::GetReadNDJSONAutoFunction() {
	return CreateJSONFunctionInfo("read_ndjson_auto",
	                              make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED,
	                                                            JSONRecordType::AUTO_DETECT, true));
}
} // namespace duckdb

View File

@@ -0,0 +1,37 @@
#include "json_common.hpp"
#include "json_functions.hpp"
#include "json_scan.hpp"
#include "duckdb/common/helper.hpp"
#include "json_multi_file_info.hpp"
namespace duckdb {
//! Build a read_json_objects-style table function: scan defaults only,
//! carrying the given scan info under the given name.
TableFunction GetReadJSONObjectsTableFunction(string name, shared_ptr<JSONScanInfo> function_info) {
	MultiFileFunction<JSONMultiFileInfo> objects_function(std::move(name));
	JSONScan::TableFunctionDefaults(objects_function);
	objects_function.function_info = std::move(function_info);
	return static_cast<TableFunction>(objects_function);
}
//! read_json_objects: one JSON value per row, format auto-detected.
TableFunctionSet JSONFunctions::GetReadJSONObjectsFunction() {
	auto scan_info = make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::AUTO_DETECT,
	                                               JSONRecordType::RECORDS);
	return MultiFileReader::CreateFunctionSet(
	    GetReadJSONObjectsTableFunction("read_json_objects", std::move(scan_info)));
}
//! read_ndjson_objects: one JSON value per row, newline-delimited input.
TableFunctionSet JSONFunctions::GetReadNDJSONObjectsFunction() {
	auto scan_info = make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::NEWLINE_DELIMITED,
	                                               JSONRecordType::RECORDS);
	return MultiFileReader::CreateFunctionSet(
	    GetReadJSONObjectsTableFunction("read_ndjson_objects", std::move(scan_info)));
}
//! read_json_objects_auto: legacy alias with the same behavior as read_json_objects.
TableFunctionSet JSONFunctions::GetReadJSONObjectsAutoFunction() {
	auto scan_info = make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::AUTO_DETECT,
	                                               JSONRecordType::RECORDS);
	return MultiFileReader::CreateFunctionSet(
	    GetReadJSONObjectsTableFunction("read_json_objects_auto", std::move(scan_info)));
}
} // namespace duckdb