should be it
48 external/duckdb/extension/json/CMakeLists.txt vendored Normal file
@@ -0,0 +1,48 @@
cmake_minimum_required(VERSION 2.8.12...3.29)

project(JSONExtension)

include_directories(include)
add_subdirectory(json_functions)

set(JSON_EXTENSION_FILES
    json_reader.cpp
    json_extension.cpp
    json_common.cpp
    json_enums.cpp
    json_functions.cpp
    json_multi_file_info.cpp
    json_scan.cpp
    json_serializer.cpp
    json_deserializer.cpp
    serialize_json.cpp
    json_functions/copy_json.cpp
    json_functions/json_array_length.cpp
    json_functions/json_contains.cpp
    json_functions/json_exists.cpp
    json_functions/json_extract.cpp
    json_functions/json_keys.cpp
    json_functions/json_merge_patch.cpp
    json_functions/json_pretty.cpp
    json_functions/json_table_in_out.cpp
    json_functions/json_structure.cpp
    json_functions/json_transform.cpp
    json_functions/json_create.cpp
    json_functions/json_type.cpp
    json_functions/json_valid.cpp
    json_functions/json_value.cpp
    json_functions/json_serialize_sql.cpp
    json_functions/json_serialize_plan.cpp
    json_functions/read_json.cpp
    json_functions/read_json_objects.cpp)

build_static_extension(json ${JSON_EXTENSION_FILES})
set(PARAMETERS "-warnings")
build_loadable_extension(json ${PARAMETERS} ${JSON_EXTENSION_FILES})
target_link_libraries(json_loadable_extension duckdb_yyjson)

install(
  TARGETS json_extension
  EXPORT "${DUCKDB_EXPORT_SET}"
  LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
  ARCHIVE DESTINATION "${INSTALL_LIB_DIR}")
36 external/duckdb/extension/json/include/json.json vendored Normal file
@@ -0,0 +1,36 @@
[
  {
    "class": "JSONTransformOptions",
    "includes": [
      "json_transform.hpp"
    ],
    "members": [
      {
        "id": 100,
        "name": "strict_cast",
        "type": "bool"
      },
      {
        "id": 101,
        "name": "error_duplicate_key",
        "type": "bool"
      },
      {
        "id": 102,
        "name": "error_missing_key",
        "type": "bool"
      },
      {
        "id": 103,
        "name": "error_unknown_key",
        "type": "bool"
      },
      {
        "id": 104,
        "name": "delay_error",
        "type": "bool"
      }
    ],
    "pointer_type": "none"
  }
]
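For context: this spec is consumed by DuckDB's serialization generator, which emits the member-wise Serialize/Deserialize pair in serialize_json.cpp (listed in the CMakeLists above). A rough sketch of the shape of that generated code, as an editorial illustration only; the authoritative version is the generated file itself, and the exact Serializer/Deserializer helper names are an assumption here:

// Editorial sketch; field ids and tags come straight from the spec above.
void JSONTransformOptions::Serialize(Serializer &serializer) const {
	serializer.WritePropertyWithDefault<bool>(100, "strict_cast", strict_cast);
	serializer.WritePropertyWithDefault<bool>(101, "error_duplicate_key", error_duplicate_key);
	serializer.WritePropertyWithDefault<bool>(102, "error_missing_key", error_missing_key);
	serializer.WritePropertyWithDefault<bool>(103, "error_unknown_key", error_unknown_key);
	serializer.WritePropertyWithDefault<bool>(104, "delay_error", delay_error);
}

JSONTransformOptions JSONTransformOptions::Deserialize(Deserializer &deserializer) {
	JSONTransformOptions result;
	result.strict_cast = deserializer.ReadPropertyWithDefault<bool>(100, "strict_cast");
	result.error_duplicate_key = deserializer.ReadPropertyWithDefault<bool>(101, "error_duplicate_key");
	result.error_missing_key = deserializer.ReadPropertyWithDefault<bool>(102, "error_missing_key");
	result.error_unknown_key = deserializer.ReadPropertyWithDefault<bool>(103, "error_unknown_key");
	result.delay_error = deserializer.ReadPropertyWithDefault<bool>(104, "delay_error");
	return result;
}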
388 external/duckdb/extension/json/include/json_common.hpp vendored Normal file
@@ -0,0 +1,388 @@
//===----------------------------------------------------------------------===//
//                         DuckDB
//
// json_common.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "duckdb/common/operator/cast_operators.hpp"
#include "duckdb/common/operator/decimal_cast_operators.hpp"
#include "duckdb/common/operator/string_cast.hpp"
#include "duckdb/planner/expression/bound_function_expression.hpp"
#include "yyjson.hpp"
#include "duckdb/common/types/blob.hpp"

using namespace duckdb_yyjson; // NOLINT

namespace duckdb {

class JSONAllocator;

class JSONStringVectorBuffer : public VectorBuffer {
public:
	explicit JSONStringVectorBuffer(shared_ptr<JSONAllocator> allocator_p)
	    : VectorBuffer(VectorBufferType::OPAQUE_BUFFER), allocator(std::move(allocator_p)) {
	}

private:
	shared_ptr<JSONAllocator> allocator;
};

//! JSON allocator is a custom allocator for yyjson that prevents many tiny allocations
class JSONAllocator : public enable_shared_from_this<JSONAllocator> {
public:
	explicit JSONAllocator(Allocator &allocator)
	    : arena_allocator(allocator), yyjson_allocator({Allocate, Reallocate, Free, this}) {
	}

	inline yyjson_alc *GetYYAlc() {
		return &yyjson_allocator;
	}

	void Reset() {
		arena_allocator.Reset();
	}

	void AddBuffer(Vector &vector) {
		if (vector.GetType().InternalType() == PhysicalType::VARCHAR) {
			StringVector::AddBuffer(vector, make_buffer<JSONStringVectorBuffer>(shared_from_this()));
		}
	}

	static void AddBuffer(Vector &vector, yyjson_alc *alc) {
		auto alloc = (JSONAllocator *)alc->ctx; // NOLINT
		alloc->AddBuffer(vector);
	}

private:
	static inline void *Allocate(void *ctx, size_t size) {
		auto alloc = (JSONAllocator *)ctx; // NOLINT
		return alloc->arena_allocator.AllocateAligned(size);
	}

	static inline void *Reallocate(void *ctx, void *ptr, size_t old_size, size_t size) {
		auto alloc = (JSONAllocator *)ctx; // NOLINT
		return alloc->arena_allocator.ReallocateAligned(data_ptr_cast(ptr), old_size, size);
	}

	static inline void Free(void *ctx, void *ptr) {
		// NOP because ArenaAllocator can't free
	}

private:
	ArenaAllocator arena_allocator;
	yyjson_alc yyjson_allocator;
};

//! JSONKey / json_key_map_t speeds up mapping from JSON key to column ID
struct JSONKey {
	const char *ptr;
	size_t len;
};

struct JSONKeyHash {
	inline std::size_t operator()(const JSONKey &k) const {
		size_t result;
		if (k.len >= sizeof(size_t)) {
			memcpy(&result, k.ptr + k.len - sizeof(size_t), sizeof(size_t));
		} else {
			result = 0;
			FastMemcpy(&result, k.ptr, k.len);
		}
		return result;
	}
};

struct JSONKeyEquality {
	inline bool operator()(const JSONKey &a, const JSONKey &b) const {
		if (a.len != b.len) {
			return false;
		}
		return FastMemcmp(a.ptr, b.ptr, a.len) == 0;
	}
};

template <typename T>
using json_key_map_t = unordered_map<JSONKey, T, JSONKeyHash, JSONKeyEquality>;
using json_key_set_t = unordered_set<JSONKey, JSONKeyHash, JSONKeyEquality>;

//! Common JSON functionality for most JSON functions
struct JSONCommon {
public:
	//! Read/Write flags
	static constexpr auto READ_FLAG =
	    YYJSON_READ_ALLOW_INF_AND_NAN | YYJSON_READ_ALLOW_TRAILING_COMMAS | YYJSON_READ_BIGNUM_AS_RAW;
	static constexpr auto READ_STOP_FLAG = READ_FLAG | YYJSON_READ_STOP_WHEN_DONE;
	static constexpr auto READ_INSITU_FLAG = READ_STOP_FLAG | YYJSON_READ_INSITU;
	static constexpr auto WRITE_FLAG = YYJSON_WRITE_ALLOW_INF_AND_NAN;
	static constexpr auto WRITE_PRETTY_FLAG = YYJSON_WRITE_ALLOW_INF_AND_NAN | YYJSON_WRITE_PRETTY;

public:
	//! Constant JSON type strings
	static constexpr char const *TYPE_STRING_NULL = "NULL";
	static constexpr char const *TYPE_STRING_BOOLEAN = "BOOLEAN";
	static constexpr char const *TYPE_STRING_BIGINT = "BIGINT";
	static constexpr char const *TYPE_STRING_UBIGINT = "UBIGINT";
	static constexpr char const *TYPE_STRING_DOUBLE = "DOUBLE";
	static constexpr char const *TYPE_STRING_HUGEINT = "HUGEINT";
	static constexpr char const *TYPE_STRING_VARCHAR = "VARCHAR";
	static constexpr char const *TYPE_STRING_ARRAY = "ARRAY";
	static constexpr char const *TYPE_STRING_OBJECT = "OBJECT";

	static inline const char *ValTypeToString(yyjson_val *val) {
		switch (yyjson_get_tag(val)) {
		case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
			return TYPE_STRING_NULL;
		case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NOESC:
		case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
			return TYPE_STRING_VARCHAR;
		case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
			return TYPE_STRING_ARRAY;
		case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
			return TYPE_STRING_OBJECT;
		case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_TRUE:
		case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_FALSE:
			return TYPE_STRING_BOOLEAN;
		case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_UINT:
			return TYPE_STRING_UBIGINT;
		case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_SINT:
			return TYPE_STRING_BIGINT;
		case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_REAL:
		case YYJSON_TYPE_RAW | YYJSON_SUBTYPE_NONE:
			return TYPE_STRING_DOUBLE;
		default:
			throw InternalException("Unexpected yyjson tag in ValTypeToString");
		}
	}

	static inline string_t ValTypeToStringT(yyjson_val *val) {
		return string_t(ValTypeToString(val));
	}

	static inline LogicalTypeId ValTypeToLogicalTypeId(yyjson_val *val) {
		switch (yyjson_get_tag(val)) {
		case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
			return LogicalTypeId::SQLNULL;
		case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NOESC:
		case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
			return LogicalTypeId::VARCHAR;
		case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
			return LogicalTypeId::LIST;
		case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
			return LogicalTypeId::STRUCT;
		case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_TRUE:
		case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_FALSE:
			return LogicalTypeId::BOOLEAN;
		case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_UINT:
			return LogicalTypeId::UBIGINT;
		case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_SINT:
			return LogicalTypeId::BIGINT;
		case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_REAL:
		case YYJSON_TYPE_RAW | YYJSON_SUBTYPE_NONE:
			return LogicalTypeId::DOUBLE;
		default:
			throw InternalException("Unexpected yyjson tag in ValTypeToLogicalTypeId");
		}
	}

public:
	//===--------------------------------------------------------------------===//
	// Document creation / reading / writing
	//===--------------------------------------------------------------------===//
	template <class T>
	static T *AllocateArray(yyjson_alc *alc, idx_t count) {
		return reinterpret_cast<T *>(alc->malloc(alc->ctx, sizeof(T) * count));
	}

	template <class T>
	static T *AllocateArray(yyjson_mut_doc *doc, idx_t count) {
		return AllocateArray<T>(&doc->alc, count);
	}

	static inline yyjson_mut_doc *CreateDocument(yyjson_alc *alc) {
		D_ASSERT(alc);
		return yyjson_mut_doc_new(alc);
	}
	static inline yyjson_doc *ReadDocumentUnsafe(char *data, idx_t size, const yyjson_read_flag flg, yyjson_alc *alc,
	                                             yyjson_read_err *err = nullptr) {
		D_ASSERT(alc);
		return yyjson_read_opts(data, size, flg, alc, err);
	}
	static inline yyjson_doc *ReadDocumentUnsafe(const string_t &input, const yyjson_read_flag flg, yyjson_alc *alc,
	                                             yyjson_read_err *err = nullptr) {
		return ReadDocumentUnsafe(input.GetDataWriteable(), input.GetSize(), flg, alc, err);
	}
	static inline yyjson_doc *ReadDocument(char *data, idx_t size, const yyjson_read_flag flg, yyjson_alc *alc) {
		yyjson_read_err error;
		auto result = ReadDocumentUnsafe(data, size, flg, alc, &error);
		if (error.code != YYJSON_READ_SUCCESS) {
			ThrowParseError(data, size, error);
		}
		return result;
	}
	static inline yyjson_doc *ReadDocument(const string_t &input, const yyjson_read_flag flg, yyjson_alc *alc) {
		return ReadDocument(input.GetDataWriteable(), input.GetSize(), flg, alc);
	}

	static string FormatParseError(const char *data, idx_t length, yyjson_read_err &error, const string &extra = "") {
		D_ASSERT(error.code != YYJSON_READ_SUCCESS);
		// Truncate, so we don't print megabytes worth of JSON
		auto input = length > 50 ? string(data, 47) + "..." : string(data, length);
		// Have to replace \r, otherwise output is unreadable
		input = StringUtil::Replace(input, "\r", "\\r");
		return StringUtil::Format("Malformed JSON at byte %lld of input: %s. %s Input: \"%s\"", error.pos, error.msg,
		                          extra, input);
	}
	static void ThrowParseError(const char *data, idx_t length, yyjson_read_err &error, const string &extra = "") {
		throw InvalidInputException(FormatParseError(data, length, error, extra));
	}

	template <class YYJSON_VAL_T>
	static inline char *WriteVal(YYJSON_VAL_T *val, yyjson_alc *alc, idx_t &len) {
		throw InternalException("Unknown yyjson val type");
	}
	template <class YYJSON_VAL_T>
	static inline string_t WriteVal(YYJSON_VAL_T *val, yyjson_alc *alc) {
		D_ASSERT(alc);
		idx_t len;
		auto data = WriteVal<YYJSON_VAL_T>(val, alc, len);
		return string_t(data, len);
	}

	//! Slow and easy ToString for errors
	static string ValToString(yyjson_val *val, idx_t max_len = DConstants::INVALID_INDEX);
	//! Throw an error with the printed yyjson_val
	static void ThrowValFormatError(string error_string, yyjson_val *val);

public:
	//===--------------------------------------------------------------------===//
	// JSON pointer / path
	//===--------------------------------------------------------------------===//
	enum class JSONPathType : uint8_t {
		//! Extract a single value
		REGULAR = 0,
		//! Extract multiple values (when we have a '*' wildcard in the JSON Path)
		WILDCARD = 1,
	};

	//! Get JSON value using JSON path query (safe, checks the path query)
	static inline yyjson_val *Get(yyjson_val *val, const string_t &path_str, bool integral_argument) {
		auto ptr = path_str.GetData();
		auto len = path_str.GetSize();
		if (len == 0) {
			return GetUnsafe(val, ptr, len);
		}
		if (integral_argument) {
			auto str = "$[" + path_str.GetString() + "]";
			return GetUnsafe(val, str.c_str(), str.length());
		}
		switch (*ptr) {
		case '/': {
			// '/' notation must be '\0'-terminated
			auto str = string(ptr, len);
			return GetUnsafe(val, str.c_str(), len);
		}
		case '$': {
			if (ValidatePath(ptr, len, false) == JSONPathType::WILDCARD) {
				throw InvalidInputException(
				    "JSON path cannot contain wildcards if the path is not a constant parameter");
			}
			return GetUnsafe(val, ptr, len);
		}
		default: {
			string path;
			if (memchr(ptr, '"', len)) {
				path = "/" + string(ptr, len);
			} else {
				path = "$.\"" + path_str.GetString() + "\"";
			}
			return GetUnsafe(val, path.c_str(), path.length());
		}
		}
	}

	//! Get JSON value using JSON path query (unsafe)
	static inline yyjson_val *GetUnsafe(yyjson_val *val, const char *ptr, const idx_t &len) {
		if (len == 0) {
			return val;
		}
		switch (*ptr) {
		case '/':
			return GetPointer(val, ptr, len);
		case '$':
			return GetPath(val, ptr, len);
		default:
			throw InternalException("JSON pointer/path does not start with '/' or '$'");
		}
	}

	//! Get JSON values using a wildcard JSON path query (unsafe)
	static void GetWildcardPath(yyjson_val *val, const char *ptr, const idx_t &len, vector<yyjson_val *> &vals);

	//! Validate JSON Path ($.field[index]... syntax); returns WILDCARD if the path contains wildcards
	static JSONPathType ValidatePath(const char *ptr, const idx_t &len, const bool binder);

public:
	//! Same as BigQuery json_value
	static inline string_t JSONValue(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &mask, idx_t idx) {
		switch (yyjson_get_tag(val)) {
		case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
		case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
		case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
			mask.SetInvalid(idx);
			return string_t {};
		default:
			return JSONCommon::WriteVal<yyjson_val>(val, alc);
		}
	}

private:
	//! Get JSON pointer (/field/index/... syntax)
	static inline yyjson_val *GetPointer(yyjson_val *val, const char *ptr, const idx_t &len) {
		yyjson_ptr_err err;
		return unsafe_yyjson_ptr_getx(val, ptr, len, &err);
	}
	//! Get JSON path ($.field[index]... syntax)
	static yyjson_val *GetPath(yyjson_val *val, const char *ptr, const idx_t &len);
};

template <>
inline char *JSONCommon::WriteVal(yyjson_val *val, yyjson_alc *alc, idx_t &len) {
	size_t len_size_t;
	// yyjson_val_write_opts must not throw
	auto ret = yyjson_val_write_opts(val, JSONCommon::WRITE_FLAG, alc, &len_size_t, nullptr);
	len = len_size_t;
	return ret;
}
template <>
inline char *JSONCommon::WriteVal(yyjson_mut_val *val, yyjson_alc *alc, idx_t &len) {
	size_t len_size_t;
	// yyjson_mut_val_write_opts must not throw
	auto ret = yyjson_mut_val_write_opts(val, JSONCommon::WRITE_FLAG, alc, &len_size_t, nullptr);
	len = len_size_t;
	return ret;
}

struct yyjson_doc_deleter {
	void operator()(yyjson_doc *doc) {
		if (doc) {
			yyjson_doc_free(doc);
		}
	}
};

struct yyjson_mut_doc_deleter {
	void operator()(yyjson_mut_doc *doc) {
		if (doc) {
			yyjson_mut_doc_free(doc);
		}
	}
};

using yyjson_doc_ptr = unique_ptr<yyjson_doc, yyjson_doc_deleter>;
using yyjson_mut_doc_ptr = unique_ptr<yyjson_mut_doc, yyjson_mut_doc_deleter>;

} // namespace duckdb
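For context, a minimal usage sketch of the helpers above (editor's illustration, not part of the commit): parse a JSON string through the arena-backed allocator, then serialize the root value back out. It assumes a duckdb::Allocator is at hand; real callers keep the JSONAllocator in a shared_ptr so JSONStringVectorBuffer can tie its lifetime to result vectors.

#include "json_common.hpp"

namespace duckdb {

static string RoundTripJSON(Allocator &allocator, string input) {
	// Arena-backed allocator; all yyjson allocations below come from its arena
	JSONAllocator json_allocator(allocator);
	yyjson_alc *alc = json_allocator.GetYYAlc();
	// Throws InvalidInputException (via ThrowParseError) on malformed input
	yyjson_doc *doc = JSONCommon::ReadDocument(&input[0], input.size(), JSONCommon::READ_FLAG, alc);
	// Serialize the root value; the string data lives in the same arena
	string_t written = JSONCommon::WriteVal<yyjson_val>(doc->root, alc);
	return written.GetString();
}

} // namespace duckdb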
82 external/duckdb/extension/json/include/json_deserializer.hpp vendored Normal file
@@ -0,0 +1,82 @@
#pragma once
#include "json_common.hpp"
#include "duckdb/common/serializer/deserializer.hpp"

namespace duckdb {

class JsonDeserializer : public Deserializer {
public:
	JsonDeserializer(yyjson_val *val, const yyjson_doc_ptr &doc) : doc(doc.get()) {
		deserialize_enum_from_string = true;
		stack.emplace_back(val);
	}

private:
	struct StackFrame {
		yyjson_val *val;
		yyjson_arr_iter arr_iter;
		explicit StackFrame(yyjson_val *val) : val(val) {
			yyjson_arr_iter_init(val, &arr_iter);
		}
	};

	yyjson_doc *doc;
	const char *current_tag = nullptr;
	vector<StackFrame> stack;

	void DumpDoc();
	void DumpCurrent();
	void Dump(yyjson_mut_val *val);
	void Dump(yyjson_val *val);

	// Get the current json value
	inline StackFrame &Current() {
		return stack.back();
	};

	inline void Push(yyjson_val *val) {
		stack.emplace_back(val);
	}
	inline void Pop() {
		stack.pop_back();
	}
	yyjson_val *GetNextValue();

	void ThrowTypeError(yyjson_val *val, const char *expected);

	//===--------------------------------------------------------------------===//
	// Nested Types Hooks
	//===--------------------------------------------------------------------===//
	void OnPropertyBegin(const field_id_t field_id, const char *tag) final;
	void OnPropertyEnd() final;
	bool OnOptionalPropertyBegin(const field_id_t field_id, const char *tag) final;
	void OnOptionalPropertyEnd(bool present) final;

	void OnObjectBegin() final;
	void OnObjectEnd() final;
	idx_t OnListBegin() final;
	void OnListEnd() final;
	bool OnNullableBegin() final;
	void OnNullableEnd() final;

	//===--------------------------------------------------------------------===//
	// Primitive Types
	//===--------------------------------------------------------------------===//
	bool ReadBool() final;
	int8_t ReadSignedInt8() final;
	uint8_t ReadUnsignedInt8() final;
	int16_t ReadSignedInt16() final;
	uint16_t ReadUnsignedInt16() final;
	int32_t ReadSignedInt32() final;
	uint32_t ReadUnsignedInt32() final;
	int64_t ReadSignedInt64() final;
	uint64_t ReadUnsignedInt64() final;
	float ReadFloat() final;
	double ReadDouble() final;
	string ReadString() final;
	hugeint_t ReadHugeInt() final;
	uhugeint_t ReadUhugeInt() final;
	void ReadDataPtr(data_ptr_t &ptr, idx_t count) final;
};

} // namespace duckdb
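A short usage sketch (editor's illustration): the deserializer wraps a parsed yyjson document, and a generated Deserialize method such as JSONTransformOptions::Deserialize (per the json.json spec earlier in this commit) walks it via the hooks above. The data, size, and alc locals are assumed to be in scope; the Deserialize signature is an assumption based on the serialization spec.

// Parse, then hand the root value plus the owning document to the deserializer
yyjson_doc_ptr doc(JSONCommon::ReadDocument(data, size, JSONCommon::READ_FLAG, alc));
JsonDeserializer deserializer(yyjson_doc_get_root(doc.get()), doc);
auto transform_options = JSONTransformOptions::Deserialize(deserializer);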
60 external/duckdb/extension/json/include/json_enums.hpp vendored Normal file
@@ -0,0 +1,60 @@
//===----------------------------------------------------------------------===//
// This file is automatically generated by scripts/generate_enums.py
// Do not edit this file manually, your changes will be overwritten
//===----------------------------------------------------------------------===//

#pragma once

#include "duckdb/common/constants.hpp"
#include "duckdb/common/enum_util.hpp"

namespace duckdb {

enum class JSONScanType : uint8_t {
	INVALID = 0,
	//! Read JSON straight to columnar data
	READ_JSON = 1,
	//! Read JSON values as strings
	READ_JSON_OBJECTS = 2,
	//! Sample run for schema detection
	SAMPLE = 3,
};

enum class JSONRecordType : uint8_t {
	AUTO_DETECT = 0,
	//! Sequential objects that are unpacked
	RECORDS = 1,
	//! Any other JSON type, e.g., ARRAY
	VALUES = 2,
};

enum class JSONFormat : uint8_t {
	//! Auto-detect format (UNSTRUCTURED / NEWLINE_DELIMITED)
	AUTO_DETECT = 0,
	//! One unit after another, newlines can be anywhere
	UNSTRUCTURED = 1,
	//! Units are separated by newlines, newlines do not occur within Units (NDJSON)
	NEWLINE_DELIMITED = 2,
	//! File is one big array of units
	ARRAY = 3,
};

template<>
const char* EnumUtil::ToChars<JSONScanType>(JSONScanType value);

template<>
JSONScanType EnumUtil::FromString<JSONScanType>(const char *value);

template<>
const char* EnumUtil::ToChars<JSONRecordType>(JSONRecordType value);

template<>
JSONRecordType EnumUtil::FromString<JSONRecordType>(const char *value);

template<>
const char* EnumUtil::ToChars<JSONFormat>(JSONFormat value);

template<>
JSONFormat EnumUtil::FromString<JSONFormat>(const char *value);

} // namespace duckdb
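The specializations above plug these enums into DuckDB's generic EnumUtil facility, so option parsing and error messages can convert between enum values and their names. A brief sketch (editor's illustration; the generated definitions live in json_enums.cpp, listed in the CMakeLists above):

// Convert to/from the enum's name string
auto chars = EnumUtil::ToChars(JSONFormat::NEWLINE_DELIMITED); // "NEWLINE_DELIMITED"
auto parsed = EnumUtil::FromString<JSONFormat>("ARRAY");       // JSONFormat::ARRAY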
55 external/duckdb/extension/json/include/json_enums.json vendored Normal file
@@ -0,0 +1,55 @@
[
  {
    "name": "JSONScanType",
    "values": [
      "INVALID",
      {
        "name": "READ_JSON",
        "comment": "Read JSON straight to columnar data"
      },
      {
        "name": "READ_JSON_OBJECTS",
        "comment": "Read JSON values as strings"
      },
      {
        "name": "SAMPLE",
        "comment": "Sample run for schema detection"
      }
    ]
  },
  {
    "name": "JSONRecordType",
    "values": [
      "AUTO_DETECT",
      {
        "name": "RECORDS",
        "comment": "Sequential objects that are unpacked"
      },
      {
        "name": "VALUES",
        "comment": "Any other JSON type, e.g., ARRAY"
      }
    ]
  },
  {
    "name": "JSONFormat",
    "values": [
      {
        "name": "AUTO_DETECT",
        "comment": "Auto-detect format (UNSTRUCTURED / NEWLINE_DELIMITED)"
      },
      {
        "name": "UNSTRUCTURED",
        "comment": "One unit after another, newlines can be anywhere"
      },
      {
        "name": "NEWLINE_DELIMITED",
        "comment": "Units are separated by newlines, newlines do not occur within Units (NDJSON)"
      },
      {
        "name": "ARRAY",
        "comment": "File is one big array of units"
      }
    ]
  }
]
180 external/duckdb/extension/json/include/json_executors.hpp vendored Normal file
@@ -0,0 +1,180 @@
//===----------------------------------------------------------------------===//
//                         DuckDB
//
// json_executors.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "duckdb/common/vector_operations/vector_operations.hpp"
#include "duckdb/execution/expression_executor.hpp"
#include "json_functions.hpp"

namespace duckdb {

template <class T>
using json_function_t = std::function<T(yyjson_val *, yyjson_alc *, Vector &, ValidityMask &, idx_t)>;

struct JSONExecutors {
public:
	//! Single-argument JSON read function, i.e. json_type('[1, 2, 3]')
	template <class T>
	static void UnaryExecute(DataChunk &args, ExpressionState &state, Vector &result, const json_function_t<T> fun) {
		auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
		auto alc = lstate.json_allocator->GetYYAlc();

		auto &inputs = args.data[0];
		UnaryExecutor::ExecuteWithNulls<string_t, T>(
		    inputs, result, args.size(), [&](string_t input, ValidityMask &mask, idx_t idx) {
			    auto doc = JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc);
			    return fun(doc->root, alc, result, mask, idx);
		    });

		JSONAllocator::AddBuffer(result, alc);
	}

	//! Two-argument JSON read function (with path query), i.e. json_type('[1, 2, 3]', '$[0]')
	template <class T, bool SET_NULL_IF_NOT_FOUND = true>
	static void BinaryExecute(DataChunk &args, ExpressionState &state, Vector &result, const json_function_t<T> fun) {
		auto &func_expr = state.expr.Cast<BoundFunctionExpression>();
		const auto &info = func_expr.bind_info->Cast<JSONReadFunctionData>();
		auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
		auto alc = lstate.json_allocator->GetYYAlc();

		auto &inputs = args.data[0];
		if (info.constant) { // Constant path
			const char *ptr = info.ptr;
			const idx_t &len = info.len;
			if (info.path_type == JSONCommon::JSONPathType::REGULAR) {
				UnaryExecutor::ExecuteWithNulls<string_t, T>(
				    inputs, result, args.size(), [&](string_t input, ValidityMask &mask, idx_t idx) {
					    auto doc = JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc);
					    auto val = JSONCommon::GetUnsafe(doc->root, ptr, len);
					    if (SET_NULL_IF_NOT_FOUND && !val) {
						    mask.SetInvalid(idx);
						    return T {};
					    } else {
						    return fun(val, alc, result, mask, idx);
					    }
				    });
			} else {
				D_ASSERT(info.path_type == JSONCommon::JSONPathType::WILDCARD);
				vector<yyjson_val *> vals;
				UnaryExecutor::Execute<string_t, list_entry_t>(inputs, result, args.size(), [&](string_t input) {
					vals.clear();

					auto doc = JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc);
					JSONCommon::GetWildcardPath(doc->root, ptr, len, vals);

					auto current_size = ListVector::GetListSize(result);
					auto new_size = current_size + vals.size();
					if (ListVector::GetListCapacity(result) < new_size) {
						ListVector::Reserve(result, new_size);
					}

					auto &child_entry = ListVector::GetEntry(result);
					auto child_vals = FlatVector::GetData<T>(child_entry);
					auto &child_validity = FlatVector::Validity(child_entry);
					for (idx_t i = 0; i < vals.size(); i++) {
						auto &val = vals[i];
						D_ASSERT(val != nullptr); // Wildcard extract shouldn't give back nullptrs
						child_vals[current_size + i] = fun(val, alc, result, child_validity, current_size + i);
					}

					ListVector::SetListSize(result, new_size);

					return list_entry_t {current_size, vals.size()};
				});
			}
		} else { // Columnref path
			D_ASSERT(info.path_type == JSONCommon::JSONPathType::REGULAR);
			unique_ptr<Vector> casted_paths;
			if (args.data[1].GetType().id() == LogicalTypeId::VARCHAR) {
				casted_paths = make_uniq<Vector>(args.data[1]);
			} else {
				casted_paths = make_uniq<Vector>(LogicalTypeId::VARCHAR);
				VectorOperations::DefaultCast(args.data[1], *casted_paths, args.size(), true);
			}
			BinaryExecutor::ExecuteWithNulls<string_t, string_t, T>(
			    inputs, *casted_paths, result, args.size(),
			    [&](string_t input, string_t path, ValidityMask &mask, idx_t idx) {
				    auto doc = JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc);
				    auto val = JSONCommon::Get(doc->root, path, args.data[1].GetType().IsIntegral());
				    if (SET_NULL_IF_NOT_FOUND && !val) {
					    mask.SetInvalid(idx);
					    return T {};
				    } else {
					    return fun(val, alc, result, mask, idx);
				    }
			    });
		}
		if (args.AllConstant()) {
			result.SetVectorType(VectorType::CONSTANT_VECTOR);
		}

		JSONAllocator::AddBuffer(result, alc);
	}

	//! JSON read function with list of path queries, i.e. json_type('[1, 2, 3]', ['$[0]', '$[1]'])
	template <class T, bool SET_NULL_IF_NOT_FOUND = true>
	static void ExecuteMany(DataChunk &args, ExpressionState &state, Vector &result, const json_function_t<T> fun) {
		auto &func_expr = state.expr.Cast<BoundFunctionExpression>();
		const auto &info = func_expr.bind_info->Cast<JSONReadManyFunctionData>();
		auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
		auto alc = lstate.json_allocator->GetYYAlc();
		D_ASSERT(info.ptrs.size() == info.lens.size());

		const auto count = args.size();
		const idx_t num_paths = info.ptrs.size();
		const idx_t list_size = count * num_paths;

		UnifiedVectorFormat input_data;
		auto &input_vector = args.data[0];
		input_vector.ToUnifiedFormat(count, input_data);
		auto inputs = UnifiedVectorFormat::GetData<string_t>(input_data);

		ListVector::Reserve(result, list_size);
		auto list_entries = FlatVector::GetData<list_entry_t>(result);
		auto &list_validity = FlatVector::Validity(result);

		auto &child = ListVector::GetEntry(result);
		auto child_data = FlatVector::GetData<T>(child);
		auto &child_validity = FlatVector::Validity(child);

		idx_t offset = 0;
		yyjson_val *val;
		for (idx_t i = 0; i < count; i++) {
			auto idx = input_data.sel->get_index(i);
			if (!input_data.validity.RowIsValid(idx)) {
				list_validity.SetInvalid(i);
				continue;
			}

			auto doc = JSONCommon::ReadDocument(inputs[idx], JSONCommon::READ_FLAG, alc);
			for (idx_t path_i = 0; path_i < num_paths; path_i++) {
				auto child_idx = offset + path_i;
				val = JSONCommon::GetUnsafe(doc->root, info.ptrs[path_i], info.lens[path_i]);
				if (SET_NULL_IF_NOT_FOUND && !val) {
					child_validity.SetInvalid(child_idx);
				} else {
					child_data[child_idx] = fun(val, alc, child, child_validity, child_idx);
				}
			}

			list_entries[i].offset = offset;
			list_entries[i].length = num_paths;
			offset += num_paths;
		}
		ListVector::SetListSize(result, offset);

		if (args.AllConstant()) {
			result.SetVectorType(VectorType::CONSTANT_VECTOR);
		}

		JSONAllocator::AddBuffer(result, alc);
	}
};

} // namespace duckdb
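For context, a sketch of how these executors are typically wired up (editor's illustration, not the commit's own code): a scalar function body like json_type(json) reduces to UnaryExecute plus a lambda matching json_function_t; JSONCommon::ValTypeToStringT comes from json_common.hpp above.

#include "json_executors.hpp"

namespace duckdb {

static void JSONTypeSketch(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::UnaryExecute<string_t>(
	    args, state, result,
	    [](yyjson_val *val, yyjson_alc *alc, Vector &vec, ValidityMask &mask, idx_t idx) {
		    // Map the yyjson tag of the parsed root value to a constant type string
		    return JSONCommon::ValTypeToStringT(val);
	    });
}

} // namespace duckdb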
22 external/duckdb/extension/json/include/json_extension.hpp vendored Normal file
@@ -0,0 +1,22 @@
//===----------------------------------------------------------------------===//
//                         DuckDB
//
// json_extension.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "duckdb.hpp"

namespace duckdb {

class JsonExtension : public Extension {
public:
	void Load(ExtensionLoader &db) override;
	std::string Name() override;
	std::string Version() const override;
};

} // namespace duckdb
138 external/duckdb/extension/json/include/json_functions.hpp vendored Normal file
@@ -0,0 +1,138 @@
//===----------------------------------------------------------------------===//
//                         DuckDB
//
// json_functions.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "duckdb/main/extension/extension_loader.hpp"
#include "json_common.hpp"

namespace duckdb {

class TableRef;
struct ReplacementScanData;
class CastFunctionSet;
struct CastParameters;
struct CastLocalStateParameters;
struct JSONScanInfo;
class BuiltinFunctions;

// Scalar function stuff
struct JSONReadFunctionData : public FunctionData {
public:
	JSONReadFunctionData(bool constant, string path_p, idx_t len, JSONCommon::JSONPathType path_type);
	unique_ptr<FunctionData> Copy() const override;
	bool Equals(const FunctionData &other_p) const override;
	static JSONCommon::JSONPathType CheckPath(const Value &path_val, string &path, idx_t &len);
	static unique_ptr<FunctionData> Bind(ClientContext &context, ScalarFunction &bound_function,
	                                     vector<unique_ptr<Expression>> &arguments);

public:
	const bool constant;
	const string path;
	const JSONCommon::JSONPathType path_type;
	const char *ptr;
	const idx_t len;
};

struct JSONReadManyFunctionData : public FunctionData {
public:
	JSONReadManyFunctionData(vector<string> paths_p, vector<idx_t> lens_p);
	unique_ptr<FunctionData> Copy() const override;
	bool Equals(const FunctionData &other_p) const override;
	static unique_ptr<FunctionData> Bind(ClientContext &context, ScalarFunction &bound_function,
	                                     vector<unique_ptr<Expression>> &arguments);

public:
	const vector<string> paths;
	vector<const char *> ptrs;
	const vector<idx_t> lens;
};

struct JSONFunctionLocalState : public FunctionLocalState {
public:
	explicit JSONFunctionLocalState(Allocator &allocator);
	explicit JSONFunctionLocalState(ClientContext &context);
	static unique_ptr<FunctionLocalState> Init(ExpressionState &state, const BoundFunctionExpression &expr,
	                                           FunctionData *bind_data);
	static unique_ptr<FunctionLocalState> InitCastLocalState(CastLocalStateParameters &parameters);
	static JSONFunctionLocalState &ResetAndGet(ExpressionState &state);

public:
	shared_ptr<JSONAllocator> json_allocator;
};

class JSONFunctions {
public:
	static vector<ScalarFunctionSet> GetScalarFunctions();
	static vector<PragmaFunctionSet> GetPragmaFunctions();
	static vector<TableFunctionSet> GetTableFunctions();
	static unique_ptr<TableRef> ReadJSONReplacement(ClientContext &context, ReplacementScanInput &input,
	                                                optional_ptr<ReplacementScanData> data);
	static TableFunction GetReadJSONTableFunction(shared_ptr<JSONScanInfo> function_info);
	static CopyFunction GetJSONCopyFunction();
	static void RegisterSimpleCastFunctions(ExtensionLoader &loader);
	static void RegisterJSONCreateCastFunctions(ExtensionLoader &loader);
	static void RegisterJSONTransformCastFunctions(ExtensionLoader &loader);

private:
	// Scalar functions
	static ScalarFunctionSet GetExtractFunction();
	static ScalarFunctionSet GetExtractStringFunction();

	static ScalarFunctionSet GetArrayFunction();
	static ScalarFunctionSet GetObjectFunction();
	static ScalarFunctionSet GetToJSONFunction();
	static ScalarFunctionSet GetArrayToJSONFunction();
	static ScalarFunctionSet GetRowToJSONFunction();
	static ScalarFunctionSet GetMergePatchFunction();

	static ScalarFunctionSet GetStructureFunction();
	static ScalarFunctionSet GetTransformFunction();
	static ScalarFunctionSet GetTransformStrictFunction();

	static ScalarFunctionSet GetArrayLengthFunction();
	static ScalarFunctionSet GetContainsFunction();
	static ScalarFunctionSet GetExistsFunction();
	static ScalarFunctionSet GetKeysFunction();
	static ScalarFunctionSet GetTypeFunction();
	static ScalarFunctionSet GetValidFunction();
	static ScalarFunctionSet GetValueFunction();
	static ScalarFunctionSet GetSerializeSqlFunction();
	static ScalarFunctionSet GetDeserializeSqlFunction();
	static ScalarFunctionSet GetSerializePlanFunction();

	static ScalarFunctionSet GetPrettyPrintFunction();

	static PragmaFunctionSet GetExecuteJsonSerializedSqlPragmaFunction();

	template <class FUNCTION_INFO>
	static void AddAliases(const vector<string> &names, FUNCTION_INFO fun, vector<FUNCTION_INFO> &functions) {
		for (auto &name : names) {
			fun.name = name;
			functions.push_back(fun);
		}
	}

private:
	// Table functions
	static TableFunctionSet GetReadJSONObjectsFunction();
	static TableFunctionSet GetReadNDJSONObjectsFunction();
	static TableFunctionSet GetReadJSONObjectsAutoFunction();

	static TableFunctionSet GetReadJSONFunction();
	static TableFunctionSet GetReadNDJSONFunction();
	static TableFunctionSet GetReadJSONAutoFunction();
	static TableFunctionSet GetReadNDJSONAutoFunction();

	static TableFunctionSet GetJSONEachFunction();
	static TableFunctionSet GetJSONTreeFunction();

	static TableFunctionSet GetExecuteJsonSerializedSqlFunction();
};

} // namespace duckdb
54 external/duckdb/extension/json/include/json_multi_file_info.hpp vendored Normal file
@@ -0,0 +1,54 @@
//===----------------------------------------------------------------------===//
//                         DuckDB
//
// json_multi_file_info.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "duckdb/common/multi_file/multi_file_function.hpp"
#include "json_reader_options.hpp"

namespace duckdb {

class JSONFileReaderOptions : public BaseFileReaderOptions {
public:
	JSONReaderOptions options;
};

struct JSONMultiFileInfo : MultiFileReaderInterface {
	static unique_ptr<MultiFileReaderInterface> CreateInterface(ClientContext &context);

	unique_ptr<BaseFileReaderOptions> InitializeOptions(ClientContext &context,
	                                                    optional_ptr<TableFunctionInfo> info) override;
	bool ParseCopyOption(ClientContext &context, const string &key, const vector<Value> &values,
	                     BaseFileReaderOptions &options, vector<string> &expected_names,
	                     vector<LogicalType> &expected_types) override;
	bool ParseOption(ClientContext &context, const string &key, const Value &val, MultiFileOptions &file_options,
	                 BaseFileReaderOptions &options) override;
	void FinalizeCopyBind(ClientContext &context, BaseFileReaderOptions &options, const vector<string> &expected_names,
	                      const vector<LogicalType> &expected_types) override;
	unique_ptr<TableFunctionData> InitializeBindData(MultiFileBindData &multi_file_data,
	                                                 unique_ptr<BaseFileReaderOptions> options) override;
	void BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
	                MultiFileBindData &bind_data) override;
	optional_idx MaxThreads(const MultiFileBindData &bind_data, const MultiFileGlobalState &global_state,
	                        FileExpandResult expand_result) override;
	unique_ptr<GlobalTableFunctionState> InitializeGlobalState(ClientContext &context, MultiFileBindData &bind_data,
	                                                           MultiFileGlobalState &global_state) override;
	unique_ptr<LocalTableFunctionState> InitializeLocalState(ExecutionContext &context,
	                                                         GlobalTableFunctionState &global_state) override;
	shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
	                                        BaseUnionData &union_data, const MultiFileBindData &bind_data_p) override;
	shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
	                                        const OpenFileInfo &file, idx_t file_idx,
	                                        const MultiFileBindData &bind_data) override;
	void FinishReading(ClientContext &context, GlobalTableFunctionState &global_state,
	                   LocalTableFunctionState &local_state) override;
	unique_ptr<NodeStatistics> GetCardinality(const MultiFileBindData &bind_data, idx_t file_count) override;
	FileGlobInput GetGlobInput() override;
};

} // namespace duckdb
302 external/duckdb/extension/json/include/json_reader.hpp vendored Normal file
@@ -0,0 +1,302 @@
//===----------------------------------------------------------------------===//
//                         DuckDB
//
// json_reader.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "duckdb/common/atomic.hpp"
#include "duckdb/common/enum_util.hpp"
#include "duckdb/common/enums/file_compression_type.hpp"
#include "duckdb/common/file_system.hpp"
#include "duckdb/common/multi_file/base_file_reader.hpp"
#include "duckdb/common/multi_file/multi_file_reader.hpp"
#include "json_reader_options.hpp"
#include "duckdb/common/mutex.hpp"
#include "json_common.hpp"
#include "json_enums.hpp"

namespace duckdb {
struct JSONScanGlobalState;
class JSONReader;

struct JSONBufferHandle {
public:
	JSONBufferHandle(JSONReader &reader, idx_t buffer_index, idx_t readers, AllocatedData &&buffer, idx_t buffer_size,
	                 idx_t buffer_start);

public:
	//! The reader this buffer comes from
	JSONReader &reader;
	//! Buffer index (within same file)
	const idx_t buffer_index;

	//! Number of readers for this buffer
	atomic<idx_t> readers;
	//! The buffer
	AllocatedData buffer;
	//! The size of the data in the buffer (can be less than buffer.GetSize())
	const idx_t buffer_size;
	//! The start position in the buffer
	idx_t buffer_start;
};

struct JSONFileHandle {
public:
	JSONFileHandle(QueryContext context, unique_ptr<FileHandle> file_handle, Allocator &allocator);

	bool IsOpen() const;
	void Close();

	void Reset();
	bool RequestedReadsComplete();
	bool LastReadRequested() const;

	idx_t FileSize() const;
	idx_t Remaining() const;

	bool CanSeek() const;
	bool IsPipe() const;

	FileHandle &GetHandle();

	//! The next two functions return whether the read was successful
	bool GetPositionAndSize(idx_t &position, idx_t &size, idx_t requested_size);
	bool Read(char *pointer, idx_t &read_size, idx_t requested_size);
	//! Read at position optionally allows passing a custom handle to read from, otherwise the default one is used
	void ReadAtPosition(char *pointer, idx_t size, idx_t position, optional_ptr<FileHandle> override_handle = nullptr);

private:
	idx_t ReadInternal(char *pointer, const idx_t requested_size);
	idx_t ReadFromCache(char *&pointer, idx_t &size, atomic<idx_t> &position);

private:
	QueryContext context;

	//! The JSON file handle
	unique_ptr<FileHandle> file_handle;
	Allocator &allocator;

	//! File properties
	const bool can_seek;
	const idx_t file_size;

	//! Read properties
	atomic<idx_t> read_position;
	atomic<idx_t> requested_reads;
	atomic<idx_t> actual_reads;
	atomic<bool> last_read_requested;

	//! Cached buffers for resetting when reading stream
	vector<AllocatedData> cached_buffers;
	idx_t cached_size;
};

struct JSONString {
public:
	JSONString() {
	}
	JSONString(const char *pointer_p, idx_t size_p) : pointer(pointer_p), size(size_p) {
	}

	const char *pointer;
	idx_t size;

public:
	string ToString() {
		return string(pointer, size);
	}

	const char &operator[](size_t i) const {
		return pointer[i];
	}
};

enum class JSONFileReadType { SCAN_ENTIRE_FILE, SCAN_PARTIAL };

struct JSONReaderScanState {
	explicit JSONReaderScanState(ClientContext &context, Allocator &global_allocator,
	                             idx_t reconstruct_buffer_capacity);

	FileSystem &fs;
	Allocator &global_allocator;
	//! Thread-local allocator
	JSONAllocator allocator;
	idx_t buffer_capacity;
	bool initialized = false;
	// if we have a buffer already - this is our buffer index
	optional_idx buffer_index;
	//! Whether or not we are scanning the entire file
	//! If we are scanning the entire file we don't share reads between threads and just read the file until we are done
	JSONFileReadType file_read_type = JSONFileReadType::SCAN_PARTIAL;
	// Data for reading (if we have postponed reading)
	//! Buffer (if we have one)
	AllocatedData read_buffer;
	bool needs_to_read = false;
	idx_t request_size;
	idx_t read_position;
	idx_t read_size;
	//! Current scan data
	idx_t scan_count = 0;
	JSONString units[STANDARD_VECTOR_SIZE];
	yyjson_val *values[STANDARD_VECTOR_SIZE];
	optional_ptr<JSONBufferHandle> current_buffer_handle;
	//! Current buffer read info
	optional_ptr<JSONReader> current_reader;
	char *buffer_ptr = nullptr;
	idx_t buffer_size = 0;
	idx_t buffer_offset = 0;
	idx_t prev_buffer_remainder = 0;
	idx_t prev_buffer_offset = 0;
	idx_t lines_or_objects_in_buffer = 0;
	//! Whether this is the first time scanning this buffer
	bool is_first_scan = false;
	//! Whether this is the last batch of the file
	bool is_last = false;
	//! Buffer to reconstruct split values
	optional_idx batch_index;

	//! For some filesystems (e.g. S3), using a filehandle per thread increases performance
	unique_ptr<FileHandle> thread_local_filehandle;

public:
	//! Reset for parsing the next batch of JSON from the current buffer
	void ResetForNextParse();
	//! Reset state for reading the next buffer
	void ResetForNextBuffer();
	//! Clear the buffer handle (if any)
	void ClearBufferHandle();
};

struct JSONError {
	idx_t buf_index;
	idx_t line_or_object_in_buf;
	string error_msg;
};

class JSONReader : public BaseFileReader {
public:
	JSONReader(ClientContext &context, JSONReaderOptions options, OpenFileInfo file);

	void OpenJSONFile();
	void CloseHandle();
	void Reset();

	bool HasFileHandle() const;
	bool IsOpen() const;
	bool IsInitialized() const {
		return initialized;
	}

	JSONReaderOptions &GetOptions();

	JSONFormat GetFormat() const;
	void SetFormat(JSONFormat format);

	JSONRecordType GetRecordType() const;
	void SetRecordType(JSONRecordType type);

	const string &GetFileName() const;
	JSONFileHandle &GetFileHandle() const;

public:
	string GetReaderType() const override {
		return "JSON";
	}

	void PrepareReader(ClientContext &context, GlobalTableFunctionState &) override;
	bool TryInitializeScan(ClientContext &context, GlobalTableFunctionState &gstate,
	                       LocalTableFunctionState &lstate) override;
	void Scan(ClientContext &context, GlobalTableFunctionState &global_state, LocalTableFunctionState &local_state,
	          DataChunk &chunk) override;
	void FinishFile(ClientContext &context, GlobalTableFunctionState &gstate_p) override;
	double GetProgressInFile(ClientContext &context) override;

public:
	//! Get a new buffer index (must hold the lock)
	idx_t GetBufferIndex();
	//! Set line count for a buffer that is done (grabs the lock)
	void SetBufferLineOrObjectCount(JSONBufferHandle &handle, idx_t count);
	//! Records a parse error in the specified buffer
	void AddParseError(JSONReaderScanState &scan_state, idx_t line_or_object_in_buf, yyjson_read_err &err,
	                   const string &extra = "");
	//! Records a transform error in the specified buffer
	void AddTransformError(JSONReaderScanState &scan_state, idx_t object_index, const string &error_message);
	//! Whether this reader has thrown if an error has occurred
	bool HasThrown();

	void Initialize(Allocator &allocator, idx_t buffer_size);
	bool InitializeScan(JSONReaderScanState &state, JSONFileReadType file_read_type);
	void ParseJSON(JSONReaderScanState &scan_state, char *const json_start, const idx_t json_size,
	               const idx_t remaining);
	void ParseNextChunk(JSONReaderScanState &scan_state);
	idx_t Scan(JSONReaderScanState &scan_state);
	bool ReadNextBuffer(JSONReaderScanState &scan_state);
	bool PrepareBufferForRead(JSONReaderScanState &scan_state);

	//! Scan progress
	double GetProgress() const;

	void DecrementBufferUsage(JSONBufferHandle &handle, idx_t lines_or_object_in_buffer, AllocatedData &buffer);

private:
	void SkipOverArrayStart(JSONReaderScanState &scan_state);
	void AutoDetect(Allocator &allocator, idx_t buffer_size);
	bool CopyRemainderFromPreviousBuffer(JSONReaderScanState &scan_state);
	void FinalizeBufferInternal(JSONReaderScanState &scan_state, AllocatedData &buffer, idx_t buffer_index);
	void PrepareForReadInternal(JSONReaderScanState &scan_state);
	void PrepareForScan(JSONReaderScanState &scan_state);
	bool PrepareBufferSeek(JSONReaderScanState &scan_state);
	void ReadNextBufferSeek(JSONReaderScanState &scan_state);
	bool ReadNextBufferNoSeek(JSONReaderScanState &scan_state);
	void FinalizeBuffer(JSONReaderScanState &scan_state);

	//! Insert/get/remove buffer (grabs the lock)
	void InsertBuffer(idx_t buffer_idx, unique_ptr<JSONBufferHandle> &&buffer);
	optional_ptr<JSONBufferHandle> GetBuffer(idx_t buffer_idx);
	AllocatedData RemoveBuffer(JSONBufferHandle &handle);

	void ThrowObjectSizeError(const idx_t object_size);

private:
	//! Add an error to the buffer - requires the lock to be held
	void AddError(idx_t buf_index, idx_t line_or_object_in_buf, const string &error_msg);
	//! Throw errors if possible - requires the lock to be held
	void ThrowErrorsIfPossible();
	//! Try to get the line number - requires the lock to be held
	optional_idx TryGetLineNumber(idx_t buf_index, idx_t line_or_object_in_buf);

private:
	ClientContext &context;
	JSONReaderOptions options;

	//! File handle
	unique_ptr<JSONFileHandle> file_handle;

	//! Whether or not the reader has been initialized
	bool initialized;
	//! Next buffer index within the file
	idx_t next_buffer_index;
	//! Mapping from batch index to currently held buffers
	unordered_map<idx_t, unique_ptr<JSONBufferHandle>> buffer_map;

	//! Line count per buffer
	vector<int64_t> buffer_line_or_object_counts;
	//! Whether any of the reading threads has thrown an error
	bool thrown;

	//! If we have auto-detected, this is the buffer read by the auto-detection
	AllocatedData auto_detect_data;
	idx_t auto_detect_data_size = 0;

	//! The first error we found in the file (if any)
	unique_ptr<JSONError> error;

public:
	mutable mutex lock;
};

} // namespace duckdb
129 external/duckdb/extension/json/include/json_reader_options.hpp vendored Normal file
@@ -0,0 +1,129 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// json_reader_options.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "json_common.hpp"
#include "json_enums.hpp"
#include "duckdb/common/types/type_map.hpp"
#include "duckdb/function/scalar/strftime_format.hpp"

namespace duckdb {

struct DateFormatMap {
    friend class MutableDateFormatMap;

public:
    explicit DateFormatMap(type_id_map_t<vector<StrpTimeFormat>> candidate_formats_p)
        : candidate_formats(std::move(candidate_formats_p)) {
    }

    bool HasFormats(LogicalTypeId type) const {
        return HasFormats(candidate_formats, type);
    }

    const StrpTimeFormat &GetFormat(LogicalTypeId type) const {
        D_ASSERT(candidate_formats.find(type) != candidate_formats.end());
        return candidate_formats.find(type)->second.back();
    }

public:
    static void AddFormat(type_id_map_t<vector<StrpTimeFormat>> &candidate_formats, LogicalTypeId type,
                          const string &format_string) {
        auto &formats = candidate_formats[type];
        formats.emplace_back();
        formats.back().format_specifier = format_string;
        StrpTimeFormat::ParseFormatSpecifier(formats.back().format_specifier, formats.back());
    }

    static bool HasFormats(const type_id_map_t<vector<StrpTimeFormat>> &candidate_formats, LogicalTypeId type) {
        return candidate_formats.find(type) != candidate_formats.end();
    }

private:
    type_id_map_t<vector<StrpTimeFormat>> candidate_formats;
};

class MutableDateFormatMap {
public:
    explicit MutableDateFormatMap(DateFormatMap &date_format_map) : date_format_map(date_format_map) {
    }

    bool HasFormats(LogicalTypeId type) {
        lock_guard<mutex> lock(format_lock);
        return date_format_map.HasFormats(type);
    }

    idx_t NumberOfFormats(LogicalTypeId type) {
        lock_guard<mutex> lock(format_lock);
        return date_format_map.candidate_formats.at(type).size();
    }

    bool GetFormatAtIndex(LogicalTypeId type, idx_t index, StrpTimeFormat &format) {
        lock_guard<mutex> lock(format_lock);
        auto &formats = date_format_map.candidate_formats.at(type);
        if (index >= formats.size()) {
            return false;
        }
        format = formats[index];
        return true;
    }

    void ShrinkFormatsToSize(LogicalTypeId type, idx_t size) {
        lock_guard<mutex> lock(format_lock);
        auto &formats = date_format_map.candidate_formats[type];
        while (formats.size() > size) {
            formats.pop_back();
        }
    }

private:
    mutex format_lock;
    DateFormatMap &date_format_map;
};

struct JSONReaderOptions {
    //! Scan type
    JSONScanType type = JSONScanType::READ_JSON;
    //! The format of the JSON
    JSONFormat format = JSONFormat::AUTO_DETECT;
    //! The record type in the JSON
    JSONRecordType record_type = JSONRecordType::AUTO_DETECT;
    //! Whether the file is compressed, and if so, which compression type
    FileCompressionType compression = FileCompressionType::AUTO_DETECT;
    //! Whether or not we should ignore malformed JSON (default to NULL)
    bool ignore_errors = false;
    //! Maximum JSON object size (defaults to a 16MB minimum)
    idx_t maximum_object_size = 16777216;
    //! Whether we auto-detect a schema
    bool auto_detect = false;
    //! Sample size for detecting schema
    idx_t sample_size = idx_t(STANDARD_VECTOR_SIZE) * 10;
    //! Max depth we go to detect nested JSON schema (defaults to unlimited)
    idx_t max_depth = NumericLimits<idx_t>::Maximum();
    //! We divide the number of appearances of each JSON field by the auto-detection sample size
    //! If the average over the fields of an object is less than this threshold,
    //! we default to the MAP type with value type of merged field types
    double field_appearance_threshold = 0.1;
    //! The maximum number of files we sample to sample sample_size rows
    idx_t maximum_sample_files = 32;
    //! Whether we auto-detect and convert JSON strings to integers
    bool convert_strings_to_integers = false;
    //! If a struct contains more fields than this threshold with at least 80% similar types,
    //! we infer it as a MAP type
    idx_t map_inference_threshold = 200;
    //! User-provided list of names (in order)
    vector<string> name_list;
    //! User-provided list of types (in order)
    vector<LogicalType> sql_type_list;
    //! Forced date/timestamp formats
    string date_format;
    string timestamp_format;
};

} // namespace duckdb
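Aside (not part of the commit): a minimal sketch of how these two classes fit together. Candidate formats are registered per logical type through the static AddFormat helper, and scan threads then query or prune them through the lock-guarded MutableDateFormatMap wrapper. The format strings below are illustrative.

// Sketch only; assumes the declarations from json_reader_options.hpp above.
type_id_map_t<vector<StrpTimeFormat>> candidates;
DateFormatMap::AddFormat(candidates, LogicalTypeId::DATE, "%Y-%m-%d");
DateFormatMap::AddFormat(candidates, LogicalTypeId::TIMESTAMP, "%Y-%m-%dT%H:%M:%S");

DateFormatMap format_map(std::move(candidates));
MutableDateFormatMap shared_map(format_map); // serializes access across scan threads

StrpTimeFormat format;
if (shared_map.GetFormatAtIndex(LogicalTypeId::DATE, 0, format)) {
    // try parsing a JSON string as a DATE with 'format'
}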
148
external/duckdb/extension/json/include/json_scan.hpp
vendored
Normal file
@@ -0,0 +1,148 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// json_scan.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "json_reader.hpp"
#include "duckdb/common/multi_file/multi_file_reader.hpp"
#include "duckdb/common/mutex.hpp"
#include "duckdb/common/pair.hpp"
#include "duckdb/common/types/type_map.hpp"
#include "duckdb/function/scalar/strftime_format.hpp"
#include "duckdb/function/table_function.hpp"
#include "json_enums.hpp"
#include "json_transform.hpp"
#include "json_reader_options.hpp"

namespace duckdb {

struct JSONScanData : public TableFunctionData {
public:
    JSONScanData();

    void InitializeFormats();
    void InitializeFormats(bool auto_detect);

public:
    //! JSON reader options
    JSONReaderOptions options;

    //! The set of keys to extract (case sensitive)
    vector<string> key_names;

    //! The date format map
    unique_ptr<DateFormatMap> date_format_map;
    //! Options when transforming the JSON to columnar data
    JSONTransformOptions transform_options;

    optional_idx max_threads;
    optional_idx estimated_cardinality_per_file;
};

struct JSONScanInfo : public TableFunctionInfo {
public:
    explicit JSONScanInfo(JSONScanType type_p = JSONScanType::INVALID, JSONFormat format_p = JSONFormat::AUTO_DETECT,
                          JSONRecordType record_type_p = JSONRecordType::AUTO_DETECT, bool auto_detect_p = false)
        : type(type_p), format(format_p), record_type(record_type_p), auto_detect(auto_detect_p) {
    }

    JSONScanType type;
    JSONFormat format;
    JSONRecordType record_type;
    bool auto_detect;
};

struct JSONScanGlobalState {
public:
    JSONScanGlobalState(ClientContext &context, const MultiFileBindData &bind_data);

public:
    //! Bound data
    const MultiFileBindData &bind_data;
    const JSONScanData &json_data;
    //! Options when transforming the JSON to columnar data
    JSONTransformOptions transform_options;

    //! Column names that we're actually reading (after projection pushdown)
    vector<string> names;
    vector<column_t> column_ids;
    vector<ColumnIndex> column_indices;

    //! Buffer manager allocator
    Allocator &allocator;
    //! The current buffer capacity
    idx_t buffer_capacity;

    //! Current number of threads active
    idx_t system_threads;
    //! Whether we enable parallel scans (only if there are fewer files than threads)
    bool enable_parallel_scans;

    bool file_is_assigned = false;
    bool initialized = false;
};

struct JSONScanLocalState {
public:
    JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate);

public:
    idx_t Read();
    void AddTransformError(idx_t object_index, const string &error_message);

    JSONReaderScanState &GetScanState() {
        return scan_state;
    }

    const JSONReaderScanState &GetScanState() const {
        return scan_state;
    }

    bool TryInitializeScan(JSONScanGlobalState &gstate, JSONReader &reader);

public:
    //! Options when transforming the JSON to columnar data
    JSONTransformOptions transform_options;

private:
    void ParseJSON(char *const json_start, const idx_t json_size, const idx_t remaining);

private:
    //! Scan state
    JSONReaderScanState scan_state;
};

struct JSONGlobalTableFunctionState : public GlobalTableFunctionState {
public:
    JSONGlobalTableFunctionState(ClientContext &context, const MultiFileBindData &bind_data);

public:
    JSONScanGlobalState state;
};

struct JSONLocalTableFunctionState : public LocalTableFunctionState {
public:
    JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate);

public:
    JSONScanLocalState state;
};

struct JSONScan {
public:
    static void AutoDetect(ClientContext &context, MultiFileBindData &bind_data, vector<LogicalType> &return_types,
                           vector<string> &names);

    static void Serialize(Serializer &serializer, const optional_ptr<FunctionData> bind_data,
                          const TableFunction &function);
    static unique_ptr<FunctionData> Deserialize(Deserializer &deserializer, TableFunction &function);

    static void TableFunctionDefaults(TableFunction &table_function);
};

} // namespace duckdb
97
external/duckdb/extension/json/include/json_serializer.hpp
vendored
Normal file
@@ -0,0 +1,97 @@
#pragma once

#include "json_common.hpp"
#include "duckdb/common/serializer/serializer.hpp"

namespace duckdb {

struct JsonSerializer : Serializer {
private:
    yyjson_mut_doc *doc;
    yyjson_mut_val *current_tag;
    vector<yyjson_mut_val *> stack;

    // Skip writing property if null
    bool skip_if_null = false;
    // Skip writing property if empty string, empty list or empty map.
    bool skip_if_empty = false;

    // Get the current json value
    inline yyjson_mut_val *Current() {
        return stack.back();
    };

    // Either adds a value to the current object with the current tag, or appends it to the current array
    void PushValue(yyjson_mut_val *val);

public:
    explicit JsonSerializer(yyjson_mut_doc *doc, bool skip_if_null, bool skip_if_empty, bool skip_if_default)
        : doc(doc), stack({yyjson_mut_obj(doc)}), skip_if_null(skip_if_null), skip_if_empty(skip_if_empty) {
        options.serialize_enum_as_string = true;
        options.serialize_default_values = !skip_if_default;
    }

    template <class T>
    static yyjson_mut_val *Serialize(T &value, yyjson_mut_doc *doc, bool skip_if_null, bool skip_if_empty,
                                     bool skip_if_default) {
        JsonSerializer serializer(doc, skip_if_null, skip_if_empty, skip_if_default);
        value.Serialize(serializer);
        return serializer.GetRootObject();
    }

    template <class T>
    static string SerializeToString(T &value) {
        auto doc = yyjson_mut_doc_new(nullptr);
        JsonSerializer serializer(doc, false, false, false);
        value.Serialize(serializer);
        auto result_obj = serializer.GetRootObject();
        idx_t len = 0;
        auto data = yyjson_mut_val_write_opts(result_obj, JSONCommon::WRITE_PRETTY_FLAG, nullptr,
                                              reinterpret_cast<size_t *>(&len), nullptr);
        return string(data, len);
    }

    yyjson_mut_val *GetRootObject() {
        D_ASSERT(stack.size() == 1); // or we forgot to pop somewhere
        return stack.front();
    };

    //===--------------------------------------------------------------------===//
    // Nested Types Hooks
    //===--------------------------------------------------------------------===//
    void OnPropertyBegin(const field_id_t field_id, const char *tag) final;
    void OnPropertyEnd() final;
    void OnOptionalPropertyBegin(const field_id_t field_id, const char *tag, bool present) final;
    void OnOptionalPropertyEnd(bool present) final;

    void OnListBegin(idx_t count) final;
    void OnListEnd() final;
    void OnObjectBegin() final;
    void OnObjectEnd() final;
    void OnNullableBegin(bool present) final;
    void OnNullableEnd() final;

    //===--------------------------------------------------------------------===//
    // Primitive Types
    //===--------------------------------------------------------------------===//
    void WriteNull() final;
    void WriteValue(uint8_t value) final;
    void WriteValue(int8_t value) final;
    void WriteValue(uint16_t value) final;
    void WriteValue(int16_t value) final;
    void WriteValue(uint32_t value) final;
    void WriteValue(int32_t value) final;
    void WriteValue(uint64_t value) final;
    void WriteValue(int64_t value) final;
    void WriteValue(hugeint_t value) final;
    void WriteValue(uhugeint_t value) final;
    void WriteValue(float value) final;
    void WriteValue(double value) final;
    void WriteValue(const string_t value) final;
    void WriteValue(const string &value) final;
    void WriteValue(const char *value) final;
    void WriteValue(bool value) final;
    void WriteDataPtr(const_data_ptr_t ptr, idx_t count) final;
};

} // namespace duckdb
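Aside (not part of the commit): any type exposing a Serialize(Serializer &) method can be rendered to pretty-printed JSON through the static helper above. A minimal sketch, where 'value' stands for any such type:

// Sketch only: builds a mutable yyjson document, walks 'value' through the
// Serializer hooks declared above, and pretty-prints the root object.
string json = JsonSerializer::SerializeToString(value);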
94
external/duckdb/extension/json/include/json_structure.hpp
vendored
Normal file
@@ -0,0 +1,94 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// json_structure.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "json_common.hpp"

namespace duckdb {

struct JSONStructureDescription;
struct DateFormatMap;
struct StrpTimeFormat;
class MutableDateFormatMap;

struct JSONStructureNode {
public:
    JSONStructureNode();
    JSONStructureNode(const char *key_ptr, const size_t key_len);
    JSONStructureNode(yyjson_val *key_p, yyjson_val *val_p, bool ignore_errors);

    //! Disable copy constructors
    JSONStructureNode(const JSONStructureNode &other) = delete;
    JSONStructureNode &operator=(const JSONStructureNode &) = delete;
    //! Enable move constructors
    JSONStructureNode(JSONStructureNode &&other) noexcept;
    JSONStructureNode &operator=(JSONStructureNode &&) noexcept;

    JSONStructureDescription &GetOrCreateDescription(LogicalTypeId type);

    bool ContainsVarchar() const;
    void InitializeCandidateTypes(idx_t max_depth, bool convert_strings_to_integers, idx_t depth = 0);
    void RefineCandidateTypes(yyjson_val *vals[], idx_t val_count, Vector &string_vector, ArenaAllocator &allocator,
                              MutableDateFormatMap &date_format_map);

private:
    void RefineCandidateTypesArray(yyjson_val *vals[], idx_t val_count, Vector &string_vector,
                                   ArenaAllocator &allocator, MutableDateFormatMap &date_format_map);
    void RefineCandidateTypesObject(yyjson_val *vals[], idx_t val_count, Vector &string_vector,
                                    ArenaAllocator &allocator, MutableDateFormatMap &date_format_map);
    void RefineCandidateTypesString(yyjson_val *vals[], idx_t val_count, Vector &string_vector,
                                    MutableDateFormatMap &date_format_map);
    void EliminateCandidateTypes(idx_t vec_count, Vector &string_vector, MutableDateFormatMap &date_format_map);
    bool EliminateCandidateFormats(idx_t vec_count, Vector &string_vector, const Vector &result_vector,
                                   MutableDateFormatMap &date_format_map);

public:
    unique_ptr<string> key;
    bool initialized = false;
    vector<JSONStructureDescription> descriptions;
    idx_t count;
    idx_t null_count;
};

struct JSONStructureDescription {
public:
    explicit JSONStructureDescription(LogicalTypeId type_p);
    //! Disable copy constructors
    JSONStructureDescription(const JSONStructureDescription &other) = delete;
    JSONStructureDescription &operator=(const JSONStructureDescription &) = delete;
    //! Enable move constructors
    JSONStructureDescription(JSONStructureDescription &&other) noexcept;
    JSONStructureDescription &operator=(JSONStructureDescription &&) noexcept;

    JSONStructureNode &GetOrCreateChild();
    JSONStructureNode &GetOrCreateChild(const char *key_ptr, size_t key_size);
    JSONStructureNode &GetOrCreateChild(yyjson_val *key, yyjson_val *val, bool ignore_errors);

public:
    //! Type of this description
    LogicalTypeId type = LogicalTypeId::INVALID;

    //! Map from child keys to their index in children, and the children themselves
    json_key_map_t<idx_t> key_map;
    vector<JSONStructureNode> children;

    //! Candidate types (if auto-detecting and type == LogicalTypeId::VARCHAR)
    vector<LogicalTypeId> candidate_types;
};

struct JSONStructure {
public:
    static void ExtractStructure(yyjson_val *val, JSONStructureNode &node, bool ignore_errors);
    static void MergeNodes(JSONStructureNode &merged, const JSONStructureNode &node);
    static LogicalType StructureToType(ClientContext &context, const JSONStructureNode &node, idx_t max_depth,
                                       double field_appearance_threshold, idx_t map_inference_threshold,
                                       idx_t depth = 0, const LogicalType &null_type = LogicalType::JSON());
};

} // namespace duckdb
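Aside (not part of the commit): schema auto-detection builds one JSONStructureNode per sampled document, merges them, and only then converts the result to a DuckDB type. A hedged sketch of that flow; 'docs', 'context' and 'options' are assumed to be in scope:

// Sketch only: 'docs' holds yyjson roots of the sampled JSON values.
JSONStructureNode merged;
for (yyjson_val *root : docs) {
    JSONStructureNode node;
    JSONStructure::ExtractStructure(root, node, /*ignore_errors=*/false);
    JSONStructure::MergeNodes(merged, node);
}
// The merged structure becomes a LogicalType; sparse objects fall back to MAP
// depending on field_appearance_threshold and map_inference_threshold.
auto type = JSONStructure::StructureToType(context, merged, options.max_depth,
                                           options.field_appearance_threshold,
                                           options.map_inference_threshold);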
79
external/duckdb/extension/json/include/json_transform.hpp
vendored
Normal file
@@ -0,0 +1,79 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// json_transform.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "duckdb/common/column_index.hpp"
#include "duckdb/common/optional_ptr.hpp"
#include "duckdb/function/scalar/strftime_format.hpp"
#include "json_common.hpp"

namespace duckdb {

struct DateFormatMap;
class JSONReader;

//! Options for error handling while transforming JSON
struct JSONTransformOptions {
public:
    JSONTransformOptions();
    JSONTransformOptions(bool strict_cast, bool error_duplicate_key, bool error_missing_key, bool error_unknown_key);

public:
    //! Throws an error if the cast doesn't work (instead of NULL-ing it)
    bool strict_cast = false;
    //! Throws an error if there is a duplicate key (instead of ignoring it)
    bool error_duplicate_key = false;
    //! Throws an error if a key is missing (instead of NULL-ing it)
    bool error_missing_key = false;
    //! Throws an error if an object has a key we didn't know about
    bool error_unknown_key = false;

    //! Whether to delay the error when transforming (e.g., when non-strict casting or reading from a file)
    bool delay_error = false;
    //! Date format used for parsing (can be NULL)
    optional_ptr<const DateFormatMap> date_format_map = nullptr;
    //! String to store errors in
    string error_message;
    //! Index of the object where the error occurred
    idx_t object_index = DConstants::INVALID_INDEX;
    //! Cast parameters
    CastParameters parameters;

public:
    void Serialize(Serializer &serializer) const;
    static JSONTransformOptions Deserialize(Deserializer &deserializer);
};

struct TryParseDate {
    template <class T>
    static inline bool Operation(const StrpTimeFormat &format, const string_t &input, T &result,
                                 string &error_message) {
        return format.TryParseDate(input, result, error_message);
    }
};

struct TryParseTimeStamp {
    template <class T>
    static inline bool Operation(const StrpTimeFormat &format, const string_t &input, T &result,
                                 string &error_message) {
        return format.TryParseTimestamp(input, result, error_message);
    }
};

struct JSONTransform {
    static bool Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count,
                          JSONTransformOptions &options, optional_ptr<const ColumnIndex> column_index);
    static bool TransformObject(yyjson_val *objects[], yyjson_alc *alc, const idx_t count, const vector<string> &names,
                                const vector<Vector *> &result_vectors, JSONTransformOptions &options,
                                optional_ptr<const vector<ColumnIndex>> column_indices, bool error_unknown_key);
    static bool GetStringVector(yyjson_val *vals[], const idx_t count, const LogicalType &target, Vector &string_vector,
                                JSONTransformOptions &options);
};

} // namespace duckdb
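Aside (not part of the commit): the four boolean flags are what separate the lax transform (failures become NULL) from the strict one (failures throw); json_functions.cpp below registers both variants. A sketch of a strict configuration:

// Sketch only: all four flags on, so bad casts, duplicate keys,
// missing keys and unknown keys all raise errors.
JSONTransformOptions options(/*strict_cast=*/true, /*error_duplicate_key=*/true,
                             /*error_missing_key=*/true, /*error_unknown_key=*/true);
// When reading from a file, delaying the error lets the reader attach
// file and line information before throwing.
options.delay_error = true;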
393
external/duckdb/extension/json/json_common.cpp
vendored
Normal file
@@ -0,0 +1,393 @@
#include "json_common.hpp"

#include "duckdb/common/exception/binder_exception.hpp"

namespace duckdb {

using JSONPathType = JSONCommon::JSONPathType;

string JSONCommon::ValToString(yyjson_val *val, idx_t max_len) {
    JSONAllocator json_allocator(Allocator::DefaultAllocator());
    idx_t len;
    auto data = JSONCommon::WriteVal<yyjson_val>(val, json_allocator.GetYYAlc(), len);
    if (max_len < len) {
        return string(data, max_len) + "...";
    } else {
        return string(data, len);
    }
}

void JSONCommon::ThrowValFormatError(string error_string, yyjson_val *val) {
    error_string = StringUtil::Format(error_string, JSONCommon::ValToString(val));
    throw InvalidInputException(error_string);
}

string ThrowPathError(const char *ptr, const char *end, const bool binder) {
    ptr--;
    auto msg = StringUtil::Format("JSON path error near '%s'", string(ptr, end - ptr));
    if (binder) {
        throw BinderException(msg);
    } else {
        throw InvalidInputException(msg);
    }
}

struct JSONKeyReadResult {
public:
    static inline JSONKeyReadResult Empty() {
        return {idx_t(0), false, string()};
    }

    static inline JSONKeyReadResult WildCard() {
        return {1, false, "*"};
    }

    static inline JSONKeyReadResult RecWildCard() {
        return {2, true, "*"};
    }

    static inline JSONKeyReadResult RecWildCardShortcut() {
        return {1, true, "*"};
    }

    inline bool IsValid() {
        return (chars_read != 0);
    }

    inline bool IsWildCard() {
        return key == "*";
    }

public:
    idx_t chars_read;
    bool recursive;
    string key;
};

static inline JSONKeyReadResult ReadString(const char *ptr, const char *const end, const bool escaped) {
    const char *const before = ptr;
    if (escaped) {
        auto key = make_unsafe_uniq_array_uninitialized<char>(end - ptr);
        idx_t key_len = 0;

        bool backslash = false;
        while (ptr != end) {
            if (backslash) {
                if (*ptr != '"' && *ptr != '\\') {
                    key[key_len++] = '\\';
                }
                backslash = false;
            } else {
                if (*ptr == '"') {
                    break;
                } else if (*ptr == '\\') {
                    backslash = true;
                    ptr++;
                    continue;
                }
            }
            key[key_len++] = *ptr++;
        }
        if (ptr == end || backslash) {
            return JSONKeyReadResult::Empty();
        } else {
            return {idx_t(ptr - before), false, string(key.get(), key_len)};
        }
    } else {
        while (ptr != end) {
            if (*ptr == '.' || *ptr == '[') {
                break;
            }
            ptr++;
        }
        return {idx_t(ptr - before), false, string(before, ptr - before)};
    }
}

static inline idx_t ReadInteger(const char *ptr, const char *const end, idx_t &idx) {
    static constexpr auto IDX_T_SAFE_DIG = 19;
    static constexpr auto IDX_T_MAX = ((idx_t)(~(idx_t)0));

    const char *const before = ptr;
    idx = 0;
    for (idx_t i = 0; i < IDX_T_SAFE_DIG; i++) {
        if (ptr == end) {
            // No closing ']'
            return 0;
        }
        if (*ptr == ']') {
            break;
        }
        uint8_t add = (uint8_t)(*ptr - '0');
        if (add <= 9) {
            idx = add + idx * 10;
        } else {
            // Not a digit
            return 0;
        }
        ptr++;
    }
    // Invalid if overflow
    return idx >= (idx_t)IDX_T_MAX ? 0 : ptr - before;
}

static inline JSONKeyReadResult ReadKey(const char *ptr, const char *const end) {
    D_ASSERT(ptr != end);
    if (*ptr == '*') { // Wildcard
        if (*(ptr + 1) == '*') {
            return JSONKeyReadResult::RecWildCard();
        }
        return JSONKeyReadResult::WildCard();
    }
    bool recursive = false;
    if (*ptr == '.') {
        char next = *(ptr + 1);
        if (next == '*') {
            return JSONKeyReadResult::RecWildCard();
        }
        if (next == '[') {
            return JSONKeyReadResult::RecWildCardShortcut();
        }
        ptr++;
        recursive = true;
    }
    bool escaped = false;
    if (*ptr == '"') {
        ptr++; // Skip past opening '"'
        escaped = true;
    }
    auto result = ReadString(ptr, end, escaped);
    if (!result.IsValid()) {
        return result;
    }
    if (escaped) {
        result.chars_read += 2; // Account for surrounding quotes
    }
    if (recursive) {
        result.chars_read += 1;
        result.recursive = true;
    }
    return result;
}

static inline bool ReadArrayIndex(const char *&ptr, const char *const end, idx_t &array_index, bool &from_back) {
    D_ASSERT(ptr != end);
    from_back = false;
    if (*ptr == '*') { // Wildcard
        ptr++;
        if (ptr == end || *ptr != ']') {
            return false;
        }
        array_index = DConstants::INVALID_INDEX;
    } else {
        if (*ptr == '#') { // SQLite syntax to index from back of array
            ptr++;         // Skip over '#'
            if (ptr == end) {
                return false;
            }
            if (*ptr == ']') {
                // [#] always returns NULL in SQLite, so we return an array index that will do the same
                array_index = NumericLimits<uint32_t>::Maximum();
                ptr++;
                return true;
            }
            if (*ptr != '-') {
                return false;
            }
            from_back = true;
        }
        if (*ptr == '-') {
            ptr++; // Skip over '-'
            from_back = true;
        }
        auto idx_len = ReadInteger(ptr, end, array_index);
        if (idx_len == 0) {
            return false;
        }
        ptr += idx_len;
    }
    ptr++; // Skip past closing ']'
    return true;
}

JSONPathType JSONCommon::ValidatePath(const char *ptr, const idx_t &len, const bool binder) {
    D_ASSERT(len >= 1 && *ptr == '$');
    JSONPathType path_type = JSONPathType::REGULAR;
    const char *const end = ptr + len;
    ptr++; // Skip past '$'
    while (ptr != end) {
        const auto &c = *ptr++;
        if (ptr == end) {
            ThrowPathError(ptr, end, binder);
        }
        switch (c) {
        case '.': { // Object field
            auto key = ReadKey(ptr, end);
            if (!key.IsValid()) {
                ThrowPathError(ptr, end, binder);
            } else if (key.IsWildCard() || key.recursive) {
                path_type = JSONPathType::WILDCARD;
            }
            ptr += key.chars_read;
            break;
        }
        case '[': { // Array index
            idx_t array_index;
            bool from_back;
            if (!ReadArrayIndex(ptr, end, array_index, from_back)) {
                ThrowPathError(ptr, end, binder);
            }
            if (array_index == DConstants::INVALID_INDEX) {
                path_type = JSONPathType::WILDCARD;
            }
            break;
        }
        default:
            ThrowPathError(ptr, end, binder);
        }
    }
    return path_type;
}

yyjson_val *JSONCommon::GetPath(yyjson_val *val, const char *ptr, const idx_t &len) {
    // Path has been validated at this point
    const char *const end = ptr + len;
    ptr++; // Skip past '$'
    while (val != nullptr && ptr != end) {
        const auto &c = *ptr++;
        D_ASSERT(ptr != end);
        switch (c) {
        case '.': { // Object field
            if (!unsafe_yyjson_is_obj(val)) {
                return nullptr;
            }
            auto key_result = ReadKey(ptr, end);
            D_ASSERT(key_result.IsValid());
            ptr += key_result.chars_read;
            val = yyjson_obj_getn(val, key_result.key.c_str(), key_result.key.size());
            break;
        }
        case '[': { // Array index
            if (!unsafe_yyjson_is_arr(val)) {
                return nullptr;
            }
            idx_t array_index;
            bool from_back;
#ifdef DEBUG
            bool success =
#endif
                ReadArrayIndex(ptr, end, array_index, from_back);
#ifdef DEBUG
            D_ASSERT(success);
#endif
            if (from_back && array_index != 0) {
                array_index = unsafe_yyjson_get_len(val) - array_index;
            }
            val = yyjson_arr_get(val, array_index);
            break;
        }
        default: // LCOV_EXCL_START
            throw InternalException(
                "Invalid JSON Path encountered in JSONCommon::GetPath, call JSONCommon::ValidatePath first!");
        } // LCOV_EXCL_STOP
    }
    return val;
}

void GetWildcardPathInternal(yyjson_val *val, const char *ptr, const char *const end, vector<yyjson_val *> &vals) {
    while (val != nullptr && ptr != end) {
        const auto &c = *ptr++;
        D_ASSERT(ptr != end);
        switch (c) {
        case '.': { // Object field
            auto key_result = ReadKey(ptr, end);
            D_ASSERT(key_result.IsValid());
            if (key_result.recursive) {
                if (key_result.IsWildCard()) {
                    ptr += key_result.chars_read;
                }
                vector<yyjson_val *> rec_vals;
                rec_vals.emplace_back(val);
                for (idx_t i = 0; i < rec_vals.size(); i++) {
                    yyjson_val *rec_val = rec_vals[i];
                    if (yyjson_is_arr(rec_val)) {
                        size_t idx, max;
                        yyjson_val *element;
                        yyjson_arr_foreach(rec_val, idx, max, element) {
                            rec_vals.emplace_back(element);
                        }
                    } else if (yyjson_is_obj(rec_val)) {
                        size_t idx, max;
                        yyjson_val *key, *element;
                        yyjson_obj_foreach(rec_val, idx, max, key, element) {
                            rec_vals.emplace_back(element);
                        }
                    }
                    if (i > 0 || ptr != end) {
                        GetWildcardPathInternal(rec_val, ptr, end, vals);
                    }
                }
                return;
            }
            ptr += key_result.chars_read;
            if (!unsafe_yyjson_is_obj(val)) {
                return;
            }
            if (key_result.IsWildCard()) { // Wildcard
                size_t idx, max;
                yyjson_val *key, *obj_val;
                yyjson_obj_foreach(val, idx, max, key, obj_val) {
                    GetWildcardPathInternal(obj_val, ptr, end, vals);
                }
                return;
            }
            val = yyjson_obj_getn(val, key_result.key.c_str(), key_result.key.size());
            break;
        }
        case '[': { // Array index
            if (!unsafe_yyjson_is_arr(val)) {
                return;
            }
            idx_t array_index;
            bool from_back;
#ifdef DEBUG
            bool success =
#endif
                ReadArrayIndex(ptr, end, array_index, from_back);
#ifdef DEBUG
            D_ASSERT(success);
#endif

            if (array_index == DConstants::INVALID_INDEX) { // Wildcard
                size_t idx, max;
                yyjson_val *arr_val;
                yyjson_arr_foreach(val, idx, max, arr_val) {
                    GetWildcardPathInternal(arr_val, ptr, end, vals);
                }
                return;
            }
            if (from_back && array_index != 0) {
                array_index = unsafe_yyjson_get_len(val) - array_index;
            }
            val = yyjson_arr_get(val, array_index);
            break;
        }
        default: // LCOV_EXCL_START
            throw InternalException(
                "Invalid JSON Path encountered in GetWildcardPathInternal, call JSONCommon::ValidatePath first!");
        } // LCOV_EXCL_STOP
    }
    if (val != nullptr) {
        vals.emplace_back(val);
    }
    return;
}

void JSONCommon::GetWildcardPath(yyjson_val *val, const char *ptr, const idx_t &len, vector<yyjson_val *> &vals) {
    // Path has been validated at this point
    const char *const end = ptr + len;
    ptr++; // Skip past '$'
    GetWildcardPathInternal(val, ptr, end, vals);
}

} // namespace duckdb
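Aside (not part of the commit): the path grammar implemented above accepts $.key, $."quoted key", $[i], $[-i] and $[#-i] (indexing from the back, SQLite-style), plus the wildcards * and ** which flip the path type to WILDCARD. A sketch of the intended call sequence; 'root' is assumed to be a parsed yyjson document root:

// Sketch only.
string path = "$.items[#-1].name"; // 'name' field of the last element of 'items'
auto path_type = JSONCommon::ValidatePath(path.c_str(), path.size(), /*binder=*/false);
if (path_type == JSONCommon::JSONPathType::REGULAR) {
    // A regular path resolves to at most one value (nullptr if absent).
    yyjson_val *hit = JSONCommon::GetPath(root, path.c_str(), path.size());
    (void)hit;
} else {
    // Wildcard paths ($.*, $.**, $[*]) can match many values.
    vector<yyjson_val *> hits;
    JSONCommon::GetWildcardPath(root, path.c_str(), path.size(), hits);
}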
16
external/duckdb/extension/json/json_config.py
vendored
Normal file
@@ -0,0 +1,16 @@
import os

# list all include directories
include_directories = [os.path.sep.join(x.split('/')) for x in ['extension/json/include']]


# source files
def list_files_recursive(rootdir, suffix):
    file_list = []
    for root, _, files in os.walk(rootdir):
        file_list += [os.path.join(root, f) for f in files if f.endswith(suffix)]
    return file_list


prefix = os.path.join('extension', 'json')
source_files = list_files_recursive(prefix, '.cpp')
283
external/duckdb/extension/json/json_deserializer.cpp
vendored
Normal file
@@ -0,0 +1,283 @@
#include "json_deserializer.hpp"
#include "duckdb/common/types/blob.hpp"

namespace duckdb {

void JsonDeserializer::OnPropertyBegin(const field_id_t, const char *tag) {
    current_tag = tag;
}

void JsonDeserializer::OnPropertyEnd() {
}

bool JsonDeserializer::OnOptionalPropertyBegin(const field_id_t, const char *tag) {
    auto parent = Current();
    auto present = yyjson_obj_get(parent.val, tag) != nullptr;
    if (present) {
        current_tag = tag;
    }
    return present;
}

void JsonDeserializer::OnOptionalPropertyEnd(bool) {
}

// If inside an object, return the value associated by the current tag (property name)
// If inside an array, return the next element in the sequence
yyjson_val *JsonDeserializer::GetNextValue() {
    auto &parent_val = Current();
    yyjson_val *val;
    if (yyjson_is_obj(parent_val.val)) {
        val = yyjson_obj_get(parent_val.val, current_tag);
        if (!val) {
            const char *json = yyjson_val_write(Current().val, 0, nullptr);
            auto msg =
                StringUtil::Format("Expected but did not find property '%s' in json object: '%s'", current_tag, json);
            free((void *)json);
            throw ParserException(msg);
        }
    } else if (yyjson_is_arr(parent_val.val)) {
        val = yyjson_arr_iter_next(&parent_val.arr_iter);
        if (!val) {
            const char *json = yyjson_val_write(Current().val, 0, nullptr);
            auto msg =
                StringUtil::Format("Expected but did not find another value after exhausting json array: '%s'", json);
            free((void *)json);
            throw ParserException(msg);
        }
    } else {
        // unreachable?
        throw InternalException("Cannot get value from non-array/object");
    }
    return val;
}

void JsonDeserializer::ThrowTypeError(yyjson_val *val, const char *expected) {
    auto actual = yyjson_get_type_desc(val);
    auto &parent = Current();
    if (yyjson_is_obj(parent.val)) {
        auto msg =
            StringUtil::Format("property '%s' expected type '%s', but got type: '%s'", current_tag, expected, actual);
        throw ParserException(msg);
    } else if (yyjson_is_arr(parent.val)) {
        auto msg = StringUtil::Format("Sequence expected child of type '%s', but got type: %s", expected, actual);
        throw ParserException(msg);
    } else {
        // unreachable?
        throw InternalException("cannot get nested value from non object or array-type");
    }
}

void JsonDeserializer::DumpDoc() {
    const char *json = yyjson_write(doc, 0, nullptr);
    printf("json: %s\n", json);
    free((void *)json);
}

void JsonDeserializer::DumpCurrent() {
    const char *json = yyjson_val_write(Current().val, 0, nullptr);
    printf("json: %s\n", json);
    free((void *)json);
}

void JsonDeserializer::Dump(yyjson_mut_val *val) {
    const char *json = yyjson_mut_val_write(val, 0, nullptr);
    printf("json: %s\n", json);
    free((void *)json);
}

void JsonDeserializer::Dump(yyjson_val *val) {
    const char *json = yyjson_val_write(val, 0, nullptr);
    printf("json: %s\n", json);
    free((void *)json);
}

//===--------------------------------------------------------------------===//
// Nested Types Hooks
//===--------------------------------------------------------------------===//
void JsonDeserializer::OnObjectBegin() {
    auto val = GetNextValue();
    if (!yyjson_is_obj(val)) {
        ThrowTypeError(val, "object");
    }
    Push(val);
}

void JsonDeserializer::OnObjectEnd() {
    stack.pop_back();
}

idx_t JsonDeserializer::OnListBegin() {
    auto val = GetNextValue();
    if (!yyjson_is_arr(val)) {
        ThrowTypeError(val, "array");
    }
    Push(val);
    return yyjson_arr_size(val);
}

void JsonDeserializer::OnListEnd() {
    Pop();
}

bool JsonDeserializer::OnNullableBegin() {
    auto &parent_val = Current();
    yyjson_arr_iter iter;
    if (yyjson_is_arr(parent_val.val)) {
        iter = parent_val.arr_iter;
    }
    auto val = GetNextValue();

    // Recover the iterator if we are inside an array
    if (yyjson_is_arr(parent_val.val)) {
        parent_val.arr_iter = iter;
    }

    if (yyjson_is_null(val)) {
        return false;
    }

    return true;
}

void JsonDeserializer::OnNullableEnd() {
}

//===--------------------------------------------------------------------===//
// Primitive Types
//===--------------------------------------------------------------------===//
bool JsonDeserializer::ReadBool() {
    auto val = GetNextValue();
    if (!yyjson_is_bool(val)) {
        ThrowTypeError(val, "bool");
    }
    return yyjson_get_bool(val);
}

int8_t JsonDeserializer::ReadSignedInt8() {
    auto val = GetNextValue();
    if (!yyjson_is_int(val)) {
        ThrowTypeError(val, "int8_t");
    }
    return yyjson_get_sint(val);
}

uint8_t JsonDeserializer::ReadUnsignedInt8() {
    auto val = GetNextValue();
    if (!yyjson_is_uint(val)) {
        ThrowTypeError(val, "uint8_t");
    }
    return yyjson_get_uint(val);
}

int16_t JsonDeserializer::ReadSignedInt16() {
    auto val = GetNextValue();
    if (!yyjson_is_int(val)) {
        ThrowTypeError(val, "int16_t");
    }
    return yyjson_get_sint(val);
}

uint16_t JsonDeserializer::ReadUnsignedInt16() {
    auto val = GetNextValue();
    if (!yyjson_is_uint(val)) {
        ThrowTypeError(val, "uint16_t");
    }
    return yyjson_get_uint(val);
}

int32_t JsonDeserializer::ReadSignedInt32() {
    auto val = GetNextValue();
    if (!yyjson_is_int(val)) {
        ThrowTypeError(val, "int32_t");
    }
    return yyjson_get_sint(val);
}

uint32_t JsonDeserializer::ReadUnsignedInt32() {
    auto val = GetNextValue();
    if (!yyjson_is_uint(val)) {
        ThrowTypeError(val, "uint32_t");
    }
    return yyjson_get_uint(val);
}

int64_t JsonDeserializer::ReadSignedInt64() {
    auto val = GetNextValue();
    if (!yyjson_is_int(val)) {
        ThrowTypeError(val, "int64_t");
    }
    return yyjson_get_sint(val);
}

uint64_t JsonDeserializer::ReadUnsignedInt64() {
    auto val = GetNextValue();
    if (!yyjson_is_uint(val)) {
        ThrowTypeError(val, "uint64_t");
    }
    return yyjson_get_uint(val);
}

float JsonDeserializer::ReadFloat() {
    auto val = GetNextValue();
    if (!yyjson_is_real(val)) {
        ThrowTypeError(val, "float");
    }
    return yyjson_get_real(val);
}

double JsonDeserializer::ReadDouble() {
    auto val = GetNextValue();
    if (!yyjson_is_real(val)) {
        ThrowTypeError(val, "double");
    }
    return yyjson_get_real(val);
}

string JsonDeserializer::ReadString() {
    auto val = GetNextValue();
    if (!yyjson_is_str(val)) {
        ThrowTypeError(val, "string");
    }
    return yyjson_get_str(val);
}

hugeint_t JsonDeserializer::ReadHugeInt() {
    auto val = GetNextValue();
    if (!yyjson_is_obj(val)) {
        ThrowTypeError(val, "object");
    }
    Push(val);
    hugeint_t result;
    ReadProperty(100, "upper", result.upper);
    ReadProperty(101, "lower", result.lower);
    Pop();
    return result;
}

uhugeint_t JsonDeserializer::ReadUhugeInt() {
    auto val = GetNextValue();
    if (!yyjson_is_obj(val)) {
        ThrowTypeError(val, "object");
    }
    Push(val);
    uhugeint_t result;
    ReadProperty(100, "upper", result.upper);
    ReadProperty(101, "lower", result.lower);
    Pop();
    return result;
}

void JsonDeserializer::ReadDataPtr(data_ptr_t &ptr, idx_t count) {
    auto val = GetNextValue();
    if (!yyjson_is_str(val)) {
        ThrowTypeError(val, "string");
    }
    auto str = yyjson_get_str(val);
    auto len = yyjson_get_len(val);
    D_ASSERT(len == count);
    auto blob = string_t(str, len);
    Blob::ToString(blob, char_ptr_cast(ptr));
}

} // namespace duckdb
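Aside (not part of the commit): 128-bit integers do not fit in a single JSON number, so ReadHugeInt/ReadUhugeInt expect the two-field object that the serializer writes. Illustrative shape:

// Sketch only: the JSON shape expected by ReadHugeInt is
//   {"upper": <int64>, "lower": <uint64>}   (field ids 100 and 101)
// e.g. {"upper": 1, "lower": 5} reconstructs the 128-bit value 1 * 2^64 + 5.
hugeint_t h;
h.upper = 1; // signed high 64 bits
h.lower = 5; // unsigned low 64 bits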
105
external/duckdb/extension/json/json_enums.cpp
vendored
Normal file
@@ -0,0 +1,105 @@
//===----------------------------------------------------------------------===//
// This file is automatically generated by scripts/generate_enums.py
// Do not edit this file manually, your changes will be overwritten
//===----------------------------------------------------------------------===//

#include "json_enums.hpp"
#include "duckdb/common/string_util.hpp"

namespace duckdb {

template<>
const char* EnumUtil::ToChars<JSONScanType>(JSONScanType value) {
    switch(value) {
    case JSONScanType::INVALID:
        return "INVALID";
    case JSONScanType::READ_JSON:
        return "READ_JSON";
    case JSONScanType::READ_JSON_OBJECTS:
        return "READ_JSON_OBJECTS";
    case JSONScanType::SAMPLE:
        return "SAMPLE";
    default:
        throw NotImplementedException(StringUtil::Format("Enum value of type JSONScanType: '%d' not implemented", value));
    }
}

template<>
JSONScanType EnumUtil::FromString<JSONScanType>(const char *value) {
    if (StringUtil::Equals(value, "INVALID")) {
        return JSONScanType::INVALID;
    }
    if (StringUtil::Equals(value, "READ_JSON")) {
        return JSONScanType::READ_JSON;
    }
    if (StringUtil::Equals(value, "READ_JSON_OBJECTS")) {
        return JSONScanType::READ_JSON_OBJECTS;
    }
    if (StringUtil::Equals(value, "SAMPLE")) {
        return JSONScanType::SAMPLE;
    }
    throw NotImplementedException(StringUtil::Format("Enum value of type JSONScanType: '%s' not implemented", value));
}

template<>
const char* EnumUtil::ToChars<JSONRecordType>(JSONRecordType value) {
    switch(value) {
    case JSONRecordType::AUTO_DETECT:
        return "AUTO_DETECT";
    case JSONRecordType::RECORDS:
        return "RECORDS";
    case JSONRecordType::VALUES:
        return "VALUES";
    default:
        throw NotImplementedException(StringUtil::Format("Enum value of type JSONRecordType: '%d' not implemented", value));
    }
}

template<>
JSONRecordType EnumUtil::FromString<JSONRecordType>(const char *value) {
    if (StringUtil::Equals(value, "AUTO_DETECT")) {
        return JSONRecordType::AUTO_DETECT;
    }
    if (StringUtil::Equals(value, "RECORDS")) {
        return JSONRecordType::RECORDS;
    }
    if (StringUtil::Equals(value, "VALUES")) {
        return JSONRecordType::VALUES;
    }
    throw NotImplementedException(StringUtil::Format("Enum value of type JSONRecordType: '%s' not implemented", value));
}

template<>
const char* EnumUtil::ToChars<JSONFormat>(JSONFormat value) {
    switch(value) {
    case JSONFormat::AUTO_DETECT:
        return "AUTO_DETECT";
    case JSONFormat::UNSTRUCTURED:
        return "UNSTRUCTURED";
    case JSONFormat::NEWLINE_DELIMITED:
        return "NEWLINE_DELIMITED";
    case JSONFormat::ARRAY:
        return "ARRAY";
    default:
        throw NotImplementedException(StringUtil::Format("Enum value of type JSONFormat: '%d' not implemented", value));
    }
}

template<>
JSONFormat EnumUtil::FromString<JSONFormat>(const char *value) {
    if (StringUtil::Equals(value, "AUTO_DETECT")) {
        return JSONFormat::AUTO_DETECT;
    }
    if (StringUtil::Equals(value, "UNSTRUCTURED")) {
        return JSONFormat::UNSTRUCTURED;
    }
    if (StringUtil::Equals(value, "NEWLINE_DELIMITED")) {
        return JSONFormat::NEWLINE_DELIMITED;
    }
    if (StringUtil::Equals(value, "ARRAY")) {
        return JSONFormat::ARRAY;
    }
    throw NotImplementedException(StringUtil::Format("Enum value of type JSONFormat: '%s' not implemented", value));
}

} // namespace duckdb
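Aside (not part of the commit): these generated specializations give every JSON enum a string round-trip, which is what the scan's Serialize/Deserialize paths rely on. A minimal sketch:

// Sketch only: round-trip through the generated EnumUtil specializations.
const char *name = EnumUtil::ToChars<JSONFormat>(JSONFormat::NEWLINE_DELIMITED); // "NEWLINE_DELIMITED"
JSONFormat format = EnumUtil::FromString<JSONFormat>(name);
// Unknown strings throw NotImplementedException rather than returning a default.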
103
external/duckdb/extension/json/json_extension.cpp
vendored
Normal file
@@ -0,0 +1,103 @@
#include "json_extension.hpp"

#include "json_common.hpp"
#include "json_functions.hpp"

#include "duckdb/catalog/catalog_entry/macro_catalog_entry.hpp"
#include "duckdb/catalog/default/default_functions.hpp"
#include "duckdb/function/copy_function.hpp"
#include "duckdb/main/extension/extension_loader.hpp"
#include "duckdb/parser/expression/function_expression.hpp"

namespace duckdb {

static const DefaultMacro JSON_MACROS[] = {
    {DEFAULT_SCHEMA,
     "json_group_array",
     {"x", nullptr},
     {{nullptr, nullptr}},
     "CAST('[' || string_agg(CASE WHEN x IS NULL THEN 'null'::JSON ELSE to_json(x) END, ',') || ']' AS JSON)"},
    {DEFAULT_SCHEMA,
     "json_group_object",
     {"n", "v", nullptr},
     {{nullptr, nullptr}},
     "CAST('{' || string_agg(to_json(n::VARCHAR) || ':' || CASE WHEN v IS NULL THEN 'null'::JSON ELSE to_json(v) END, "
     "',') || '}' AS JSON)"},
    {DEFAULT_SCHEMA,
     "json_group_structure",
     {"x", nullptr},
     {{nullptr, nullptr}},
     "json_structure(json_group_array(x))->0"},
    {DEFAULT_SCHEMA, "json", {"x", nullptr}, {{nullptr, nullptr}}, "json_extract(x, '$')"},
    {nullptr, nullptr, {nullptr}, {{nullptr, nullptr}}, nullptr}};

static void LoadInternal(ExtensionLoader &loader) {
    // JSON type
    auto json_type = LogicalType::JSON();
    loader.RegisterType(LogicalType::JSON_TYPE_NAME, std::move(json_type));

    // JSON casts
    JSONFunctions::RegisterSimpleCastFunctions(loader);
    JSONFunctions::RegisterJSONCreateCastFunctions(loader);
    JSONFunctions::RegisterJSONTransformCastFunctions(loader);

    // JSON scalar functions
    for (auto &fun : JSONFunctions::GetScalarFunctions()) {
        loader.RegisterFunction(fun);
    }

    // JSON table functions
    for (auto &fun : JSONFunctions::GetTableFunctions()) {
        loader.RegisterFunction(fun);
    }

    // JSON pragma functions
    for (auto &fun : JSONFunctions::GetPragmaFunctions()) {
        loader.RegisterFunction(fun);
    }

    // JSON replacement scan
    DBConfig::GetConfig(loader.GetDatabaseInstance())
        .replacement_scans.emplace_back(JSONFunctions::ReadJSONReplacement);

    // JSON copy function
    auto copy_fun = JSONFunctions::GetJSONCopyFunction();
    loader.RegisterFunction(copy_fun);
    copy_fun.extension = "ndjson";
    copy_fun.name = "ndjson";
    loader.RegisterFunction(copy_fun);
    copy_fun.extension = "jsonl";
    copy_fun.name = "jsonl";
    loader.RegisterFunction(copy_fun);

    // JSON macros
    for (idx_t index = 0; JSON_MACROS[index].name != nullptr; index++) {
        auto info = DefaultFunctionGenerator::CreateInternalMacroInfo(JSON_MACROS[index]);
        loader.RegisterFunction(*info);
    }
}

void JsonExtension::Load(ExtensionLoader &loader) {
    LoadInternal(loader);
}

std::string JsonExtension::Name() {
    return "json";
}

std::string JsonExtension::Version() const {
#ifdef EXT_VERSION_JSON
    return EXT_VERSION_JSON;
#else
    return "";
#endif
}

} // namespace duckdb

extern "C" {

DUCKDB_CPP_EXTENSION_ENTRY(json, loader) {
    duckdb::LoadInternal(loader);
}
}
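Aside (not part of the commit): once the extension is loaded, the macros registered above behave like built-ins. A hedged sketch using DuckDB's C++ client API; the query is illustrative:

#include "duckdb.hpp"

int main() {
    duckdb::DuckDB db(nullptr); // in-memory database; bundled builds ship the json extension
    duckdb::Connection con(db);
    // json_group_array is one of the JSON_MACROS registered in LoadInternal:
    // it aggregates a column into a JSON array, mapping SQL NULL to JSON null.
    auto result = con.Query("SELECT json_group_array(i) FROM range(3) t(i)");
    result->Print(); // [0,1,2]
    return 0;
}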
409
external/duckdb/extension/json/json_functions.cpp
vendored
Normal file
@@ -0,0 +1,409 @@
#include "json_functions.hpp"

#include "duckdb/common/file_system.hpp"
#include "duckdb/execution/expression_executor.hpp"
#include "duckdb/function/cast/cast_function_set.hpp"
#include "duckdb/function/cast/default_casts.hpp"
#include "duckdb/function/replacement_scan.hpp"
#include "duckdb/parser/expression/constant_expression.hpp"
#include "duckdb/parser/expression/function_expression.hpp"
#include "duckdb/parser/parsed_data/create_pragma_function_info.hpp"
#include "duckdb/parser/tableref/table_function_ref.hpp"

namespace duckdb {

using JSONPathType = JSONCommon::JSONPathType;

JSONPathType JSONReadFunctionData::CheckPath(const Value &path_val, string &path, idx_t &len) {
    if (path_val.IsNull()) {
        throw BinderException("JSON path cannot be NULL");
    }
    const auto path_str_val = path_val.DefaultCastAs(LogicalType::VARCHAR);
    auto path_str = path_str_val.GetValueUnsafe<string_t>();
    len = path_str.GetSize();
    const auto ptr = path_str.GetData();
    JSONPathType path_type = JSONPathType::REGULAR;
    // Copy over string to the bind data
    if (len != 0) {
        if (*ptr == '/' || *ptr == '$') {
            path = string(ptr, len);
        } else if (path_val.type().IsIntegral()) {
            path = "$[" + string(ptr, len) + "]";
        } else if (memchr(ptr, '"', len)) {
            path = "/" + string(ptr, len);
        } else {
            path = "$.\"" + string(ptr, len) + "\"";
        }
        len = path.length();
        if (*path.c_str() == '$') {
            path_type = JSONCommon::ValidatePath(path.c_str(), len, true);
        }
    }
    return path_type;
}
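Aside (not part of the commit): CheckPath normalizes every user-supplied path into either JSON Pointer form ('/...') or JSONPath form ('$...'), by example:

// '$.a[0]'   -> kept as-is (already JSONPath; also validated)
// '/a/0'     -> kept as-is (JSON Pointer style; not validated here)
// 3 (BIGINT) -> '$[3]'       (integral arguments index into arrays)
// 'my key'   -> '$."my key"' (plain keys are quoted)
// 'a"b'      -> '/a"b'       (keys containing '"' fall back to pointer syntax)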
|
||||
JSONReadFunctionData::JSONReadFunctionData(bool constant, string path_p, idx_t len, JSONPathType path_type_p)
|
||||
: constant(constant), path(std::move(path_p)), path_type(path_type_p), ptr(path.c_str()), len(len) {
|
||||
}
|
||||
|
||||
unique_ptr<FunctionData> JSONReadFunctionData::Copy() const {
|
||||
return make_uniq<JSONReadFunctionData>(constant, path, len, path_type);
|
||||
}
|
||||
|
||||
bool JSONReadFunctionData::Equals(const FunctionData &other_p) const {
|
||||
auto &other = other_p.Cast<JSONReadFunctionData>();
|
||||
return constant == other.constant && path == other.path && len == other.len && path_type == other.path_type;
|
||||
}
|
||||
|
||||
unique_ptr<FunctionData> JSONReadFunctionData::Bind(ClientContext &context, ScalarFunction &bound_function,
|
||||
vector<unique_ptr<Expression>> &arguments) {
|
||||
D_ASSERT(bound_function.arguments.size() == 2);
|
||||
bool constant = false;
|
||||
string path;
|
||||
idx_t len = 0;
|
||||
JSONPathType path_type = JSONPathType::REGULAR;
|
||||
if (arguments[1]->IsFoldable()) {
|
||||
const auto path_val = ExpressionExecutor::EvaluateScalar(context, *arguments[1]);
|
||||
if (!path_val.IsNull()) {
|
||||
constant = true;
|
||||
path_type = CheckPath(path_val, path, len);
|
||||
}
|
||||
}
|
||||
if (arguments[1]->return_type.IsIntegral()) {
|
||||
bound_function.arguments[1] = LogicalType::BIGINT;
|
||||
} else {
|
||||
bound_function.arguments[1] = LogicalType::VARCHAR;
|
||||
}
|
||||
if (path_type == JSONCommon::JSONPathType::WILDCARD) {
|
||||
bound_function.return_type = LogicalType::LIST(bound_function.return_type);
|
||||
}
|
||||
return make_uniq<JSONReadFunctionData>(constant, std::move(path), len, path_type);
|
||||
}
|
||||
|
||||
JSONReadManyFunctionData::JSONReadManyFunctionData(vector<string> paths_p, vector<idx_t> lens_p)
|
||||
: paths(std::move(paths_p)), lens(std::move(lens_p)) {
|
||||
for (const auto &path : paths) {
|
||||
ptrs.push_back(path.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
unique_ptr<FunctionData> JSONReadManyFunctionData::Copy() const {
|
||||
return make_uniq<JSONReadManyFunctionData>(paths, lens);
|
||||
}
|
||||
|
||||
bool JSONReadManyFunctionData::Equals(const FunctionData &other_p) const {
|
||||
auto &other = other_p.Cast<JSONReadManyFunctionData>();
|
||||
return paths == other.paths && lens == other.lens;
|
||||
}
|
||||
|
||||
unique_ptr<FunctionData> JSONReadManyFunctionData::Bind(ClientContext &context, ScalarFunction &bound_function,
|
||||
vector<unique_ptr<Expression>> &arguments) {
|
||||
D_ASSERT(bound_function.arguments.size() == 2);
|
||||
if (arguments[1]->HasParameter()) {
|
||||
throw ParameterNotResolvedException();
|
||||
}
|
||||
if (!arguments[1]->IsFoldable()) {
|
||||
throw BinderException("List of paths must be constant");
|
||||
}
|
||||
|
||||
vector<string> paths;
|
||||
vector<idx_t> lens;
|
||||
auto paths_val = ExpressionExecutor::EvaluateScalar(context, *arguments[1]);
|
||||
|
||||
	for (auto &path_val : ListValue::GetChildren(paths_val)) {
		paths.emplace_back("");
		lens.push_back(0);
		if (JSONReadFunctionData::CheckPath(path_val, paths.back(), lens.back()) == JSONPathType::WILDCARD) {
			throw BinderException("Cannot have wildcards in JSON path when supplying multiple paths");
		}
	}

	return make_uniq<JSONReadManyFunctionData>(std::move(paths), std::move(lens));
}

JSONFunctionLocalState::JSONFunctionLocalState(Allocator &allocator)
    : json_allocator(make_shared_ptr<JSONAllocator>(allocator)) {
}

JSONFunctionLocalState::JSONFunctionLocalState(ClientContext &context)
    : JSONFunctionLocalState(BufferAllocator::Get(context)) {
}

unique_ptr<FunctionLocalState> JSONFunctionLocalState::Init(ExpressionState &state, const BoundFunctionExpression &expr,
                                                            FunctionData *bind_data) {
	return make_uniq<JSONFunctionLocalState>(state.GetContext());
}

unique_ptr<FunctionLocalState> JSONFunctionLocalState::InitCastLocalState(CastLocalStateParameters &parameters) {
	return parameters.context ? make_uniq<JSONFunctionLocalState>(*parameters.context)
	                          : make_uniq<JSONFunctionLocalState>(Allocator::DefaultAllocator());
}

JSONFunctionLocalState &JSONFunctionLocalState::ResetAndGet(ExpressionState &state) {
	auto &lstate = ExecuteFunctionState::GetFunctionState(state)->Cast<JSONFunctionLocalState>();
	lstate.json_allocator->Reset();
	return lstate;
}

vector<ScalarFunctionSet> JSONFunctions::GetScalarFunctions() {
	vector<ScalarFunctionSet> functions;

	// Extract functions
	AddAliases({"json_extract", "json_extract_path"}, GetExtractFunction(), functions);
	AddAliases({"json_extract_string", "json_extract_path_text", "->>"}, GetExtractStringFunction(), functions);

	// Create functions
	functions.push_back(GetArrayFunction());
	functions.push_back(GetObjectFunction());
	AddAliases({"to_json", "json_quote"}, GetToJSONFunction(), functions);
	functions.push_back(GetArrayToJSONFunction());
	functions.push_back(GetRowToJSONFunction());
	functions.push_back(GetMergePatchFunction());

	// Structure/Transform
	functions.push_back(GetStructureFunction());
	AddAliases({"json_transform", "from_json"}, GetTransformFunction(), functions);
	AddAliases({"json_transform_strict", "from_json_strict"}, GetTransformStrictFunction(), functions);

	// Other
	functions.push_back(GetArrayLengthFunction());
	functions.push_back(GetContainsFunction());
	functions.push_back(GetExistsFunction());
	functions.push_back(GetKeysFunction());
	functions.push_back(GetTypeFunction());
	functions.push_back(GetValidFunction());
	functions.push_back(GetValueFunction());
	functions.push_back(GetSerializePlanFunction());
	functions.push_back(GetSerializeSqlFunction());
	functions.push_back(GetDeserializeSqlFunction());

	functions.push_back(GetPrettyPrintFunction());

	return functions;
}

vector<PragmaFunctionSet> JSONFunctions::GetPragmaFunctions() {
	vector<PragmaFunctionSet> functions;
	functions.push_back(GetExecuteJsonSerializedSqlPragmaFunction());
	return functions;
}

vector<TableFunctionSet> JSONFunctions::GetTableFunctions() {
	vector<TableFunctionSet> functions;

	// Reads JSON as string
	functions.push_back(GetReadJSONObjectsFunction());
	functions.push_back(GetReadNDJSONObjectsFunction());
	functions.push_back(GetReadJSONObjectsAutoFunction());

	// Read JSON as columnar data
	functions.push_back(GetReadJSONFunction());
	functions.push_back(GetReadNDJSONFunction());
	functions.push_back(GetReadJSONAutoFunction());
	functions.push_back(GetReadNDJSONAutoFunction());

	// Table in-out
	functions.push_back(GetJSONEachFunction());
	functions.push_back(GetJSONTreeFunction());

	// Serialized plan
	functions.push_back(GetExecuteJsonSerializedSqlFunction());

	return functions;
}

unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context, ReplacementScanInput &input,
                                                        optional_ptr<ReplacementScanData> data) {
	auto table_name = ReplacementScan::GetFullPath(input);
	if (!ReplacementScan::CanReplace(table_name, {"json", "jsonl", "ndjson"})) {
		return nullptr;
	}
	auto table_function = make_uniq<TableFunctionRef>();
	vector<unique_ptr<ParsedExpression>> children;
	children.push_back(make_uniq<ConstantExpression>(Value(table_name)));
	table_function->function = make_uniq<FunctionExpression>("read_json_auto", std::move(children));

	if (!FileSystem::HasGlob(table_name)) {
		auto &fs = FileSystem::GetFileSystem(context);
		table_function->alias = fs.ExtractBaseName(table_name);
	}

	return std::move(table_function);
}

static bool CastVarcharToJSON(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
	auto &lstate = parameters.local_state->Cast<JSONFunctionLocalState>();
	lstate.json_allocator->Reset();
	auto alc = lstate.json_allocator->GetYYAlc();

	bool success = true;
	UnaryExecutor::ExecuteWithNulls<string_t, string_t>(
	    source, result, count, [&](string_t input, ValidityMask &mask, idx_t idx) {
		    auto data = input.GetDataWriteable();
		    const auto length = input.GetSize();

		    yyjson_read_err error;
		    auto doc = JSONCommon::ReadDocumentUnsafe(data, length, JSONCommon::READ_FLAG, alc, &error);

		    if (!doc) {
			    mask.SetInvalid(idx);
			    if (success) {
				    HandleCastError::AssignError(JSONCommon::FormatParseError(data, length, error), parameters);
				    success = false;
			    }
		    }

		    return input;
	    });
	StringVector::AddHeapReference(result, source);
	return success;
}
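
// Note (illustrative, assumed behavior): the cast above only validates; on
// success it returns the input string unchanged, which is why
// StringVector::AddHeapReference ties the result's lifetime to the source vector.
// Assuming DuckDB's usual cast semantics, this surfaces to users as:
//   SELECT '{"a": 1}'::JSON;            -- succeeds, string data reused as-is
//   SELECT TRY_CAST('{oops' AS JSON);   -- NULL instead of a raised parse error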

static bool CastJSONListToVarchar(Vector &source, Vector &result, idx_t count, CastParameters &) {
	UnifiedVectorFormat child_format;
	ListVector::GetEntry(source).ToUnifiedFormat(ListVector::GetListSize(source), child_format);
	const auto input_jsons = UnifiedVectorFormat::GetData<string_t>(child_format);

	static constexpr char const *NULL_STRING = "NULL";
	static constexpr idx_t NULL_STRING_LENGTH = 4;

	UnaryExecutor::Execute<list_entry_t, string_t>(
	    source, result, count,
	    [&](const list_entry_t &input) {
		    // Compute len (start with [] and ,)
		    idx_t len = 2;
		    len += input.length == 0 ? 0 : (input.length - 1) * 2;
		    for (idx_t json_idx = input.offset; json_idx < input.offset + input.length; json_idx++) {
			    const auto sel_json_idx = child_format.sel->get_index(json_idx);
			    if (child_format.validity.RowIsValid(sel_json_idx)) {
				    len += input_jsons[sel_json_idx].GetSize();
			    } else {
				    len += NULL_STRING_LENGTH;
			    }
		    }

		    // Allocate string
		    auto res = StringVector::EmptyString(result, len);
		    auto ptr = res.GetDataWriteable();

		    // Populate string
		    *ptr++ = '[';
		    for (idx_t json_idx = input.offset; json_idx < input.offset + input.length; json_idx++) {
			    const auto sel_json_idx = child_format.sel->get_index(json_idx);
			    if (child_format.validity.RowIsValid(sel_json_idx)) {
				    auto &input_json = input_jsons[sel_json_idx];
				    memcpy(ptr, input_json.GetData(), input_json.GetSize());
				    ptr += input_json.GetSize();
			    } else {
				    memcpy(ptr, NULL_STRING, NULL_STRING_LENGTH);
				    ptr += NULL_STRING_LENGTH;
			    }
			    if (json_idx != input.offset + input.length - 1) {
				    *ptr++ = ',';
				    *ptr++ = ' ';
			    }
		    }
		    *ptr = ']';

		    res.Finalize();
		    return res;
	    },
	    FunctionErrors::CANNOT_ERROR);
	return true;
}
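
// Note (illustrative): the writer above renders a JSON[] value as
// "[elem, elem, NULL]"; the length precomputation counts 2 bytes for the
// brackets plus 2 bytes (", ") per separator, so the EmptyString allocation
// is exact and no reallocation is needed while populating.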

static bool CastVarcharToJSONList(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
	auto &lstate = parameters.local_state->Cast<JSONFunctionLocalState>();
	lstate.json_allocator->Reset();
	auto alc = lstate.json_allocator->GetYYAlc();

	bool success = true;
	UnaryExecutor::ExecuteWithNulls<string_t, list_entry_t>(
	    source, result, count, [&](const string_t &input, ValidityMask &mask, idx_t idx) -> list_entry_t {
		    // Figure out if the cast can succeed
		    yyjson_read_err error;
		    const auto doc = JSONCommon::ReadDocumentUnsafe(input.GetDataWriteable(), input.GetSize(),
		                                                    JSONCommon::READ_FLAG, alc, &error);
		    if (!doc || !unsafe_yyjson_is_arr(doc->root)) {
			    mask.SetInvalid(idx);
			    if (success) {
				    if (!doc) {
					    HandleCastError::AssignError(
					        JSONCommon::FormatParseError(input.GetDataWriteable(), input.GetSize(), error), parameters);
				    } else if (!unsafe_yyjson_is_arr(doc->root)) {
					    auto truncated_input =
					        input.GetSize() > 50 ? string(input.GetData(), 47) + "..." : input.GetString();
					    HandleCastError::AssignError(
					        StringUtil::Format("Cannot cast to list of JSON. Input \"%s\"", truncated_input),
					        parameters);
				    }
				    success = false;
			    }
			    return {};
		    }

		    auto current_size = ListVector::GetListSize(result);
		    const auto arr_len = unsafe_yyjson_get_len(doc->root);
		    const auto new_size = current_size + arr_len;

		    // Grow list if needed
		    if (ListVector::GetListCapacity(result) < new_size) {
			    ListVector::Reserve(result, new_size);
		    }

		    // Populate list
		    const auto result_jsons = FlatVector::GetData<string_t>(ListVector::GetEntry(result));
		    size_t arr_idx, max;
		    yyjson_val *val;
		    yyjson_arr_foreach(doc->root, arr_idx, max, val) {
			    result_jsons[current_size + arr_idx] = JSONCommon::WriteVal(val, alc);
		    }

		    // Update size
		    ListVector::SetListSize(result, current_size + arr_len);

		    return {current_size, arr_len};
	    });

	JSONAllocator::AddBuffer(ListVector::GetEntry(result), alc);
	return success;
}
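
// Sketch of the assumed user-facing behavior of the cast above (illustrative):
//   SELECT '[{"a": 1}, null, 42]'::JSON[];  -- list of three JSON values
//   SELECT '{"a": 1}'::JSON[];              -- fails: the root must be a JSON array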

void JSONFunctions::RegisterSimpleCastFunctions(ExtensionLoader &loader) {
	auto &db = loader.GetDatabaseInstance();

	// JSON to VARCHAR is basically free
	loader.RegisterCastFunction(LogicalType::JSON(), LogicalType::VARCHAR, DefaultCasts::ReinterpretCast, 1);

	// VARCHAR to JSON requires a parse so it's not free. Let's make it 1 more than a cast to STRUCT
	const auto varchar_to_json_cost =
	    CastFunctionSet::ImplicitCastCost(db, LogicalType::SQLNULL, LogicalTypeId::STRUCT) + 1;
	BoundCastInfo varchar_to_json_info(CastVarcharToJSON, nullptr, JSONFunctionLocalState::InitCastLocalState);
	loader.RegisterCastFunction(LogicalType::VARCHAR, LogicalType::JSON(), std::move(varchar_to_json_info),
	                            varchar_to_json_cost);

	// Register NULL to JSON with a different cost than NULL to VARCHAR so the binder can disambiguate functions
	const auto null_to_json_cost =
	    CastFunctionSet::ImplicitCastCost(db, LogicalType::SQLNULL, LogicalTypeId::VARCHAR) + 1;
	loader.RegisterCastFunction(LogicalType::SQLNULL, LogicalType::JSON(), DefaultCasts::TryVectorNullCast,
	                            null_to_json_cost);

	// JSON[] to VARCHAR (this needs a special case otherwise the cast will escape quotes)
	const auto json_list_to_varchar_cost =
	    CastFunctionSet::ImplicitCastCost(db, LogicalType::LIST(LogicalType::JSON()), LogicalTypeId::VARCHAR) - 1;
	loader.RegisterCastFunction(LogicalType::LIST(LogicalType::JSON()), LogicalTypeId::VARCHAR, CastJSONListToVarchar,
	                            json_list_to_varchar_cost);

	// JSON[] to JSON is allowed implicitly
	loader.RegisterCastFunction(LogicalType::LIST(LogicalType::JSON()), LogicalType::JSON(), CastJSONListToVarchar,
	                            100);

	// VARCHAR to JSON[] (also needs a special case otherwise we get a VARCHAR -> VARCHAR[] cast first)
	const auto varchar_to_json_list_cost =
	    CastFunctionSet::ImplicitCastCost(db, LogicalType::VARCHAR, LogicalType::LIST(LogicalType::JSON())) - 1;
	BoundCastInfo varchar_to_json_list_info(CastVarcharToJSONList, nullptr, JSONFunctionLocalState::InitCastLocalState);
	loader.RegisterCastFunction(LogicalType::VARCHAR, LogicalType::LIST(LogicalType::JSON()),
	                            std::move(varchar_to_json_list_info), varchar_to_json_list_cost);
}
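
// Note (illustrative reading, not upstream commentary): the +1/-1 cost tweaks
// above steer implicit cast resolution: VARCHAR -> JSON stays slightly costlier
// than the STRUCT-family casts, while the JSON[]-specific casts undercut the
// generic VARCHAR/LIST paths by 1 so the specialized implementations win.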

} // namespace duckdb
25
external/duckdb/extension/json/json_functions/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,25 @@
add_library_unity(
  duckdb_json_functions
  OBJECT
  copy_json.cpp
  json_array_length.cpp
  json_contains.cpp
  json_create.cpp
  json_exists.cpp
  json_extract.cpp
  json_keys.cpp
  json_merge_patch.cpp
  json_pretty.cpp
  json_structure.cpp
  json_table_in_out.cpp
  json_transform.cpp
  json_type.cpp
  json_valid.cpp
  json_value.cpp
  json_serialize_plan.cpp
  json_serialize_sql.cpp
  read_json.cpp
  read_json_objects.cpp)
set(ALL_OBJECT_FILES
    ${ALL_OBJECT_FILES} $<TARGET_OBJECTS:duckdb_json_functions>
    PARENT_SCOPE)
133
external/duckdb/extension/json/json_functions/copy_json.cpp
vendored
Normal file
@@ -0,0 +1,133 @@
#include "duckdb/function/copy_function.hpp"
#include "duckdb/parser/expression/constant_expression.hpp"
#include "duckdb/parser/expression/function_expression.hpp"
#include "duckdb/parser/expression/positional_reference_expression.hpp"
#include "duckdb/parser/query_node/select_node.hpp"
#include "duckdb/parser/tableref/subqueryref.hpp"
#include "duckdb/planner/binder.hpp"
#include "duckdb/common/helper.hpp"
#include "json_functions.hpp"
#include "json_scan.hpp"
#include "json_transform.hpp"
#include "json_multi_file_info.hpp"

namespace duckdb {

static void ThrowJSONCopyParameterException(const string &loption) {
	throw BinderException("COPY (FORMAT JSON) parameter %s expects a single argument.", loption);
}

static BoundStatement CopyToJSONPlan(Binder &binder, CopyStatement &stmt) {
	static const unordered_set<string> SUPPORTED_BASE_OPTIONS {
	    "compression", "encoding", "use_tmp_file", "overwrite_or_ignore", "overwrite", "append", "filename_pattern",
	    "file_extension", "per_thread_output", "file_size_bytes",
	    // "partition_by", unsupported
	    "return_files", "preserve_order", "return_stats", "write_partition_columns", "write_empty_file",
	    "hive_file_pattern"};

	auto stmt_copy = stmt.Copy();
	auto &copy = stmt_copy->Cast<CopyStatement>();
	auto &copied_info = *copy.info;

	// Parse the options, creating options for the CSV writer while doing so
	string date_format;
	string timestamp_format;
	// We insert the JSON file extension here so it works properly with PER_THREAD_OUTPUT/FILE_SIZE_BYTES etc.
	case_insensitive_map_t<vector<Value>> csv_copy_options {{"file_extension", {"json"}}};
	for (const auto &kv : copied_info.options) {
		const auto &loption = StringUtil::Lower(kv.first);
		if (loption == "dateformat" || loption == "date_format") {
			if (kv.second.size() != 1) {
				ThrowJSONCopyParameterException(loption);
			}
			date_format = StringValue::Get(kv.second.back());
		} else if (loption == "timestampformat" || loption == "timestamp_format") {
			if (kv.second.size() != 1) {
				ThrowJSONCopyParameterException(loption);
			}
			timestamp_format = StringValue::Get(kv.second.back());
		} else if (loption == "array") {
			if (kv.second.size() > 1) {
				ThrowJSONCopyParameterException(loption);
			}
			if (kv.second.empty() || BooleanValue::Get(kv.second.back().DefaultCastAs(LogicalTypeId::BOOLEAN))) {
				csv_copy_options["prefix"] = {"[\n\t"};
				csv_copy_options["suffix"] = {"\n]\n"};
				csv_copy_options["new_line"] = {",\n\t"};
			}
		} else if (SUPPORTED_BASE_OPTIONS.find(loption) != SUPPORTED_BASE_OPTIONS.end()) {
			// We support these base options
			csv_copy_options.insert(kv);
		} else {
			throw BinderException("Unknown option for COPY ... TO ... (FORMAT JSON): \"%s\".", loption);
		}
	}

	// Bind the select statement of the original to resolve the types
	auto dummy_binder = Binder::CreateBinder(binder.context, &binder);
	auto bound_original = dummy_binder->Bind(*stmt.info->select_statement);

	// Create new SelectNode with the original SelectNode as a subquery in the FROM clause
	auto select_stmt = make_uniq<SelectStatement>();
	select_stmt->node = std::move(copied_info.select_statement);
	auto subquery_ref = make_uniq<SubqueryRef>(std::move(select_stmt));

	copied_info.select_statement = make_uniq_base<QueryNode, SelectNode>();
	auto &select_node = copied_info.select_statement->Cast<SelectNode>();
	select_node.from_table = std::move(subquery_ref);

	// Create new select list
	vector<unique_ptr<ParsedExpression>> select_list;
	select_list.reserve(bound_original.types.size());

	// strftime if the user specified a format (loop also gives columns a name, needed for struct_pack)
	// TODO: deal with date/timestamp within nested types
	vector<unique_ptr<ParsedExpression>> strftime_children;
	for (idx_t col_idx = 0; col_idx < bound_original.types.size(); col_idx++) {
		auto column = make_uniq_base<ParsedExpression, PositionalReferenceExpression>(col_idx + 1);
		strftime_children = vector<unique_ptr<ParsedExpression>>();
		const auto &type = bound_original.types[col_idx];
		const auto &name = bound_original.names[col_idx];
		if (!date_format.empty() && type == LogicalTypeId::DATE) {
			strftime_children.emplace_back(std::move(column));
			strftime_children.emplace_back(make_uniq<ConstantExpression>(date_format));
			column = make_uniq<FunctionExpression>("strftime", std::move(strftime_children));
		} else if (!timestamp_format.empty() && type == LogicalTypeId::TIMESTAMP) {
			strftime_children.emplace_back(std::move(column));
			strftime_children.emplace_back(make_uniq<ConstantExpression>(timestamp_format));
			column = make_uniq<FunctionExpression>("strftime", std::move(strftime_children));
		}
		column->SetAlias(name);
		select_list.emplace_back(std::move(column));
	}

	// Now create the struct_pack/to_json to create a JSON object per row
	vector<unique_ptr<ParsedExpression>> struct_pack_child;
	struct_pack_child.emplace_back(make_uniq<FunctionExpression>("struct_pack", std::move(select_list)));
	select_node.select_list.emplace_back(make_uniq<FunctionExpression>("to_json", std::move(struct_pack_child)));

	// Now we can just use the CSV writer
	copied_info.format = "csv";
	copied_info.options = std::move(csv_copy_options);
	copied_info.options["quote"] = {""};
	copied_info.options["escape"] = {""};
	copied_info.options["delimiter"] = {"\n"};
	copied_info.options["header"] = {{0}};

	return binder.Bind(*stmt_copy);
}
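
// Note (illustrative): COPY ... TO ... (FORMAT JSON) is thus a pure plan
// rewrite: every row is wrapped as to_json(struct_pack(...)) and handed to the
// CSV writer with quoting/escaping disabled, '\n' as the delimiter, and no
// header, so each output line is one JSON object (plus the optional ARRAY
// prefix/suffix configured above).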

CopyFunction JSONFunctions::GetJSONCopyFunction() {
	CopyFunction function("json");
	function.extension = "json";

	function.plan = CopyToJSONPlan;

	function.copy_from_bind = MultiFileFunction<JSONMultiFileInfo>::MultiFileBindCopy;
	function.copy_from_function = JSONFunctions::GetReadJSONTableFunction(make_shared_ptr<JSONScanInfo>(
	    JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT, JSONRecordType::RECORDS, false));

	return function;
}

} // namespace duckdb
38
external/duckdb/extension/json/json_functions/json_array_length.cpp
vendored
Normal file
@@ -0,0 +1,38 @@
#include "json_executors.hpp"

namespace duckdb {

static inline uint64_t GetArrayLength(yyjson_val *val, yyjson_alc *, Vector &, ValidityMask &, idx_t) {
	return yyjson_arr_size(val);
}

static void UnaryArrayLengthFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::UnaryExecute<uint64_t>(args, state, result, GetArrayLength);
}

static void BinaryArrayLengthFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::BinaryExecute<uint64_t>(args, state, result, GetArrayLength);
}

static void ManyArrayLengthFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::ExecuteMany<uint64_t>(args, state, result, GetArrayLength);
}
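
// Assumed overload shapes registered below (illustrative):
//   json_array_length(json)                   -> UBIGINT
//   json_array_length(json, path VARCHAR)     -> UBIGINT
//   json_array_length(json, paths VARCHAR[])  -> UBIGINT[]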

static void GetArrayLengthFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	set.AddFunction(ScalarFunction({input_type}, LogicalType::UBIGINT, UnaryArrayLengthFunction, nullptr, nullptr,
	                               nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::UBIGINT, BinaryArrayLengthFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
	                               LogicalType::LIST(LogicalType::UBIGINT), ManyArrayLengthFunction,
	                               JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetArrayLengthFunction() {
	ScalarFunctionSet set("json_array_length");
	GetArrayLengthFunctionsInternal(set, LogicalType::VARCHAR);
	GetArrayLengthFunctionsInternal(set, LogicalType::JSON());
	return set;
}

} // namespace duckdb
155
external/duckdb/extension/json/json_functions/json_contains.cpp
vendored
Normal file
@@ -0,0 +1,155 @@
#include "json_executors.hpp"

namespace duckdb {

static inline bool JSONContains(yyjson_val *haystack, yyjson_val *needle);
static inline bool JSONFuzzyEquals(yyjson_val *haystack, yyjson_val *needle);

static inline bool JSONArrayFuzzyEquals(yyjson_val *haystack, yyjson_val *needle) {
	D_ASSERT(yyjson_get_tag(haystack) == (YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE) &&
	         yyjson_get_tag(needle) == (YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE));

	size_t needle_idx, needle_max, haystack_idx, haystack_max;
	yyjson_val *needle_child, *haystack_child;
	yyjson_arr_foreach(needle, needle_idx, needle_max, needle_child) {
		bool found = false;
		yyjson_arr_foreach(haystack, haystack_idx, haystack_max, haystack_child) {
			if (JSONFuzzyEquals(haystack_child, needle_child)) {
				found = true;
				break;
			}
		}
		if (!found) {
			return false;
		}
	}
	return true;
}

static inline bool JSONObjectFuzzyEquals(yyjson_val *haystack, yyjson_val *needle) {
	D_ASSERT(yyjson_get_tag(haystack) == (YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE) &&
	         yyjson_get_tag(needle) == (YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE));

	size_t idx, max;
	yyjson_val *key, *needle_child;
	yyjson_obj_foreach(needle, idx, max, key, needle_child) {
		auto haystack_child = yyjson_obj_getn(haystack, unsafe_yyjson_get_str(key), unsafe_yyjson_get_len(key));
		if (!haystack_child || !JSONFuzzyEquals(haystack_child, needle_child)) {
			return false;
		}
	}
	return true;
}

static inline bool JSONFuzzyEquals(yyjson_val *haystack, yyjson_val *needle) {
	D_ASSERT(haystack && needle);

	// Strict equality
	if (unsafe_yyjson_equals(haystack, needle)) {
		return true;
	}

	// The tags must match for fuzzy equality
	const auto tag = yyjson_get_tag(needle);
	if (tag != yyjson_get_tag(haystack)) {
		return false;
	}

	// Fuzzy equality (contained in)
	switch (tag) {
	case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
		return JSONArrayFuzzyEquals(haystack, needle);
	case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
		return JSONObjectFuzzyEquals(haystack, needle);
	default:
		return false;
	}
}
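
// Note (illustrative): "fuzzy" equality implements containment semantics: an
// array matches if every needle element fuzzy-matches some haystack element,
// and an object matches if every needle key exists in the haystack with a
// fuzzy-matching value; scalars must be strictly equal.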

static inline bool JSONArrayContains(yyjson_val *haystack_array, yyjson_val *needle) {
	D_ASSERT(yyjson_get_tag(haystack_array) == (YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE));

	size_t idx, max;
	yyjson_val *child_haystack;
	yyjson_arr_foreach(haystack_array, idx, max, child_haystack) {
		if (JSONContains(child_haystack, needle)) {
			return true;
		}
	}
	return false;
}

static inline bool JSONObjectContains(yyjson_val *haystack_object, yyjson_val *needle) {
	D_ASSERT(yyjson_get_tag(haystack_object) == (YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE));

	size_t idx, max;
	yyjson_val *key, *child_haystack;
	yyjson_obj_foreach(haystack_object, idx, max, key, child_haystack) {
		if (JSONContains(child_haystack, needle)) {
			return true;
		}
	}
	return false;
}

static inline bool JSONContains(yyjson_val *haystack, yyjson_val *needle) {
	if (JSONFuzzyEquals(haystack, needle)) {
		return true;
	}

	switch (yyjson_get_tag(haystack)) {
	case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
		return JSONArrayContains(haystack, needle);
	case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
		return JSONObjectContains(haystack, needle);
	default:
		return false;
	}
}
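
// Assumed end-to-end behavior (illustrative):
//   json_contains('{"a": [1, 2]}', '{"a": [1]}')  -- true (subset semantics)
//   json_contains('[1, 2, 3]', '4')               -- false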

static void JSONContainsFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	D_ASSERT(args.data.size() == 2);
	auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = lstate.json_allocator->GetYYAlc();

	auto &haystacks = args.data[0];
	auto &needles = args.data[1];

	if (needles.GetVectorType() == VectorType::CONSTANT_VECTOR) {
		if (ConstantVector::IsNull(needles)) {
			result.SetVectorType(VectorType::CONSTANT_VECTOR);
			ConstantVector::SetNull(result, true);
			return;
		}
		auto &needle_str = *ConstantVector::GetData<string_t>(needles);
		auto needle_doc = JSONCommon::ReadDocument(needle_str, JSONCommon::READ_FLAG, alc);
		UnaryExecutor::Execute<string_t, bool>(haystacks, result, args.size(), [&](string_t haystack_str) {
			auto haystack_doc = JSONCommon::ReadDocument(haystack_str, JSONCommon::READ_FLAG, alc);
			return JSONContains(haystack_doc->root, needle_doc->root);
		});
	} else {
		BinaryExecutor::Execute<string_t, string_t, bool>(
		    haystacks, needles, result, args.size(), [&](string_t haystack_str, string_t needle_str) {
			    auto needle_doc = JSONCommon::ReadDocument(needle_str, JSONCommon::READ_FLAG, alc);
			    auto haystack_doc = JSONCommon::ReadDocument(haystack_str, JSONCommon::READ_FLAG, alc);
			    return JSONContains(haystack_doc->root, needle_doc->root);
		    });
	}
}

static void GetContainsFunctionInternal(ScalarFunctionSet &set, const LogicalType &lhs, const LogicalType &rhs) {
	set.AddFunction(ScalarFunction({lhs, rhs}, LogicalType::BOOLEAN, JSONContainsFunction, nullptr, nullptr, nullptr,
	                               JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetContainsFunction() {
	ScalarFunctionSet set("json_contains");
	GetContainsFunctionInternal(set, LogicalType::VARCHAR, LogicalType::VARCHAR);
	GetContainsFunctionInternal(set, LogicalType::VARCHAR, LogicalType::JSON());
	GetContainsFunctionInternal(set, LogicalType::JSON(), LogicalType::VARCHAR);
	GetContainsFunctionInternal(set, LogicalType::JSON(), LogicalType::JSON());
	// TODO: implement json_contains that accepts path argument as well

	return set;
}

} // namespace duckdb
831
external/duckdb/extension/json/json_functions/json_create.cpp
vendored
Normal file
@@ -0,0 +1,831 @@
#include "duckdb/function/cast/cast_function_set.hpp"
#include "duckdb/function/cast/default_casts.hpp"
#include "duckdb/planner/expression/bound_parameter_expression.hpp"
#include "json_common.hpp"
#include "json_functions.hpp"

namespace duckdb {

using StructNames = unordered_map<string, unique_ptr<Vector>>;

struct JSONCreateFunctionData : public FunctionData {
public:
	explicit JSONCreateFunctionData(unordered_map<string, unique_ptr<Vector>> const_struct_names)
	    : const_struct_names(std::move(const_struct_names)) {
	}
	unique_ptr<FunctionData> Copy() const override {
		// Have to do this because we can't implicitly copy Vector
		unordered_map<string, unique_ptr<Vector>> map_copy;
		for (const auto &kv : const_struct_names) {
			// The vectors are const vectors of the key value
			map_copy[kv.first] = make_uniq<Vector>(Value(kv.first));
		}
		return make_uniq<JSONCreateFunctionData>(std::move(map_copy));
	}
	bool Equals(const FunctionData &other_p) const override {
		return true;
	}

public:
	// Const struct name vectors live here so they don't have to be re-initialized for every DataChunk
	StructNames const_struct_names;
};

static LogicalType GetJSONType(StructNames &const_struct_names, const LogicalType &type) {
	if (type.IsJSONType()) {
		return type;
	}

	switch (type.id()) {
	// These types can go directly into JSON
	case LogicalTypeId::SQLNULL:
	case LogicalTypeId::BOOLEAN:
	case LogicalTypeId::TINYINT:
	case LogicalTypeId::SMALLINT:
	case LogicalTypeId::INTEGER:
	case LogicalTypeId::BIGINT:
	case LogicalTypeId::HUGEINT:
	case LogicalTypeId::UHUGEINT:
	case LogicalTypeId::UTINYINT:
	case LogicalTypeId::USMALLINT:
	case LogicalTypeId::UINTEGER:
	case LogicalTypeId::UBIGINT:
	case LogicalTypeId::FLOAT:
	case LogicalTypeId::DOUBLE:
	case LogicalTypeId::BIT:
	case LogicalTypeId::BLOB:
	case LogicalTypeId::VARCHAR:
	case LogicalTypeId::AGGREGATE_STATE:
	case LogicalTypeId::ENUM:
	case LogicalTypeId::DATE:
	case LogicalTypeId::INTERVAL:
	case LogicalTypeId::TIME:
	case LogicalTypeId::TIME_TZ:
	case LogicalTypeId::TIMESTAMP:
	case LogicalTypeId::TIMESTAMP_TZ:
	case LogicalTypeId::TIMESTAMP_NS:
	case LogicalTypeId::TIMESTAMP_MS:
	case LogicalTypeId::TIMESTAMP_SEC:
	case LogicalTypeId::UUID:
	case LogicalTypeId::BIGNUM:
	case LogicalTypeId::DECIMAL:
		return type;
	case LogicalTypeId::LIST:
		return LogicalType::LIST(GetJSONType(const_struct_names, ListType::GetChildType(type)));
	case LogicalTypeId::ARRAY:
		return LogicalType::ARRAY(GetJSONType(const_struct_names, ArrayType::GetChildType(type)),
		                          ArrayType::GetSize(type));
	// Struct and MAP are treated as JSON values
	case LogicalTypeId::STRUCT: {
		child_list_t<LogicalType> child_types;
		for (const auto &child_type : StructType::GetChildTypes(type)) {
			const_struct_names[child_type.first] = make_uniq<Vector>(Value(child_type.first));
			child_types.emplace_back(child_type.first, GetJSONType(const_struct_names, child_type.second));
		}
		return LogicalType::STRUCT(child_types);
	}
	case LogicalTypeId::MAP: {
		return LogicalType::MAP(LogicalType::VARCHAR, GetJSONType(const_struct_names, MapType::ValueType(type)));
	}
	case LogicalTypeId::UNION: {
		child_list_t<LogicalType> member_types;
		for (idx_t member_idx = 0; member_idx < UnionType::GetMemberCount(type); member_idx++) {
			auto &member_name = UnionType::GetMemberName(type, member_idx);
			auto &member_type = UnionType::GetMemberType(type, member_idx);

			const_struct_names[member_name] = make_uniq<Vector>(Value(member_name));
			member_types.emplace_back(member_name, GetJSONType(const_struct_names, member_type));
		}
		return LogicalType::UNION(member_types);
	}
	// All other types are cast to VARCHAR
	default:
		return LogicalTypeId::VARCHAR;
	}
}

static unique_ptr<FunctionData> JSONCreateBindParams(ScalarFunction &bound_function,
                                                     vector<unique_ptr<Expression>> &arguments, bool object) {
	unordered_map<string, unique_ptr<Vector>> const_struct_names;
	for (idx_t i = 0; i < arguments.size(); i++) {
		auto &type = arguments[i]->return_type;
		if (arguments[i]->HasParameter()) {
			throw ParameterNotResolvedException();
		} else if (object && i % 2 == 0) {
			if (type != LogicalType::VARCHAR) {
				throw BinderException("json_object() keys must be VARCHAR, add an explicit cast to argument \"%s\"",
				                      arguments[i]->GetName());
			}
			bound_function.arguments.push_back(LogicalType::VARCHAR);
		} else {
			// Value, cast to types that we can put in JSON
			bound_function.arguments.push_back(GetJSONType(const_struct_names, type));
		}
	}
	return make_uniq<JSONCreateFunctionData>(std::move(const_struct_names));
}

static unique_ptr<FunctionData> JSONObjectBind(ClientContext &context, ScalarFunction &bound_function,
                                               vector<unique_ptr<Expression>> &arguments) {
	if (arguments.size() % 2 != 0) {
		throw BinderException("json_object() requires an even number of arguments");
	}
	return JSONCreateBindParams(bound_function, arguments, true);
}
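
// Assumed usage of the bind rule above (illustrative):
//   json_object('a', 1, 'b', 'two')  -> {"a":1,"b":"two"}
//   json_object('a')                 -- binder error: odd number of arguments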

static unique_ptr<FunctionData> JSONArrayBind(ClientContext &context, ScalarFunction &bound_function,
                                              vector<unique_ptr<Expression>> &arguments) {
	return JSONCreateBindParams(bound_function, arguments, false);
}

static unique_ptr<FunctionData> ToJSONBind(ClientContext &context, ScalarFunction &bound_function,
                                           vector<unique_ptr<Expression>> &arguments) {
	if (arguments.size() != 1) {
		throw BinderException("to_json() takes exactly one argument");
	}
	return JSONCreateBindParams(bound_function, arguments, false);
}

static unique_ptr<FunctionData> ArrayToJSONBind(ClientContext &context, ScalarFunction &bound_function,
                                                vector<unique_ptr<Expression>> &arguments) {
	if (arguments.size() != 1) {
		throw BinderException("array_to_json() takes exactly one argument");
	}
	auto arg_id = arguments[0]->return_type.id();
	if (arguments[0]->HasParameter()) {
		throw ParameterNotResolvedException();
	}
	if (arg_id != LogicalTypeId::LIST && arg_id != LogicalTypeId::SQLNULL) {
		throw BinderException("array_to_json() argument type must be LIST");
	}
	return JSONCreateBindParams(bound_function, arguments, false);
}

static unique_ptr<FunctionData> RowToJSONBind(ClientContext &context, ScalarFunction &bound_function,
                                              vector<unique_ptr<Expression>> &arguments) {
	if (arguments.size() != 1) {
		throw BinderException("row_to_json() takes exactly one argument");
	}
	auto arg_id = arguments[0]->return_type.id();
	if (arguments[0]->HasParameter()) {
		throw ParameterNotResolvedException();
	}
	if (arguments[0]->return_type.id() != LogicalTypeId::STRUCT && arg_id != LogicalTypeId::SQLNULL) {
		throw BinderException("row_to_json() argument type must be STRUCT");
	}
	return JSONCreateBindParams(bound_function, arguments, false);
}

template <class INPUT_TYPE, class RESULT_TYPE>
struct CreateJSONValue {
	static inline RESULT_TYPE Operation(const INPUT_TYPE &input) {
		throw NotImplementedException("Unsupported type for CreateJSONValue");
	}
};

template <class INPUT_TYPE>
struct CreateJSONValue<INPUT_TYPE, bool> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const INPUT_TYPE &input) {
		return yyjson_mut_bool(doc, input);
	}
};

template <class INPUT_TYPE>
struct CreateJSONValue<INPUT_TYPE, uint64_t> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const INPUT_TYPE &input) {
		return yyjson_mut_uint(doc, input);
	}
};

template <class INPUT_TYPE>
struct CreateJSONValue<INPUT_TYPE, int64_t> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const INPUT_TYPE &input) {
		return yyjson_mut_sint(doc, input);
	}
};

template <class INPUT_TYPE>
struct CreateJSONValue<INPUT_TYPE, double> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const INPUT_TYPE &input) {
		return yyjson_mut_real(doc, input);
	}
};

template <>
struct CreateJSONValue<string_t, string_t> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const string_t &input) {
		return yyjson_mut_strncpy(doc, input.GetData(), input.GetSize());
	}
};

template <>
struct CreateJSONValue<hugeint_t, string_t> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const hugeint_t &input) {
		const auto input_string = input.ToString();
		return yyjson_mut_rawncpy(doc, input_string.c_str(), input_string.length());
	}
};

template <>
struct CreateJSONValue<uhugeint_t, string_t> {
	static inline yyjson_mut_val *Operation(yyjson_mut_doc *doc, const uhugeint_t &input) {
		const auto input_string = input.ToString();
		return yyjson_mut_rawncpy(doc, input_string.c_str(), input_string.length());
	}
};

template <class T>
inline yyjson_mut_val *CreateJSONValueFromJSON(yyjson_mut_doc *doc, const T &value) {
	return nullptr; // This function should only be called with string_t as template
}

template <>
inline yyjson_mut_val *CreateJSONValueFromJSON(yyjson_mut_doc *doc, const string_t &value) {
	auto value_doc = JSONCommon::ReadDocument(value, JSONCommon::READ_FLAG, &doc->alc);
	auto result = yyjson_val_mut_copy(doc, value_doc->root);
	return result;
}

// Forward declaration so we can recurse for nested types
static void CreateValues(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
                         idx_t count);

static void AddKeyValuePairs(yyjson_mut_doc *doc, yyjson_mut_val *objs[], Vector &key_v, yyjson_mut_val *vals[],
                             idx_t count) {
	UnifiedVectorFormat key_data;
	key_v.ToUnifiedFormat(count, key_data);
	auto keys = UnifiedVectorFormat::GetData<string_t>(key_data);

	for (idx_t i = 0; i < count; i++) {
		auto key_idx = key_data.sel->get_index(i);
		if (!key_data.validity.RowIsValid(key_idx)) {
			continue;
		}
		auto key = CreateJSONValue<string_t, string_t>::Operation(doc, keys[key_idx]);
		yyjson_mut_obj_add(objs[i], key, vals[i]);
	}
}

static void CreateKeyValuePairs(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *objs[],
                                yyjson_mut_val *vals[], Vector &key_v, Vector &value_v, idx_t count) {
	CreateValues(names, doc, vals, value_v, count);
	AddKeyValuePairs(doc, objs, key_v, vals, count);
}

static void CreateValuesNull(yyjson_mut_doc *doc, yyjson_mut_val *vals[], idx_t count) {
	for (idx_t i = 0; i < count; i++) {
		vals[i] = yyjson_mut_null(doc);
	}
}

template <class INPUT_TYPE, class TARGET_TYPE>
static void TemplatedCreateValues(yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v, idx_t count) {
	UnifiedVectorFormat value_data;
	value_v.ToUnifiedFormat(count, value_data);
	auto values = UnifiedVectorFormat::GetData<INPUT_TYPE>(value_data);

	const auto type_is_json = value_v.GetType().IsJSONType();
	for (idx_t i = 0; i < count; i++) {
		idx_t val_idx = value_data.sel->get_index(i);
		if (!value_data.validity.RowIsValid(val_idx)) {
			vals[i] = yyjson_mut_null(doc);
		} else if (type_is_json) {
			vals[i] = CreateJSONValueFromJSON(doc, values[val_idx]);
		} else {
			vals[i] = CreateJSONValue<INPUT_TYPE, TARGET_TYPE>::Operation(doc, values[val_idx]);
		}
		D_ASSERT(vals[i] != nullptr);
	}
}

static void CreateRawValues(yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v, idx_t count) {
	UnifiedVectorFormat value_data;
	value_v.ToUnifiedFormat(count, value_data);
	auto values = UnifiedVectorFormat::GetData<string_t>(value_data);
	for (idx_t i = 0; i < count; i++) {
		idx_t val_idx = value_data.sel->get_index(i);
		if (!value_data.validity.RowIsValid(val_idx)) {
			vals[i] = yyjson_mut_null(doc);
		} else {
			const auto &str = values[val_idx];
			vals[i] = yyjson_mut_rawncpy(doc, str.GetData(), str.GetSize());
		}
		D_ASSERT(vals[i] != nullptr);
	}
}

static void CreateValuesStruct(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
                               idx_t count) {
	// Structs become objects, therefore we initialize vals to JSON objects
	for (idx_t i = 0; i < count; i++) {
		vals[i] = yyjson_mut_obj(doc);
	}
	// Initialize re-usable array for the nested values
	auto nested_vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, count);

	// Add the key/value pairs to the values
	auto &entries = StructVector::GetEntries(value_v);
	for (idx_t entry_i = 0; entry_i < entries.size(); entry_i++) {
		auto &struct_key_v = *names.at(StructType::GetChildName(value_v.GetType(), entry_i));
		auto &struct_val_v = *entries[entry_i];
		CreateKeyValuePairs(names, doc, vals, nested_vals, struct_key_v, struct_val_v, count);
	}
	// Whole struct can be NULL
	UnifiedVectorFormat struct_data;
	value_v.ToUnifiedFormat(count, struct_data);
	for (idx_t i = 0; i < count; i++) {
		idx_t idx = struct_data.sel->get_index(i);
		if (!struct_data.validity.RowIsValid(idx)) {
			vals[i] = yyjson_mut_null(doc);
		}
	}
}

static void CreateValuesMap(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
                            idx_t count) {
	// Create nested keys
	auto &map_key_v = MapVector::GetKeys(value_v);
	auto map_key_count = ListVector::GetListSize(value_v);
	Vector map_keys_string(LogicalType::VARCHAR, map_key_count);
	VectorOperations::DefaultCast(map_key_v, map_keys_string, map_key_count);
	auto nested_keys = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, map_key_count);
	TemplatedCreateValues<string_t, string_t>(doc, nested_keys, map_keys_string, map_key_count);
	// Create nested values
	auto &map_val_v = MapVector::GetValues(value_v);
	auto map_val_count = ListVector::GetListSize(value_v);
	auto nested_vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, map_val_count);
	CreateValues(names, doc, nested_vals, map_val_v, map_val_count);
	// Add the key/value pairs to the values
	UnifiedVectorFormat map_data;
	value_v.ToUnifiedFormat(count, map_data);
	auto map_key_list_entries = UnifiedVectorFormat::GetData<list_entry_t>(map_data);
	for (idx_t i = 0; i < count; i++) {
		idx_t idx = map_data.sel->get_index(i);
		if (!map_data.validity.RowIsValid(idx)) {
			// Whole map can be NULL
			vals[i] = yyjson_mut_null(doc);
		} else {
			vals[i] = yyjson_mut_obj(doc);
			const auto &key_list_entry = map_key_list_entries[idx];
			for (idx_t child_i = key_list_entry.offset; child_i < key_list_entry.offset + key_list_entry.length;
			     child_i++) {
				if (!unsafe_yyjson_is_null(nested_keys[child_i])) {
					yyjson_mut_obj_add(vals[i], nested_keys[child_i], nested_vals[child_i]);
				}
			}
		}
	}
}

static void CreateValuesUnion(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
                              idx_t count) {
	// Unions become objects, therefore we initialize vals to JSON objects
	UnifiedVectorFormat value_data;
	value_v.ToUnifiedFormat(count, value_data);
	if (value_data.validity.AllValid()) {
		for (idx_t i = 0; i < count; i++) {
			vals[i] = yyjson_mut_obj(doc);
		}
	} else {
		for (idx_t i = 0; i < count; i++) {
			auto index = value_data.sel->get_index(i);
			if (!value_data.validity.RowIsValid(index)) {
				// Make the entry NULL if the Union value is NULL
				vals[i] = yyjson_mut_null(doc);
			} else {
				vals[i] = yyjson_mut_obj(doc);
			}
		}
	}

	// Initialize re-usable array for the nested values
	auto nested_vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, count);

	auto &tag_v = UnionVector::GetTags(value_v);
	UnifiedVectorFormat tag_data;
	tag_v.ToUnifiedFormat(count, tag_data);

	// Add the key/value pairs to the values
	for (idx_t member_idx = 0; member_idx < UnionType::GetMemberCount(value_v.GetType()); member_idx++) {
		auto &member_val_v = UnionVector::GetMember(value_v, member_idx);
		auto &member_key_v = *names.at(UnionType::GetMemberName(value_v.GetType(), member_idx));

		// This implementation is not optimal since we convert the entire member vector,
		// and then skip the rows not matching the tag afterwards.

		CreateValues(names, doc, nested_vals, member_val_v, count);

		// This is an inlined copy of AddKeyValuePairs, but we also skip null tags
		// and the rows where the member does not match the tag
		UnifiedVectorFormat key_data;
		member_key_v.ToUnifiedFormat(count, key_data);
		auto keys = UnifiedVectorFormat::GetData<string_t>(key_data);

		for (idx_t i = 0; i < count; i++) {
			auto value_index = value_data.sel->get_index(i);
			if (!value_data.validity.RowIsValid(value_index)) {
				// This entry is just NULL in its entirety
				continue;
			}
			auto tag_idx = tag_data.sel->get_index(i);
			if (!tag_data.validity.RowIsValid(tag_idx)) {
				continue;
			}
			auto tag = (UnifiedVectorFormat::GetData<uint8_t>(tag_data))[tag_idx];
			if (tag != member_idx) {
				continue;
			}
			auto key_idx = key_data.sel->get_index(i);
			if (!key_data.validity.RowIsValid(key_idx)) {
				continue;
			}
			auto key = CreateJSONValue<string_t, string_t>::Operation(doc, keys[key_idx]);
			yyjson_mut_obj_add(vals[i], key, nested_vals[i]);
		}
	}
}

static void CreateValuesList(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
                             idx_t count) {
	// Initialize array for the nested values
	auto &child_v = ListVector::GetEntry(value_v);
	auto child_count = ListVector::GetListSize(value_v);
	auto nested_vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, child_count);
	// Fill nested_vals with list values
	CreateValues(names, doc, nested_vals, child_v, child_count);
	// Now we add the values to the appropriate JSON arrays
	UnifiedVectorFormat list_data;
	value_v.ToUnifiedFormat(count, list_data);
	auto list_entries = UnifiedVectorFormat::GetData<list_entry_t>(list_data);
	for (idx_t i = 0; i < count; i++) {
		idx_t idx = list_data.sel->get_index(i);
		if (!list_data.validity.RowIsValid(idx)) {
			vals[i] = yyjson_mut_null(doc);
		} else {
			vals[i] = yyjson_mut_arr(doc);
			const auto &entry = list_entries[idx];
			for (idx_t child_i = entry.offset; child_i < entry.offset + entry.length; child_i++) {
				yyjson_mut_arr_append(vals[i], nested_vals[child_i]);
			}
		}
	}
}

static void CreateValuesArray(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
                              idx_t count) {

	value_v.Flatten(count);

	// Initialize array for the nested values
	auto &child_v = ArrayVector::GetEntry(value_v);
	auto array_size = ArrayType::GetSize(value_v.GetType());
	auto child_count = count * array_size;

	auto nested_vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, child_count);
	// Fill nested_vals with list values
	CreateValues(names, doc, nested_vals, child_v, child_count);
	// Now we add the values to the appropriate JSON arrays
	UnifiedVectorFormat list_data;
	value_v.ToUnifiedFormat(count, list_data);
	for (idx_t i = 0; i < count; i++) {
		idx_t idx = list_data.sel->get_index(i);
		if (!list_data.validity.RowIsValid(idx)) {
			vals[i] = yyjson_mut_null(doc);
		} else {
			vals[i] = yyjson_mut_arr(doc);
			auto offset = idx * array_size;
			for (idx_t child_i = offset; child_i < offset + array_size; child_i++) {
				yyjson_mut_arr_append(vals[i], nested_vals[child_i]);
			}
		}
	}
}

static void CreateValues(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
                         idx_t count) {
	const auto &type = value_v.GetType();
	switch (type.id()) {
	case LogicalTypeId::SQLNULL:
		CreateValuesNull(doc, vals, count);
		break;
	case LogicalTypeId::BOOLEAN:
		TemplatedCreateValues<bool, bool>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::TINYINT:
		TemplatedCreateValues<int8_t, int64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::SMALLINT:
		TemplatedCreateValues<int16_t, int64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::INTEGER:
		TemplatedCreateValues<int32_t, int64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::BIGINT:
		TemplatedCreateValues<int64_t, int64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::HUGEINT:
		TemplatedCreateValues<hugeint_t, string_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::UHUGEINT:
		TemplatedCreateValues<uhugeint_t, string_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::UTINYINT:
		TemplatedCreateValues<uint8_t, uint64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::USMALLINT:
		TemplatedCreateValues<uint16_t, uint64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::UINTEGER:
		TemplatedCreateValues<uint32_t, uint64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::UBIGINT:
		TemplatedCreateValues<uint64_t, uint64_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::FLOAT:
		TemplatedCreateValues<float, double>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::DOUBLE:
		TemplatedCreateValues<double, double>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::VARCHAR:
		TemplatedCreateValues<string_t, string_t>(doc, vals, value_v, count);
		break;
	case LogicalTypeId::STRUCT:
		CreateValuesStruct(names, doc, vals, value_v, count);
		break;
	case LogicalTypeId::MAP:
		CreateValuesMap(names, doc, vals, value_v, count);
		break;
	case LogicalTypeId::LIST:
		CreateValuesList(names, doc, vals, value_v, count);
		break;
	case LogicalTypeId::UNION:
		CreateValuesUnion(names, doc, vals, value_v, count);
		break;
	case LogicalTypeId::ARRAY:
		CreateValuesArray(names, doc, vals, value_v, count);
		break;
	case LogicalTypeId::BIT:
	case LogicalTypeId::BLOB:
	case LogicalTypeId::AGGREGATE_STATE:
	case LogicalTypeId::ENUM:
	case LogicalTypeId::DATE:
	case LogicalTypeId::INTERVAL:
	case LogicalTypeId::TIME:
	case LogicalTypeId::TIME_NS:
	case LogicalTypeId::TIME_TZ:
	case LogicalTypeId::TIMESTAMP:
	case LogicalTypeId::TIMESTAMP_TZ:
	case LogicalTypeId::TIMESTAMP_NS:
	case LogicalTypeId::TIMESTAMP_MS:
	case LogicalTypeId::TIMESTAMP_SEC:
	case LogicalTypeId::UUID: {
		Vector string_vector(LogicalTypeId::VARCHAR, count);
		VectorOperations::DefaultCast(value_v, string_vector, count);
		TemplatedCreateValues<string_t, string_t>(doc, vals, string_vector, count);
		break;
	}
	case LogicalTypeId::BIGNUM: {
		Vector string_vector(LogicalTypeId::VARCHAR, count);
		VectorOperations::DefaultCast(value_v, string_vector, count);
		CreateRawValues(doc, vals, string_vector, count);
		break;
	}
	case LogicalTypeId::DECIMAL: {
		if (DecimalType::GetWidth(type) > 15) {
			Vector string_vector(LogicalTypeId::VARCHAR, count);
			VectorOperations::DefaultCast(value_v, string_vector, count);
			CreateRawValues(doc, vals, string_vector, count);
		} else {
			Vector double_vector(LogicalType::DOUBLE, count);
			VectorOperations::DefaultCast(value_v, double_vector, count);
			TemplatedCreateValues<double, double>(doc, vals, double_vector, count);
		}
		break;
	}
	case LogicalTypeId::INVALID:
	case LogicalTypeId::UNKNOWN:
	case LogicalTypeId::ANY:
	case LogicalTypeId::USER:
	case LogicalTypeId::TEMPLATE:
	case LogicalTypeId::VARIANT:
	case LogicalTypeId::CHAR:
	case LogicalTypeId::STRING_LITERAL:
	case LogicalTypeId::INTEGER_LITERAL:
	case LogicalTypeId::POINTER:
	case LogicalTypeId::VALIDITY:
	case LogicalTypeId::TABLE:
	case LogicalTypeId::LAMBDA:
	case LogicalTypeId::GEOMETRY: // TODO! Add support for GEOMETRY
		throw InternalException("Unsupported type arrived at JSON create function");
	}
}
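
// Note (illustrative reasoning): DECIMALs wider than 15 digits are written as
// raw (unquoted) number strings above because a double's roughly 15 to 17
// significant decimal digits cannot losslessly represent them; narrower
// decimals take the cheaper double path.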

static void ObjectFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto &func_expr = state.expr.Cast<BoundFunctionExpression>();
	const auto &info = func_expr.bind_info->Cast<JSONCreateFunctionData>();
	auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = lstate.json_allocator->GetYYAlc();

	// Initialize values
	const idx_t count = args.size();
	auto doc = JSONCommon::CreateDocument(alc);
	auto objs = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, count);
	for (idx_t i = 0; i < count; i++) {
		objs[i] = yyjson_mut_obj(doc);
	}
	// Initialize a re-usable value array
	auto vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, count);
	// Loop through key/value pairs
	for (idx_t pair_idx = 0; pair_idx < args.data.size() / 2; pair_idx++) {
		Vector &key_v = args.data[pair_idx * 2];
		Vector &value_v = args.data[pair_idx * 2 + 1];
		CreateKeyValuePairs(info.const_struct_names, doc, objs, vals, key_v, value_v, count);
	}
	// Write JSON values to string
	auto objects = FlatVector::GetData<string_t>(result);
	for (idx_t i = 0; i < count; i++) {
		objects[i] = JSONCommon::WriteVal<yyjson_mut_val>(objs[i], alc);
	}
	if (args.AllConstant()) {
		result.SetVectorType(VectorType::CONSTANT_VECTOR);
	}

	JSONAllocator::AddBuffer(result, alc);
}

static void ArrayFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto &func_expr = state.expr.Cast<BoundFunctionExpression>();
	const auto &info = func_expr.bind_info->Cast<JSONCreateFunctionData>();
	auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = lstate.json_allocator->GetYYAlc();

	// Initialize arrays
	const idx_t count = args.size();
	auto doc = JSONCommon::CreateDocument(alc);
	auto arrs = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, count);
	for (idx_t i = 0; i < count; i++) {
		arrs[i] = yyjson_mut_arr(doc);
	}
	// Initialize a re-usable value array
	auto vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, count);
	// Loop through args
	for (auto &v : args.data) {
		CreateValues(info.const_struct_names, doc, vals, v, count);
		for (idx_t i = 0; i < count; i++) {
			yyjson_mut_arr_append(arrs[i], vals[i]);
		}
	}
	// Write JSON arrays to string
	auto objects = FlatVector::GetData<string_t>(result);
	for (idx_t i = 0; i < count; i++) {
		objects[i] = JSONCommon::WriteVal<yyjson_mut_val>(arrs[i], alc);
	}
	if (args.AllConstant()) {
		result.SetVectorType(VectorType::CONSTANT_VECTOR);
	}

	JSONAllocator::AddBuffer(result, alc);
}

static void ToJSONFunctionInternal(const StructNames &names, Vector &input, const idx_t count, Vector &result,
                                   yyjson_alc *alc) {
	// Initialize array for values
	auto doc = JSONCommon::CreateDocument(alc);
	auto vals = JSONCommon::AllocateArray<yyjson_mut_val *>(doc, count);
	CreateValues(names, doc, vals, input, count);

	// Write JSON values to string
	auto objects = FlatVector::GetData<string_t>(result);
	auto &result_validity = FlatVector::Validity(result);
	UnifiedVectorFormat input_data;
	input.ToUnifiedFormat(count, input_data);
	for (idx_t i = 0; i < count; i++) {
		idx_t idx = input_data.sel->get_index(i);
		if (input_data.validity.RowIsValid(idx)) {
			objects[i] = JSONCommon::WriteVal<yyjson_mut_val>(vals[i], alc);
		} else {
			result_validity.SetInvalid(i);
		}
	}

	if (input.GetVectorType() == VectorType::CONSTANT_VECTOR || count == 1) {
		result.SetVectorType(VectorType::CONSTANT_VECTOR);
	}

	JSONAllocator::AddBuffer(result, alc);
}

static void ToJSONFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto &func_expr = state.expr.Cast<BoundFunctionExpression>();
	const auto &info = func_expr.bind_info->Cast<JSONCreateFunctionData>();
	auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = lstate.json_allocator->GetYYAlc();

	ToJSONFunctionInternal(info.const_struct_names, args.data[0], args.size(), result, alc);
}

ScalarFunctionSet JSONFunctions::GetObjectFunction() {
	ScalarFunction fun("json_object", {}, LogicalType::JSON(), ObjectFunction, JSONObjectBind, nullptr, nullptr,
	                   JSONFunctionLocalState::Init);
	fun.varargs = LogicalType::ANY;
	fun.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
	return ScalarFunctionSet(fun);
}

ScalarFunctionSet JSONFunctions::GetArrayFunction() {
	ScalarFunction fun("json_array", {}, LogicalType::JSON(), ArrayFunction, JSONArrayBind, nullptr, nullptr,
	                   JSONFunctionLocalState::Init);
	fun.varargs = LogicalType::ANY;
	fun.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
	return ScalarFunctionSet(fun);
}

ScalarFunctionSet JSONFunctions::GetToJSONFunction() {
	ScalarFunction fun("to_json", {}, LogicalType::JSON(), ToJSONFunction, ToJSONBind, nullptr, nullptr,
	                   JSONFunctionLocalState::Init);
	fun.varargs = LogicalType::ANY;
	return ScalarFunctionSet(fun);
}

ScalarFunctionSet JSONFunctions::GetArrayToJSONFunction() {
	ScalarFunction fun("array_to_json", {}, LogicalType::JSON(), ToJSONFunction, ArrayToJSONBind, nullptr, nullptr,
	                   JSONFunctionLocalState::Init);
	fun.varargs = LogicalType::ANY;
	return ScalarFunctionSet(fun);
}

ScalarFunctionSet JSONFunctions::GetRowToJSONFunction() {
	ScalarFunction fun("row_to_json", {}, LogicalType::JSON(), ToJSONFunction, RowToJSONBind, nullptr, nullptr,
	                   JSONFunctionLocalState::Init);
	fun.varargs = LogicalType::ANY;
	return ScalarFunctionSet(fun);
}

struct NestedToJSONCastData : public BoundCastData {
public:
	NestedToJSONCastData() {
	}

	unique_ptr<BoundCastData> Copy() const override {
		auto result = make_uniq<NestedToJSONCastData>();
		for (auto &csn : const_struct_names) {
			result->const_struct_names.emplace(csn.first, make_uniq<Vector>(csn.second->GetValue(0)));
		}
		return std::move(result);
	}

public:
	StructNames const_struct_names;
};

static bool AnyToJSONCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
	auto &lstate = parameters.local_state->Cast<JSONFunctionLocalState>();
	lstate.json_allocator->Reset();
	auto alc = lstate.json_allocator->GetYYAlc();
	const auto &names = parameters.cast_data->Cast<NestedToJSONCastData>().const_struct_names;

	ToJSONFunctionInternal(names, source, count, result, alc);
	return true;
}

BoundCastInfo AnyToJSONCastBind(BindCastInput &input, const LogicalType &source, const LogicalType &target) {
	auto cast_data = make_uniq<NestedToJSONCastData>();
	GetJSONType(cast_data->const_struct_names, source);
	return BoundCastInfo(AnyToJSONCast, std::move(cast_data), JSONFunctionLocalState::InitCastLocalState);
}

void JSONFunctions::RegisterJSONCreateCastFunctions(ExtensionLoader &loader) {
	// Anything can be cast to JSON
	for (const auto &type : LogicalType::AllTypes()) {
		LogicalType source_type;
		switch (type.id()) {
		case LogicalTypeId::STRUCT:
			source_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
			break;
		case LogicalTypeId::LIST:
			source_type = LogicalType::LIST(LogicalType::ANY);
			break;
		case LogicalTypeId::MAP:
			source_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
			break;
		case LogicalTypeId::UNION:
			source_type = LogicalType::UNION({{"any", LogicalType::ANY}});
			break;
		case LogicalTypeId::ARRAY:
			source_type = LogicalType::ARRAY(LogicalType::ANY, optional_idx())
|
||||
break;
|
||||
case LogicalTypeId::VARCHAR:
|
||||
// We skip this one here as it's handled in json_functions.cpp
|
||||
continue;
|
||||
default:
|
||||
source_type = type;
|
||||
}
|
||||
// We prefer going to JSON over going to VARCHAR if a function can do either
|
||||
const auto source_to_json_cost = MaxValue<int64_t>(
|
||||
CastFunctionSet::ImplicitCastCost(loader.GetDatabaseInstance(), source_type, LogicalType::VARCHAR) - 1, 0);
|
||||
loader.RegisterCastFunction(source_type, LogicalType::JSON(), AnyToJSONCastBind, source_to_json_cost);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace duckdb
|
||||
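The creation functions above surface in SQL as json_object, json_array, to_json (plus its array_to_json and row_to_json aliases), and the implicit any-to-JSON cast registered last. A minimal sketch of exercising them through DuckDB's embedded C++ API (duckdb.hpp with DuckDB, Connection, and Query is the stable public surface; the values in the comments follow the documented semantics of these functions, not output captured from this build):

#include "duckdb.hpp"

#include <iostream>

int main() {
	duckdb::DuckDB db(nullptr); // in-memory database
	duckdb::Connection con(db);

	// ObjectFunction and ArrayFunction take variadic arguments
	std::cout << con.Query("SELECT json_object('duck', 42)")->ToString();   // value: {"duck":42}
	std::cout << con.Query("SELECT json_array(1, 2, 'three')")->ToString(); // value: [1,2,"three"]

	// ToJSONFunction, also reachable via the cast from RegisterJSONCreateCastFunctions
	std::cout << con.Query("SELECT to_json({'duck': 42})")->ToString();     // value: {"duck":42}
	std::cout << con.Query("SELECT {'duck': 42}::JSON")->ToString();        // same value via the cast
	return 0;
}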
32
external/duckdb/extension/json/json_functions/json_exists.cpp
vendored
Normal file
@@ -0,0 +1,32 @@
#include "json_executors.hpp"

namespace duckdb {

static inline bool JSONExists(yyjson_val *val, yyjson_alc *, Vector &, ValidityMask &, idx_t) {
	return val;
}

static void BinaryExistsFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::BinaryExecute<bool, false>(args, state, result, JSONExists);
}

static void ManyExistsFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::ExecuteMany<bool, false>(args, state, result, JSONExists);
}

static void GetExistsFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::BOOLEAN, BinaryExistsFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
	                               LogicalType::LIST(LogicalType::BOOLEAN), ManyExistsFunction,
	                               JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetExistsFunction() {
	ScalarFunctionSet set("json_exists");
	GetExistsFunctionsInternal(set, LogicalType::VARCHAR);
	GetExistsFunctionsInternal(set, LogicalType::JSON());
	return set;
}

} // namespace duckdb
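The scalar and LIST overloads registered here behave as follows; a short sketch under the same API assumptions as the earlier example:

#include "duckdb.hpp"

#include <iostream>

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// BinaryExistsFunction: one path per row; values: true, then false
	std::cout << con.Query("SELECT json_exists('{\"a\": {\"b\": 1}}', '$.a.b')")->ToString();
	std::cout << con.Query("SELECT json_exists('{\"a\": {\"b\": 1}}', '$.a.c')")->ToString();
	// ManyExistsFunction: a LIST of paths; value: [true, false]
	std::cout << con.Query("SELECT json_exists('{\"a\": 1}', ['$.a', '$.b'])")->ToString();
	return 0;
}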
74
external/duckdb/extension/json/json_functions/json_extract.cpp
vendored
Normal file
@@ -0,0 +1,74 @@
#include "json_executors.hpp"

namespace duckdb {

static inline string_t ExtractFromVal(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &, idx_t) {
	return JSONCommon::WriteVal<yyjson_val>(val, alc);
}

static inline string_t ExtractStringFromVal(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &mask, idx_t idx) {
	switch (yyjson_get_tag(val)) {
	case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
		mask.SetInvalid(idx);
		return string_t {};
	case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NOESC:
	case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
		return string_t(unsafe_yyjson_get_str(val), unsafe_yyjson_get_len(val));
	default:
		return JSONCommon::WriteVal<yyjson_val>(val, alc);
	}
}

static void ExtractFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::BinaryExecute<string_t>(args, state, result, ExtractFromVal);
}

static void ExtractManyFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::ExecuteMany<string_t>(args, state, result, ExtractFromVal);
}

static void ExtractStringFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::BinaryExecute<string_t>(args, state, result, ExtractStringFromVal);
}

static void ExtractStringManyFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::ExecuteMany<string_t>(args, state, result, ExtractStringFromVal);
}

static void GetExtractFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	set.AddFunction(ScalarFunction({input_type, LogicalType::BIGINT}, LogicalType::JSON(), ExtractFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::JSON(), ExtractFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
	                               LogicalType::LIST(LogicalType::JSON()), ExtractManyFunction,
	                               JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetExtractFunction() {
	// Generic extract function
	ScalarFunctionSet set("json_extract");
	GetExtractFunctionsInternal(set, LogicalType::VARCHAR);
	GetExtractFunctionsInternal(set, LogicalType::JSON());
	return set;
}

static void GetExtractStringFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	set.AddFunction(ScalarFunction({input_type, LogicalType::BIGINT}, LogicalType::VARCHAR, ExtractStringFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::VARCHAR, ExtractStringFunction,
	                               JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
	                               LogicalType::LIST(LogicalType::VARCHAR), ExtractStringManyFunction,
	                               JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetExtractStringFunction() {
	// String extract function
	ScalarFunctionSet set("json_extract_string");
	GetExtractStringFunctionsInternal(set, LogicalType::VARCHAR);
	GetExtractStringFunctionsInternal(set, LogicalType::JSON());
	return set;
}

} // namespace duckdb
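ExtractFromVal re-serializes whatever the path points at as JSON text, while ExtractStringFromVal unwraps strings and maps JSON null to SQL NULL; the BIGINT overload indexes arrays. A sketch under the same API assumptions:

#include "duckdb.hpp"

#include <iostream>

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// json_extract keeps JSON text; value: "duck" (quoted, type JSON)
	std::cout << con.Query("SELECT json_extract('{\"a\": \"duck\"}', '$.a')")->ToString();
	// json_extract_string unwraps; value: duck (type VARCHAR)
	std::cout << con.Query("SELECT json_extract_string('{\"a\": \"duck\"}', '$.a')")->ToString();
	// BIGINT overload, 0-based array index; value: 2
	std::cout << con.Query("SELECT json_extract('[1, 2, 3]', 1)")->ToString();
	return 0;
}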
59
external/duckdb/extension/json/json_functions/json_keys.cpp
vendored
Normal file
@@ -0,0 +1,59 @@
#include "json_executors.hpp"

namespace duckdb {

static inline list_entry_t GetJSONKeys(yyjson_val *val, yyjson_alc *, Vector &result, ValidityMask &, idx_t) {
	auto num_keys = yyjson_obj_size(val);
	auto current_size = ListVector::GetListSize(result);
	auto new_size = current_size + num_keys;

	// Grow list if needed
	if (ListVector::GetListCapacity(result) < new_size) {
		ListVector::Reserve(result, new_size);
	}

	// Write the strings to the child vector
	auto keys = FlatVector::GetData<string_t>(ListVector::GetEntry(result));
	size_t idx, max;
	yyjson_val *key, *child_val;
	yyjson_obj_foreach(val, idx, max, key, child_val) {
		keys[current_size + idx] = string_t(unsafe_yyjson_get_str(key), unsafe_yyjson_get_len(key));
	}

	// Update size
	ListVector::SetListSize(result, current_size + num_keys);

	return {current_size, num_keys};
}

static void UnaryJSONKeysFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::UnaryExecute<list_entry_t>(args, state, result, GetJSONKeys);
}

static void BinaryJSONKeysFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::BinaryExecute<list_entry_t>(args, state, result, GetJSONKeys);
}

static void ManyJSONKeysFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	JSONExecutors::ExecuteMany<list_entry_t>(args, state, result, GetJSONKeys);
}

static void GetJSONKeysFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
	set.AddFunction(ScalarFunction({input_type}, LogicalType::LIST(LogicalType::VARCHAR), UnaryJSONKeysFunction,
	                               nullptr, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::LIST(LogicalType::VARCHAR),
	                               BinaryJSONKeysFunction, JSONReadFunctionData::Bind, nullptr, nullptr,
	                               JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
	                               LogicalType::LIST(LogicalType::LIST(LogicalType::VARCHAR)), ManyJSONKeysFunction,
	                               JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetKeysFunction() {
	ScalarFunctionSet set("json_keys");
	GetJSONKeysFunctionsInternal(set, LogicalType::VARCHAR);
	GetJSONKeysFunctionsInternal(set, LogicalType::JSON());
	return set;
}

} // namespace duckdb
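GetJSONKeys appends each object's keys to the shared list child vector and returns a list_entry_t window over them, which surfaces in SQL as a LIST(VARCHAR). A sketch, same assumptions:

#include "duckdb.hpp"

#include <iostream>

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// UnaryJSONKeysFunction; value: [a, b]
	std::cout << con.Query("SELECT json_keys('{\"a\": 1, \"b\": 2}')")->ToString();
	// BinaryJSONKeysFunction with a path; value: [c]
	std::cout << con.Query("SELECT json_keys('{\"a\": {\"c\": 3}}', '$.a')")->ToString();
	return 0;
}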
92
external/duckdb/extension/json/json_functions/json_merge_patch.cpp
vendored
Normal file
@@ -0,0 +1,92 @@
#include "json_common.hpp"
#include "json_functions.hpp"

namespace duckdb {

static inline yyjson_mut_val *MergePatch(yyjson_mut_doc *doc, yyjson_mut_val *orig, yyjson_mut_val *patch) {
	if ((yyjson_mut_get_tag(orig) != (YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE)) ||
	    (yyjson_mut_get_tag(patch) != (YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE))) {
		// If either is not an object, we just return the second argument
		return patch;
	}

	// Both are objects, do the merge
	return yyjson_mut_merge_patch(doc, orig, patch);
}

static inline void ReadObjects(yyjson_mut_doc *doc, Vector &input, yyjson_mut_val *objs[], const idx_t count) {
	UnifiedVectorFormat input_data;
	auto &input_vector = input;
	input_vector.ToUnifiedFormat(count, input_data);
	auto inputs = UnifiedVectorFormat::GetData<string_t>(input_data);

	// Read the documents
	for (idx_t i = 0; i < count; i++) {
		auto idx = input_data.sel->get_index(i);
		if (!input_data.validity.RowIsValid(idx)) {
			objs[i] = nullptr;
		} else {
			objs[i] =
			    yyjson_val_mut_copy(doc, JSONCommon::ReadDocument(inputs[idx], JSONCommon::READ_FLAG, &doc->alc)->root);
		}
	}
}

//! Follows MySQL behaviour
static void MergePatchFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = lstate.json_allocator->GetYYAlc();

	auto doc = JSONCommon::CreateDocument(alc);
	const auto count = args.size();

	// Read the first json arg
	auto origs = JSONCommon::AllocateArray<yyjson_mut_val *>(alc, count);
	ReadObjects(doc, args.data[0], origs, count);

	// Read the next json args one by one and merge them into the first json arg
	auto patches = JSONCommon::AllocateArray<yyjson_mut_val *>(alc, count);
	for (idx_t arg_idx = 1; arg_idx < args.data.size(); arg_idx++) {
		ReadObjects(doc, args.data[arg_idx], patches, count);
		for (idx_t i = 0; i < count; i++) {
			if (patches[i] == nullptr) {
				// Next json arg is NULL, obj becomes NULL
				origs[i] = nullptr;
			} else if (origs[i] == nullptr) {
				// Current obj is NULL, obj becomes next json arg
				origs[i] = patches[i];
			} else {
				// Neither is NULL, merge them
				origs[i] = MergePatch(doc, origs[i], patches[i]);
			}
		}
	}

	// Write to result vector
	auto result_data = FlatVector::GetData<string_t>(result);
	auto &result_validity = FlatVector::Validity(result);
	for (idx_t i = 0; i < count; i++) {
		if (origs[i] == nullptr) {
			result_validity.SetInvalid(i);
		} else {
			result_data[i] = JSONCommon::WriteVal<yyjson_mut_val>(origs[i], alc);
		}
	}

	if (args.AllConstant()) {
		result.SetVectorType(VectorType::CONSTANT_VECTOR);
	}

	JSONAllocator::AddBuffer(result, alc);
}

ScalarFunctionSet JSONFunctions::GetMergePatchFunction() {
	ScalarFunction fun("json_merge_patch", {LogicalType::JSON(), LogicalType::JSON()}, LogicalType::JSON(),
	                   MergePatchFunction, nullptr, nullptr, nullptr, JSONFunctionLocalState::Init);
	fun.varargs = LogicalType::JSON();
	fun.null_handling = FunctionNullHandling::SPECIAL_HANDLING;

	return ScalarFunctionSet(fun);
}

} // namespace duckdb
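MergePatch hands the both-objects case to yyjson's RFC 7386 merge patch and otherwise returns the patch, so a non-object patch replaces the original wholesale, and a SQL NULL argument nullifies the accumulated result. A sketch, same assumptions (key order in the merged value is up to yyjson):

#include "duckdb.hpp"

#include <iostream>

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// Overlapping keys take the patch's value; value: {"a":3,"b":2,"c":4}
	std::cout << con.Query("SELECT json_merge_patch('{\"a\": 1, \"b\": 2}', '{\"a\": 3, \"c\": 4}')")->ToString();
	// Non-object patch replaces the original; value: [1,2]
	std::cout << con.Query("SELECT json_merge_patch('{\"a\": 1}', '[1, 2]')")->ToString();
	return 0;
}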
32
external/duckdb/extension/json/json_functions/json_pretty.cpp
vendored
Normal file
@@ -0,0 +1,32 @@
#include "json_executors.hpp"

namespace duckdb {

//! Pretty Print a given JSON Document
string_t PrettyPrint(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &, idx_t) {
	D_ASSERT(alc);
	size_t len_size_t;
	auto data = yyjson_val_write_opts(val, JSONCommon::WRITE_PRETTY_FLAG, alc, &len_size_t, nullptr);
	idx_t len = len_size_t;
	return string_t(data, len);
}

static void PrettyPrintFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto json_type = args.data[0].GetType();
	D_ASSERT(json_type == LogicalType::VARCHAR || json_type == LogicalType::JSON());

	JSONExecutors::UnaryExecute<string_t>(args, state, result, PrettyPrint);
}

static void GetPrettyPrintFunctionInternal(ScalarFunctionSet &set, const LogicalType &json) {
	set.AddFunction(ScalarFunction("json_pretty", {json}, LogicalType::VARCHAR, PrettyPrintFunction, nullptr, nullptr,
	                               nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetPrettyPrintFunction() {
	ScalarFunctionSet set("json_pretty");
	GetPrettyPrintFunctionInternal(set, LogicalType::JSON());
	return set;
}

} // namespace duckdb
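A sketch, same assumptions (the exact whitespace is whatever yyjson's pretty writer emits for WRITE_PRETTY_FLAG):

#include "duckdb.hpp"

#include <iostream>

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// Re-serializes with indentation, roughly:
	// {
	//     "a": [
	//         1,
	//         2
	//     ]
	// }
	std::cout << con.Query("SELECT json_pretty('{\"a\": [1, 2]}')")->ToString();
	return 0;
}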
223
external/duckdb/extension/json/json_functions/json_serialize_plan.cpp
vendored
Normal file
@@ -0,0 +1,223 @@
#include "duckdb/execution/column_binding_resolver.hpp"
#include "duckdb/execution/expression_executor.hpp"
#include "duckdb/main/connection.hpp"
#include "duckdb/main/database.hpp"
#include "duckdb/optimizer/optimizer.hpp"
#include "duckdb/parser/parsed_data/create_pragma_function_info.hpp"
#include "duckdb/parser/parser.hpp"
#include "duckdb/planner/planner.hpp"
#include "json_common.hpp"
#include "json_deserializer.hpp"
#include "json_functions.hpp"
#include "json_serializer.hpp"

namespace duckdb {

//-----------------------------------------------------------------------------
// json_serialize_plan
//-----------------------------------------------------------------------------
struct JsonSerializePlanBindData : public FunctionData {
	bool skip_if_null = false;
	bool skip_if_empty = false;
	bool skip_if_default = false;
	bool format = false;
	bool optimize = false;

	JsonSerializePlanBindData(bool skip_if_null_p, bool skip_if_empty_p, bool skip_if_default_p, bool format_p,
	                          bool optimize_p)
	    : skip_if_null(skip_if_null_p), skip_if_empty(skip_if_empty_p), skip_if_default(skip_if_default_p),
	      format(format_p), optimize(optimize_p) {
	}

public:
	unique_ptr<FunctionData> Copy() const override {
		return make_uniq<JsonSerializePlanBindData>(skip_if_null, skip_if_empty, skip_if_default, format, optimize);
	}
	bool Equals(const FunctionData &other_p) const override {
		return true;
	}
};

static unique_ptr<FunctionData> JsonSerializePlanBind(ClientContext &context, ScalarFunction &bound_function,
                                                      vector<unique_ptr<Expression>> &arguments) {
	if (arguments.empty()) {
		throw BinderException("json_serialize_plan takes at least one argument");
	}

	if (arguments[0]->return_type != LogicalType::VARCHAR) {
		throw InvalidTypeException("json_serialize_plan first argument must be a VARCHAR");
	}

	// Optional arguments
	bool skip_if_null = false;
	bool skip_if_empty = false;
	bool skip_if_default = false;
	bool format = false;
	bool optimize = false;

	for (idx_t i = 1; i < arguments.size(); i++) {
		auto &arg = arguments[i];
		if (arg->HasParameter()) {
			throw ParameterNotResolvedException();
		}
		if (!arg->IsFoldable()) {
			throw BinderException("json_serialize_plan: arguments must be constant");
		}
		auto &alias = arg->GetAlias();
		if (alias == "skip_null") {
			if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
				throw BinderException("json_serialize_plan: 'skip_null' argument must be a boolean");
			}
			skip_if_null = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
		} else if (alias == "skip_empty") {
			if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
				throw BinderException("json_serialize_plan: 'skip_empty' argument must be a boolean");
			}
			skip_if_empty = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
		} else if (alias == "skip_default") {
			if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
				throw BinderException("json_serialize_plan: 'skip_default' argument must be a boolean");
			}
			skip_if_default = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
		} else if (alias == "format") {
			if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
				throw BinderException("json_serialize_plan: 'format' argument must be a boolean");
			}
			format = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
		} else if (alias == "optimize") {
			if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
				throw BinderException("json_serialize_plan: 'optimize' argument must be a boolean");
			}
			optimize = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
		} else {
			throw BinderException(StringUtil::Format("json_serialize_plan: Unknown argument '%s'", alias));
		}
	}
	return make_uniq<JsonSerializePlanBindData>(skip_if_null, skip_if_empty, skip_if_default, format, optimize);
}

static bool OperatorSupportsSerialization(LogicalOperator &op, string &operator_name) {
	for (auto &child : op.children) {
		if (!OperatorSupportsSerialization(*child, operator_name)) {
			return false;
		}
	}
	auto supported = op.SupportSerialization();
	if (!supported) {
		operator_name = EnumUtil::ToString(op.type);
	}
	return supported;
}

static void JsonSerializePlanFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto &local_state = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = local_state.json_allocator->GetYYAlc();
	auto &inputs = args.data[0];

	auto &func_expr = state.expr.Cast<BoundFunctionExpression>();
	const auto &info = func_expr.bind_info->Cast<JsonSerializePlanBindData>();

	if (!state.HasContext()) {
		throw InvalidInputException("json_serialize_plan: No client context available");
	}
	auto &context = state.GetContext();

	UnaryExecutor::Execute<string_t, string_t>(inputs, result, args.size(), [&](string_t input) {
		auto doc = JSONCommon::CreateDocument(alc);
		auto result_obj = yyjson_mut_obj(doc);
		yyjson_mut_doc_set_root(doc, result_obj);

		try {
			Parser parser;
			parser.ParseQuery(input.GetString());
			auto plans_arr = yyjson_mut_arr(doc);

			for (auto &statement : parser.statements) {
				auto stmt = std::move(statement);

				Planner planner(context);
				planner.CreatePlan(std::move(stmt));
				auto plan = std::move(planner.plan);

				if (info.optimize && plan->RequireOptimizer()) {
					Optimizer optimizer(*planner.binder, context);
					plan = optimizer.Optimize(std::move(plan));
				}

				ColumnBindingResolver resolver;
				resolver.Verify(*plan);
				resolver.VisitOperator(*plan);
				plan->ResolveOperatorTypes();

				string operator_name;
				if (!OperatorSupportsSerialization(*plan, operator_name)) {
					throw InvalidInputException("Operator '%s' does not support serialization", operator_name);
				}

				auto plan_json =
				    JsonSerializer::Serialize(*plan, doc, info.skip_if_null, info.skip_if_empty, info.skip_if_default);
				yyjson_mut_arr_append(plans_arr, plan_json);
			}

			yyjson_mut_obj_add_false(doc, result_obj, "error");
			yyjson_mut_obj_add_val(doc, result_obj, "plans", plans_arr);

			size_t len_size_t;
			auto data = yyjson_mut_val_write_opts(result_obj,
			                                      info.format ? JSONCommon::WRITE_PRETTY_FLAG : JSONCommon::WRITE_FLAG,
			                                      alc, &len_size_t, nullptr);
			idx_t len = len_size_t;
			if (data == nullptr) {
				throw SerializationException(
				    "Failed to serialize json, perhaps the query contains invalid utf8 characters?");
			}

			return StringVector::AddString(result, data, len);

		} catch (std::exception &ex) {
			ErrorData error(ex);
			yyjson_mut_obj_add_true(doc, result_obj, "error");
			// error type and message
			yyjson_mut_obj_add_strcpy(doc, result_obj, "error_type",
			                          StringUtil::Lower(Exception::ExceptionTypeToString(error.Type())).c_str());
			yyjson_mut_obj_add_strcpy(doc, result_obj, "error_message", error.RawMessage().c_str());
			// add extra info
			for (auto &entry : error.ExtraInfo()) {
				yyjson_mut_obj_add_strcpy(doc, result_obj, entry.first.c_str(), entry.second.c_str());
			}

			size_t len_size_t;
			auto data = yyjson_mut_val_write_opts(result_obj,
			                                      info.format ? JSONCommon::WRITE_PRETTY_FLAG : JSONCommon::WRITE_FLAG,
			                                      alc, &len_size_t, nullptr);
			idx_t len = len_size_t;
			return StringVector::AddString(result, data, len);
		}
	});
}

ScalarFunctionSet JSONFunctions::GetSerializePlanFunction() {
	ScalarFunctionSet set("json_serialize_plan");

	set.AddFunction(ScalarFunction({LogicalType::VARCHAR}, LogicalType::JSON(), JsonSerializePlanFunction,
	                               JsonSerializePlanBind, nullptr, nullptr, JSONFunctionLocalState::Init));

	set.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BOOLEAN}, LogicalType::JSON(),
	                               JsonSerializePlanFunction, JsonSerializePlanBind, nullptr, nullptr,
	                               JSONFunctionLocalState::Init));

	set.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BOOLEAN, LogicalType::BOOLEAN},
	                               LogicalType::JSON(), JsonSerializePlanFunction, JsonSerializePlanBind, nullptr,
	                               nullptr, JSONFunctionLocalState::Init));

	set.AddFunction(ScalarFunction(
	    {LogicalType::VARCHAR, LogicalType::BOOLEAN, LogicalType::BOOLEAN, LogicalType::BOOLEAN}, LogicalType::JSON(),
	    JsonSerializePlanFunction, JsonSerializePlanBind, nullptr, nullptr, JSONFunctionLocalState::Init));
	set.AddFunction(ScalarFunction(
	    {LogicalType::VARCHAR, LogicalType::BOOLEAN, LogicalType::BOOLEAN, LogicalType::BOOLEAN, LogicalType::BOOLEAN},
	    LogicalType::JSON(), JsonSerializePlanFunction, JsonSerializePlanBind, nullptr, nullptr,
	    JSONFunctionLocalState::Init));
	return set;
}

} // namespace duckdb
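Because the bind function matches the optional flags by expression alias, they are passed as named arguments in SQL. A sketch, same assumptions; on success the result envelope is {"error": false, "plans": [...]}, on failure {"error": true, "error_type": ..., "error_message": ...} as built above:

#include "duckdb.hpp"

#include <iostream>

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	auto result = con.Query("SELECT json_serialize_plan('SELECT 42', format := true, optimize := true)");
	std::cout << result->ToString();
	return 0;
}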
323
external/duckdb/extension/json/json_functions/json_serialize_sql.cpp
vendored
Normal file
@@ -0,0 +1,323 @@
#include "duckdb/execution/expression_executor.hpp"
#include "duckdb/main/connection.hpp"
#include "duckdb/main/database.hpp"
#include "duckdb/parser/parsed_data/create_pragma_function_info.hpp"
#include "duckdb/parser/parser.hpp"
#include "json_deserializer.hpp"
#include "json_functions.hpp"
#include "json_serializer.hpp"

namespace duckdb {

struct JsonSerializeBindData : public FunctionData {
	bool skip_if_null = false;
	bool skip_if_empty = false;
	bool skip_if_default = false;
	bool format = false;

	JsonSerializeBindData(bool skip_if_null_p, bool skip_if_empty_p, bool skip_if_default_p, bool format_p)
	    : skip_if_null(skip_if_null_p), skip_if_empty(skip_if_empty_p), skip_if_default(skip_if_default_p),
	      format(format_p) {
	}

public:
	unique_ptr<FunctionData> Copy() const override {
		return make_uniq<JsonSerializeBindData>(skip_if_null, skip_if_empty, skip_if_default, format);
	}
	bool Equals(const FunctionData &other_p) const override {
		return true;
	}
};

static unique_ptr<FunctionData> JsonSerializeBind(ClientContext &context, ScalarFunction &bound_function,
                                                  vector<unique_ptr<Expression>> &arguments) {
	if (arguments.empty()) {
		throw BinderException("json_serialize_sql takes at least one argument");
	}

	if (arguments[0]->return_type != LogicalType::VARCHAR) {
		throw InvalidTypeException("json_serialize_sql first argument must be a VARCHAR");
	}

	// Optional arguments
	bool skip_if_null = false;
	bool skip_if_empty = false;
	bool skip_if_default = false;
	bool format = false;

	for (idx_t i = 1; i < arguments.size(); i++) {
		auto &arg = arguments[i];
		if (arg->HasParameter()) {
			throw ParameterNotResolvedException();
		}
		if (!arg->IsFoldable()) {
			throw BinderException("json_serialize_sql: arguments must be constant");
		}
		auto &alias = arg->GetAlias();
		if (alias == "skip_null") {
			if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
				throw BinderException("json_serialize_sql: 'skip_null' argument must be a boolean");
			}
			skip_if_null = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
		} else if (alias == "skip_empty") {
			if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
				throw BinderException("json_serialize_sql: 'skip_empty' argument must be a boolean");
			}
			skip_if_empty = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
		} else if (alias == "format") {
			if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
				throw BinderException("json_serialize_sql: 'format' argument must be a boolean");
			}
			format = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
		} else if (alias == "skip_default") {
			if (arg->return_type.id() != LogicalTypeId::BOOLEAN) {
				throw BinderException("json_serialize_sql: 'skip_default' argument must be a boolean");
			}
			skip_if_default = BooleanValue::Get(ExpressionExecutor::EvaluateScalar(context, *arg));
		} else {
			throw BinderException(StringUtil::Format("json_serialize_sql: Unknown argument '%s'", alias));
		}
	}
	return make_uniq<JsonSerializeBindData>(skip_if_null, skip_if_empty, skip_if_default, format);
}

static void JsonSerializeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto &local_state = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = local_state.json_allocator->GetYYAlc();
	auto &inputs = args.data[0];

	auto &func_expr = state.expr.Cast<BoundFunctionExpression>();
	const auto &info = func_expr.bind_info->Cast<JsonSerializeBindData>();

	UnaryExecutor::Execute<string_t, string_t>(inputs, result, args.size(), [&](string_t input) {
		auto doc = JSONCommon::CreateDocument(alc);
		auto result_obj = yyjson_mut_obj(doc);
		yyjson_mut_doc_set_root(doc, result_obj);

		try {
			auto parser = Parser();
			parser.ParseQuery(input.GetString());

			auto statements_arr = yyjson_mut_arr(doc);

			for (auto &statement : parser.statements) {
				if (statement->type != StatementType::SELECT_STATEMENT) {
					throw NotImplementedException("Only SELECT statements can be serialized to json!");
				}
				auto &select = statement->Cast<SelectStatement>();
				auto json =
				    JsonSerializer::Serialize(select, doc, info.skip_if_null, info.skip_if_empty, info.skip_if_default);

				yyjson_mut_arr_append(statements_arr, json);
			}

			yyjson_mut_obj_add_false(doc, result_obj, "error");
			yyjson_mut_obj_add_val(doc, result_obj, "statements", statements_arr);
			size_t len_size_t;
			auto data = yyjson_mut_val_write_opts(result_obj,
			                                      info.format ? JSONCommon::WRITE_PRETTY_FLAG : JSONCommon::WRITE_FLAG,
			                                      alc, &len_size_t, nullptr);
			idx_t len = len_size_t;
			if (data == nullptr) {
				throw SerializationException(
				    "Failed to serialize json, perhaps the query contains invalid utf8 characters?");
			}
			return StringVector::AddString(result, data, len);

		} catch (std::exception &ex) {
			ErrorData error(ex);
			yyjson_mut_obj_add_true(doc, result_obj, "error");
			yyjson_mut_obj_add_strcpy(doc, result_obj, "error_type",
			                          StringUtil::Lower(Exception::ExceptionTypeToString(error.Type())).c_str());
			yyjson_mut_obj_add_strcpy(doc, result_obj, "error_message", error.RawMessage().c_str());
			// add extra info
			for (auto &entry : error.ExtraInfo()) {
				yyjson_mut_obj_add_strcpy(doc, result_obj, entry.first.c_str(), entry.second.c_str());
			}

			size_t len_size_t;
			auto data = yyjson_mut_val_write_opts(result_obj,
			                                      info.format ? JSONCommon::WRITE_PRETTY_FLAG : JSONCommon::WRITE_FLAG,
			                                      alc, &len_size_t, nullptr);
			idx_t len = len_size_t;
			return StringVector::AddString(result, data, len);
		}
	});
}

ScalarFunctionSet JSONFunctions::GetSerializeSqlFunction() {
	ScalarFunctionSet set("json_serialize_sql");
	set.AddFunction(ScalarFunction({LogicalType::VARCHAR}, LogicalType::JSON(), JsonSerializeFunction,
	                               JsonSerializeBind, nullptr, nullptr, JSONFunctionLocalState::Init));

	set.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BOOLEAN}, LogicalType::JSON(),
	                               JsonSerializeFunction, JsonSerializeBind, nullptr, nullptr,
	                               JSONFunctionLocalState::Init));

	set.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BOOLEAN, LogicalType::BOOLEAN},
	                               LogicalType::JSON(), JsonSerializeFunction, JsonSerializeBind, nullptr, nullptr,
	                               JSONFunctionLocalState::Init));

	set.AddFunction(ScalarFunction(
	    {LogicalType::VARCHAR, LogicalType::BOOLEAN, LogicalType::BOOLEAN, LogicalType::BOOLEAN}, LogicalType::JSON(),
	    JsonSerializeFunction, JsonSerializeBind, nullptr, nullptr, JSONFunctionLocalState::Init));

	set.AddFunction(ScalarFunction(
	    {LogicalType::VARCHAR, LogicalType::BOOLEAN, LogicalType::BOOLEAN, LogicalType::BOOLEAN, LogicalType::BOOLEAN},
	    LogicalType::JSON(), JsonSerializeFunction, JsonSerializeBind, nullptr, nullptr, JSONFunctionLocalState::Init));

	return set;
}

//----------------------------------------------------------------------
// JSON DESERIALIZE
//----------------------------------------------------------------------
static vector<unique_ptr<SelectStatement>> DeserializeSelectStatement(string_t input, yyjson_alc *alc) {
	auto doc = yyjson_doc_ptr(JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc));
	if (!doc) {
		throw ParserException("Could not parse json");
	}
	auto root = doc->root;
	auto err = yyjson_obj_get(root, "error");
	if (err && yyjson_is_true(err)) {
		auto err_type = yyjson_obj_get(root, "error_type");
		auto err_msg = yyjson_obj_get(root, "error_message");
		if (err_type && err_msg) {
			throw ParserException("Error parsing json: %s: %s", yyjson_get_str(err_type), yyjson_get_str(err_msg));
		}
		throw ParserException(
		    "Error parsing json, expected error property to contain 'error_type' and 'error_message'");
	}

	auto statements = yyjson_obj_get(root, "statements");
	if (!statements || !yyjson_is_arr(statements)) {
		throw ParserException("Error parsing json: no statements array");
	}
	auto size = yyjson_arr_size(statements);
	if (size == 0) {
		throw ParserException("Error parsing json: no statements");
	}

	vector<unique_ptr<SelectStatement>> result;

	idx_t idx;
	idx_t max;
	yyjson_val *stmt_json;
	yyjson_arr_foreach(statements, idx, max, stmt_json) {
		JsonDeserializer deserializer(stmt_json, doc);
		auto stmt = SelectStatement::Deserialize(deserializer);
		if (!stmt->node) {
			throw ParserException("Error parsing json: no select node found in json");
		}
		result.push_back(std::move(stmt));
	}

	return result;
}

//----------------------------------------------------------------------
// JSON DESERIALIZE SQL FUNCTION
//----------------------------------------------------------------------
static void JsonDeserializeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto &local_state = JSONFunctionLocalState::ResetAndGet(state);
	auto alc = local_state.json_allocator->GetYYAlc();
	auto &inputs = args.data[0];

	UnaryExecutor::Execute<string_t, string_t>(inputs, result, args.size(), [&](string_t input) {
		auto stmts = DeserializeSelectStatement(input, alc);
		// Combine all statements into a single semicolon-separated string
		string str;
		for (idx_t i = 0; i < stmts.size(); i++) {
			if (i > 0) {
				str += "; ";
			}
			str += stmts[i]->ToString();
		}

		return StringVector::AddString(result, str);
	});
}

ScalarFunctionSet JSONFunctions::GetDeserializeSqlFunction() {
	ScalarFunctionSet set("json_deserialize_sql");
	set.AddFunction(ScalarFunction({LogicalType::JSON()}, LogicalType::VARCHAR, JsonDeserializeFunction, nullptr,
	                               nullptr, nullptr, JSONFunctionLocalState::Init));
	return set;
}

//----------------------------------------------------------------------
// JSON EXECUTE SERIALIZED SQL (PRAGMA)
//----------------------------------------------------------------------
static string ExecuteJsonSerializedSqlPragmaFunction(ClientContext &context, const FunctionParameters &parameters) {
	JSONFunctionLocalState local_state(context);
	auto alc = local_state.json_allocator->GetYYAlc();

	auto input = parameters.values[0].GetValueUnsafe<string_t>();
	auto stmts = DeserializeSelectStatement(input, alc);
	if (stmts.size() != 1) {
		throw BinderException("json_execute_serialized_sql pragma expects exactly one statement");
	}
	return stmts[0]->ToString();
}

PragmaFunctionSet JSONFunctions::GetExecuteJsonSerializedSqlPragmaFunction() {
	return PragmaFunctionSet(PragmaFunction::PragmaCall(
	    "json_execute_serialized_sql", ExecuteJsonSerializedSqlPragmaFunction, {LogicalType::VARCHAR}));
}

//----------------------------------------------------------------------
// JSON EXECUTE SERIALIZED SQL (TABLE FUNCTION)
//----------------------------------------------------------------------
struct ExecuteSqlTableFunction {
	struct BindData : public TableFunctionData {
		shared_ptr<Relation> plan;
		unique_ptr<QueryResult> result;
		unique_ptr<Connection> con;
	};

	static unique_ptr<FunctionData> Bind(ClientContext &context, TableFunctionBindInput &input,
	                                     vector<LogicalType> &return_types, vector<string> &names) {
		JSONFunctionLocalState local_state(context);
		auto alc = local_state.json_allocator->GetYYAlc();

		auto result = make_uniq<BindData>();

		result->con = make_uniq<Connection>(*context.db);
		if (input.inputs[0].IsNull()) {
			throw BinderException("json_execute_serialized_sql cannot execute NULL plan");
		}
		auto serialized = input.inputs[0].GetValueUnsafe<string>();
		auto stmts = DeserializeSelectStatement(serialized, alc);
		if (stmts.size() != 1) {
			throw BinderException("json_execute_serialized_sql expects exactly one statement");
		}
		result->plan = result->con->RelationFromQuery(std::move(stmts[0]));

		for (auto &col : result->plan->Columns()) {
			return_types.emplace_back(col.Type());
			names.emplace_back(col.Name());
		}
		return std::move(result);
	}

	static void Function(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
		auto &data = (BindData &)*data_p.bind_data;
		if (!data.result) {
			data.result = data.plan->Execute();
		}
		auto result_chunk = data.result->Fetch();
		if (!result_chunk) {
			return;
		}
		output.Move(*result_chunk);
	}
};

TableFunctionSet JSONFunctions::GetExecuteJsonSerializedSqlFunction() {
	TableFunction func("json_execute_serialized_sql", {LogicalType::VARCHAR}, ExecuteSqlTableFunction::Function,
	                   ExecuteSqlTableFunction::Bind);
	return TableFunctionSet(func);
}

} // namespace duckdb
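Serialization, deserialization, and execution all round-trip through the same "statements" envelope that DeserializeSelectStatement validates. A sketch, same assumptions (the deserialized text shown is how DuckDB's statement ToString renders the parsed expression):

#include "duckdb.hpp"

#include <iostream>

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// value: {"error":false,"statements":[...]}
	std::cout << con.Query("SELECT json_serialize_sql('SELECT 1 + 2')")->ToString();
	// JsonDeserializeFunction turns it back into SQL text; value: SELECT (1 + 2)
	std::cout << con.Query("SELECT json_deserialize_sql(json_serialize_sql('SELECT 1 + 2'))")->ToString();
	// ExecuteSqlTableFunction runs the serialized statement; value: 3
	std::cout << con.Query("SELECT * FROM json_execute_serialized_sql(json_serialize_sql('SELECT 1 + 2'))")
	                 ->ToString();
	return 0;
}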
809
external/duckdb/extension/json/json_functions/json_structure.cpp
vendored
Normal file
@@ -0,0 +1,809 @@
|
||||
#include "json_structure.hpp"
|
||||
|
||||
#include "duckdb/common/enum_util.hpp"
|
||||
#include "duckdb/common/extra_type_info.hpp"
|
||||
#include "json_executors.hpp"
|
||||
#include "json_scan.hpp"
|
||||
#include "json_transform.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
static bool IsNumeric(LogicalTypeId type) {
|
||||
return type == LogicalTypeId::DOUBLE || type == LogicalTypeId::UBIGINT || type == LogicalTypeId::BIGINT;
|
||||
}
|
||||
|
||||
static LogicalTypeId MaxNumericType(const LogicalTypeId &a, const LogicalTypeId &b) {
|
||||
D_ASSERT(a != b);
|
||||
if (a == LogicalTypeId::DOUBLE || b == LogicalTypeId::DOUBLE) {
|
||||
return LogicalTypeId::DOUBLE;
|
||||
}
|
||||
return LogicalTypeId::BIGINT;
|
||||
}
|
||||
|
||||
JSONStructureNode::JSONStructureNode() : count(0), null_count(0) {
|
||||
}
|
||||
|
||||
JSONStructureNode::JSONStructureNode(const char *key_ptr, const size_t key_len) : JSONStructureNode() {
|
||||
key = make_uniq<string>(key_ptr, key_len);
|
||||
}
|
||||
|
||||
JSONStructureNode::JSONStructureNode(yyjson_val *key_p, yyjson_val *val_p, const bool ignore_errors)
|
||||
: JSONStructureNode(unsafe_yyjson_get_str(key_p), unsafe_yyjson_get_len(key_p)) {
|
||||
JSONStructure::ExtractStructure(val_p, *this, ignore_errors);
|
||||
}
|
||||
|
||||
static void SwapJSONStructureNode(JSONStructureNode &a, JSONStructureNode &b) noexcept {
|
||||
std::swap(a.key, b.key);
|
||||
std::swap(a.initialized, b.initialized);
|
||||
std::swap(a.descriptions, b.descriptions);
|
||||
std::swap(a.count, b.count);
|
||||
std::swap(a.null_count, b.null_count);
|
||||
}
|
||||
|
||||
JSONStructureNode::JSONStructureNode(JSONStructureNode &&other) noexcept {
|
||||
SwapJSONStructureNode(*this, other);
|
||||
}
|
||||
|
||||
JSONStructureNode &JSONStructureNode::operator=(JSONStructureNode &&other) noexcept {
|
||||
SwapJSONStructureNode(*this, other);
|
||||
return *this;
|
||||
}
|
||||
|
||||
JSONStructureDescription &JSONStructureNode::GetOrCreateDescription(const LogicalTypeId type) {
|
||||
if (descriptions.empty()) {
|
||||
// Empty, just put this type in there
|
||||
descriptions.emplace_back(type);
|
||||
return descriptions.back();
|
||||
}
|
||||
|
||||
if (descriptions.size() == 1 && descriptions[0].type == LogicalTypeId::SQLNULL) {
|
||||
// Only a NULL in there, override
|
||||
descriptions[0].type = type;
|
||||
return descriptions[0];
|
||||
}
|
||||
|
||||
if (type == LogicalTypeId::SQLNULL) {
|
||||
// 'descriptions' is non-empty, so let's not add NULL
|
||||
return descriptions.back();
|
||||
}
|
||||
|
||||
// Check if type is already in there or if we can merge numerics
|
||||
const auto is_numeric = IsNumeric(type);
|
||||
for (auto &description : descriptions) {
|
||||
if (type == description.type) {
|
||||
return description;
|
||||
}
|
||||
if (is_numeric && IsNumeric(description.type)) {
|
||||
description.type = MaxNumericType(type, description.type);
|
||||
return description;
|
||||
}
|
||||
}
|
||||
// Type was not there, create a new description
|
||||
descriptions.emplace_back(type);
|
||||
return descriptions.back();
|
||||
}
|
||||
|
||||
bool JSONStructureNode::ContainsVarchar() const {
|
||||
if (descriptions.size() != 1) {
|
||||
// We can't refine types if we have more than 1 description (yet), defaults to JSON type for now
|
||||
return false;
|
||||
}
|
||||
auto &description = descriptions[0];
|
||||
if (description.type == LogicalTypeId::VARCHAR) {
|
||||
return true;
|
||||
}
|
||||
for (auto &child : description.children) {
|
||||
if (child.ContainsVarchar()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void JSONStructureNode::InitializeCandidateTypes(const idx_t max_depth, const bool convert_strings_to_integers,
|
||||
const idx_t depth) {
|
||||
if (depth >= max_depth) {
|
||||
return;
|
||||
}
|
||||
if (descriptions.size() != 1) {
|
||||
// We can't refine types if we have more than 1 description (yet), defaults to JSON type for now
|
||||
return;
|
||||
}
|
||||
auto &description = descriptions[0];
|
||||
if (description.type == LogicalTypeId::VARCHAR && !initialized) {
|
||||
// We loop through the candidate types and format templates from back to front
|
||||
if (convert_strings_to_integers) {
|
||||
description.candidate_types = {LogicalTypeId::UUID, LogicalTypeId::BIGINT, LogicalTypeId::TIMESTAMP,
|
||||
LogicalTypeId::DATE, LogicalTypeId::TIME};
|
||||
} else {
|
||||
description.candidate_types = {LogicalTypeId::UUID, LogicalTypeId::TIMESTAMP, LogicalTypeId::DATE,
|
||||
LogicalTypeId::TIME};
|
||||
}
|
||||
initialized = true;
|
||||
} else {
|
||||
for (auto &child : description.children) {
|
||||
child.InitializeCandidateTypes(max_depth, convert_strings_to_integers, depth + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void JSONStructureNode::RefineCandidateTypes(yyjson_val *vals[], const idx_t val_count, Vector &string_vector,
|
||||
ArenaAllocator &allocator, MutableDateFormatMap &date_format_map) {
|
||||
if (descriptions.size() != 1) {
|
||||
// We can't refine types if we have more than 1 description (yet), defaults to JSON type for now
|
||||
return;
|
||||
}
|
||||
if (!ContainsVarchar()) {
|
||||
return;
|
||||
}
|
||||
auto &description = descriptions[0];
|
||||
switch (description.type) {
|
||||
case LogicalTypeId::LIST:
|
||||
return RefineCandidateTypesArray(vals, val_count, string_vector, allocator, date_format_map);
|
||||
case LogicalTypeId::STRUCT:
|
||||
return RefineCandidateTypesObject(vals, val_count, string_vector, allocator, date_format_map);
|
||||
case LogicalTypeId::VARCHAR:
|
||||
return RefineCandidateTypesString(vals, val_count, string_vector, date_format_map);
|
||||
default:
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void JSONStructureNode::RefineCandidateTypesArray(yyjson_val *vals[], const idx_t val_count, Vector &string_vector,
|
||||
ArenaAllocator &allocator, MutableDateFormatMap &date_format_map) {
|
||||
D_ASSERT(descriptions.size() == 1 && descriptions[0].type == LogicalTypeId::LIST);
|
||||
auto &desc = descriptions[0];
|
||||
D_ASSERT(desc.children.size() == 1);
|
||||
auto &child = desc.children[0];
|
||||
|
||||
idx_t total_list_size = 0;
|
||||
for (idx_t i = 0; i < val_count; i++) {
|
||||
if (vals[i] && !unsafe_yyjson_is_null(vals[i])) {
|
||||
D_ASSERT(yyjson_is_arr(vals[i]));
|
||||
total_list_size += unsafe_yyjson_get_len(vals[i]);
|
||||
}
|
||||
}
|
||||
|
||||
idx_t offset = 0;
|
||||
auto child_vals =
|
||||
reinterpret_cast<yyjson_val **>(allocator.AllocateAligned(total_list_size * sizeof(yyjson_val *)));
|
||||
|
||||
size_t idx, max;
|
||||
yyjson_val *child_val;
|
||||
for (idx_t i = 0; i < val_count; i++) {
|
||||
if (vals[i] && !unsafe_yyjson_is_null(vals[i])) {
|
||||
yyjson_arr_foreach(vals[i], idx, max, child_val) {
|
||||
child_vals[offset++] = child_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
child.RefineCandidateTypes(child_vals, total_list_size, string_vector, allocator, date_format_map);
|
||||
}
|
||||
|
||||
void JSONStructureNode::RefineCandidateTypesObject(yyjson_val *vals[], const idx_t val_count, Vector &string_vector,
|
||||
ArenaAllocator &allocator, MutableDateFormatMap &date_format_map) {
|
||||
D_ASSERT(descriptions.size() == 1 && descriptions[0].type == LogicalTypeId::STRUCT);
|
||||
auto &desc = descriptions[0];
|
||||
|
||||
const idx_t child_count = desc.children.size();
|
||||
vector<yyjson_val **> child_vals;
|
||||
child_vals.reserve(child_count);
|
||||
for (idx_t child_idx = 0; child_idx < child_count; child_idx++) {
|
||||
child_vals.emplace_back(
|
||||
reinterpret_cast<yyjson_val **>(allocator.AllocateAligned(val_count * sizeof(yyjson_val *))));
|
||||
}
|
||||
|
||||
const auto found_keys = reinterpret_cast<bool *>(allocator.AllocateAligned(sizeof(bool) * child_count));
|
||||
|
||||
const auto &key_map = desc.key_map;
|
||||
size_t idx, max;
|
||||
yyjson_val *child_key, *child_val;
|
||||
for (idx_t i = 0; i < val_count; i++) {
|
||||
if (vals[i] && !unsafe_yyjson_is_null(vals[i])) {
|
||||
idx_t found_key_count = 0;
|
||||
memset(found_keys, false, child_count);
|
||||
|
||||
D_ASSERT(yyjson_is_obj(vals[i]));
|
||||
yyjson_obj_foreach(vals[i], idx, max, child_key, child_val) {
|
||||
D_ASSERT(yyjson_is_str(child_key));
|
||||
const auto key_ptr = unsafe_yyjson_get_str(child_key);
|
||||
const auto key_len = unsafe_yyjson_get_len(child_key);
|
||||
auto it = key_map.find({key_ptr, key_len});
|
||||
D_ASSERT(it != key_map.end());
|
||||
const auto child_idx = it->second;
|
||||
child_vals[child_idx][i] = child_val;
|
||||
found_key_count += !found_keys[child_idx];
|
||||
found_keys[child_idx] = true;
|
||||
}
|
||||
|
||||
if (found_key_count != child_count) {
|
||||
// Set child val to nullptr so recursion doesn't break
|
||||
for (idx_t child_idx = 0; child_idx < child_count; child_idx++) {
|
||||
if (!found_keys[child_idx]) {
|
||||
child_vals[child_idx][i] = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (idx_t child_idx = 0; child_idx < child_count; child_idx++) {
|
||||
child_vals[child_idx][i] = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (idx_t child_idx = 0; child_idx < child_count; child_idx++) {
|
||||
desc.children[child_idx].RefineCandidateTypes(child_vals[child_idx], val_count, string_vector, allocator,
|
||||
date_format_map);
|
||||
}
|
||||
}
|
||||
|
||||
void JSONStructureNode::RefineCandidateTypesString(yyjson_val *vals[], const idx_t val_count, Vector &string_vector,
|
||||
MutableDateFormatMap &date_format_map) {
|
||||
D_ASSERT(descriptions.size() == 1 && descriptions[0].type == LogicalTypeId::VARCHAR);
|
||||
if (descriptions[0].candidate_types.empty()) {
|
||||
return;
|
||||
}
|
||||
static JSONTransformOptions OPTIONS;
|
||||
JSONTransform::GetStringVector(vals, val_count, LogicalType::SQLNULL, string_vector, OPTIONS);
|
||||
EliminateCandidateTypes(val_count, string_vector, date_format_map);
|
||||
}
|
||||
|
||||
void JSONStructureNode::EliminateCandidateTypes(const idx_t vec_count, Vector &string_vector,
|
||||
MutableDateFormatMap &date_format_map) {
|
||||
D_ASSERT(descriptions.size() == 1 && descriptions[0].type == LogicalTypeId::VARCHAR);
|
||||
auto &description = descriptions[0];
|
||||
auto &candidate_types = description.candidate_types;
|
||||
while (true) {
|
||||
if (candidate_types.empty()) {
|
||||
return;
|
||||
}
|
||||
const auto type = candidate_types.back();
|
||||
Vector result_vector(type, vec_count);
|
||||
if (date_format_map.HasFormats(type)) {
|
||||
if (EliminateCandidateFormats(vec_count, string_vector, result_vector, date_format_map)) {
|
||||
return;
|
||||
} else {
|
||||
candidate_types.pop_back();
|
||||
}
|
||||
} else {
|
||||
string error_message;
|
||||
if (!VectorOperations::DefaultTryCast(string_vector, result_vector, vec_count, &error_message, true)) {
|
||||
candidate_types.pop_back();
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class OP, class T>
|
||||
bool TryParse(Vector &string_vector, StrpTimeFormat &format, const idx_t count) {
|
||||
const auto strings = FlatVector::GetData<string_t>(string_vector);
|
||||
const auto &validity = FlatVector::Validity(string_vector);
|
||||
|
||||
T result;
|
||||
string error_message;
|
||||
if (validity.AllValid()) {
|
||||
for (idx_t i = 0; i < count; i++) {
|
||||
if (!OP::template Operation<T>(format, strings[i], result, error_message)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (idx_t i = 0; i < count; i++) {
|
||||
if (validity.RowIsValid(i)) {
|
||||
if (!OP::template Operation<T>(format, strings[i], result, error_message)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool JSONStructureNode::EliminateCandidateFormats(const idx_t vec_count, Vector &string_vector,
|
||||
const Vector &result_vector, MutableDateFormatMap &date_format_map) {
|
||||
D_ASSERT(descriptions.size() == 1 && descriptions[0].type == LogicalTypeId::VARCHAR);
|
||||
|
||||
const auto type = result_vector.GetType().id();
|
||||
auto i = date_format_map.NumberOfFormats(type);
|
||||
for (; i != 0; i--) {
|
||||
StrpTimeFormat format;
|
||||
if (!date_format_map.GetFormatAtIndex(type, i - 1, format)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
bool success;
|
||||
switch (type) {
|
||||
case LogicalTypeId::DATE:
|
||||
success = TryParse<TryParseDate, date_t>(string_vector, format, vec_count);
|
||||
break;
|
||||
case LogicalTypeId::TIMESTAMP:
|
||||
success = TryParse<TryParseTimeStamp, timestamp_t>(string_vector, format, vec_count);
|
||||
break;
|
||||
default:
|
||||
throw InternalException("No date/timestamp formats for %s", EnumUtil::ToString(type));
|
||||
}
|
||||
|
||||
if (success) {
|
||||
date_format_map.ShrinkFormatsToSize(type, i);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}

JSONStructureDescription::JSONStructureDescription(const LogicalTypeId type_p) : type(type_p) {
}

static void SwapJSONStructureDescription(JSONStructureDescription &a, JSONStructureDescription &b) noexcept {
    std::swap(a.type, b.type);
    std::swap(a.key_map, b.key_map);
    std::swap(a.children, b.children);
    std::swap(a.candidate_types, b.candidate_types);
}

JSONStructureDescription::JSONStructureDescription(JSONStructureDescription &&other) noexcept {
    SwapJSONStructureDescription(*this, other);
}

JSONStructureDescription &JSONStructureDescription::operator=(JSONStructureDescription &&other) noexcept {
    SwapJSONStructureDescription(*this, other);
    return *this;
}

JSONStructureNode &JSONStructureDescription::GetOrCreateChild() {
    D_ASSERT(type == LogicalTypeId::LIST);
    if (children.empty()) {
        children.emplace_back();
    }
    D_ASSERT(children.size() == 1);
    return children.back();
}

JSONStructureNode &JSONStructureDescription::GetOrCreateChild(const char *key_ptr, const size_t key_size) {
    // Check if there is already a child with the same key
    const JSONKey temp_key {key_ptr, key_size};
    const auto it = key_map.find(temp_key);
    if (it != key_map.end()) {
        return children[it->second]; // Found it
    }

    // Didn't find it, create a new child
    children.emplace_back(key_ptr, key_size);
    const auto &persistent_key_string = *children.back().key;
    JSONKey new_key {persistent_key_string.c_str(), persistent_key_string.length()};
    key_map.emplace(new_key, children.size() - 1);
    return children.back();
}

JSONStructureNode &JSONStructureDescription::GetOrCreateChild(yyjson_val *key, yyjson_val *val,
                                                              const bool ignore_errors) {
    D_ASSERT(yyjson_is_str(key));
    auto &child = GetOrCreateChild(unsafe_yyjson_get_str(key), unsafe_yyjson_get_len(key));
    JSONStructure::ExtractStructure(val, child, ignore_errors);
    return child;
}

static void ExtractStructureArray(yyjson_val *arr, JSONStructureNode &node, const bool ignore_errors) {
    D_ASSERT(yyjson_is_arr(arr));
    auto &description = node.GetOrCreateDescription(LogicalTypeId::LIST);
    auto &child = description.GetOrCreateChild();

    size_t idx, max;
    yyjson_val *val;
    yyjson_arr_foreach(arr, idx, max, val) {
        JSONStructure::ExtractStructure(val, child, ignore_errors);
    }
}

static void ExtractStructureObject(yyjson_val *obj, JSONStructureNode &node, const bool ignore_errors) {
    D_ASSERT(yyjson_is_obj(obj));
    auto &description = node.GetOrCreateDescription(LogicalTypeId::STRUCT);

    // Keep track of keys so we can detect duplicates
    unordered_set<string> obj_keys;
    case_insensitive_set_t ci_obj_keys;

    size_t idx, max;
    yyjson_val *key, *val;
    yyjson_obj_foreach(obj, idx, max, key, val) {
        const string obj_key(unsafe_yyjson_get_str(key), unsafe_yyjson_get_len(key));
        auto insert_result = obj_keys.insert(obj_key);
        if (!ignore_errors && !insert_result.second) { // Exact match
            JSONCommon::ThrowValFormatError("Duplicate key \"" + obj_key + "\" in object %s", obj);
        }
        auto ci_insert_result = ci_obj_keys.insert(obj_key);
        if (!ignore_errors && !ci_insert_result.second) { // Case-insensitive match
            JSONCommon::ThrowValFormatError("Duplicate key (different case) \"" + obj_key + "\" and \"" +
                                                *ci_insert_result.first + "\" in object %s",
                                            obj);
        }
        description.GetOrCreateChild(key, val, ignore_errors);
    }
}

static void ExtractStructureVal(yyjson_val *val, JSONStructureNode &node) {
    D_ASSERT(!yyjson_is_arr(val) && !yyjson_is_obj(val));
    node.GetOrCreateDescription(JSONCommon::ValTypeToLogicalTypeId(val));
}

void JSONStructure::ExtractStructure(yyjson_val *val, JSONStructureNode &node, const bool ignore_errors) {
    node.count++;
    const auto tag = yyjson_get_tag(val);
    if (tag == (YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE)) {
        node.null_count++;
    }

    switch (tag) {
    case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
        return ExtractStructureArray(val, node, ignore_errors);
    case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
        return ExtractStructureObject(val, node, ignore_errors);
    default:
        return ExtractStructureVal(val, node);
    }
}

JSONStructureNode ExtractStructureInternal(yyjson_val *val, const bool ignore_errors) {
    JSONStructureNode node;
    JSONStructure::ExtractStructure(val, node, ignore_errors);
    return node;
}

//! Forward declaration for recursion
static yyjson_mut_val *ConvertStructure(const JSONStructureNode &node, yyjson_mut_doc *doc);

static yyjson_mut_val *ConvertStructureArray(const JSONStructureNode &node, yyjson_mut_doc *doc) {
    D_ASSERT(node.descriptions.size() == 1 && node.descriptions[0].type == LogicalTypeId::LIST);
    const auto &desc = node.descriptions[0];
    D_ASSERT(desc.children.size() == 1);

    const auto arr = yyjson_mut_arr(doc);
    yyjson_mut_arr_append(arr, ConvertStructure(desc.children[0], doc));
    return arr;
}

static yyjson_mut_val *ConvertStructureObject(const JSONStructureNode &node, yyjson_mut_doc *doc) {
    D_ASSERT(node.descriptions.size() == 1 && node.descriptions[0].type == LogicalTypeId::STRUCT);
    auto &desc = node.descriptions[0];
    if (desc.children.empty()) {
        // Empty struct - let's do JSON instead
        return yyjson_mut_str(doc, LogicalType::JSON_TYPE_NAME);
    }

    const auto obj = yyjson_mut_obj(doc);
    for (auto &child : desc.children) {
        D_ASSERT(child.key);
        yyjson_mut_obj_add(obj, yyjson_mut_strn(doc, child.key->c_str(), child.key->length()),
                           ConvertStructure(child, doc));
    }
    return obj;
}

static yyjson_mut_val *ConvertStructure(const JSONStructureNode &node, yyjson_mut_doc *doc) {
    if (node.descriptions.empty()) {
        return yyjson_mut_str(doc, JSONCommon::TYPE_STRING_NULL);
    }
    if (node.descriptions.size() != 1) { // Inconsistent types, so we resort to JSON
        return yyjson_mut_str(doc, LogicalType::JSON_TYPE_NAME);
    }
    auto &desc = node.descriptions[0];
    D_ASSERT(desc.type != LogicalTypeId::INVALID);
    switch (desc.type) {
    case LogicalTypeId::LIST:
        return ConvertStructureArray(node, doc);
    case LogicalTypeId::STRUCT:
        return ConvertStructureObject(node, doc);
    default:
        return yyjson_mut_str(doc, EnumUtil::ToChars(desc.type));
    }
}

static string_t JSONStructureFunction(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &, idx_t) {
    return JSONCommon::WriteVal<yyjson_mut_val>(
        ConvertStructure(ExtractStructureInternal(val, true), yyjson_mut_doc_new(alc)), alc);
}

static void StructureFunction(DataChunk &args, ExpressionState &state, Vector &result) {
    JSONExecutors::UnaryExecute<string_t>(args, state, result, JSONStructureFunction);
}

static void GetStructureFunctionInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
    set.AddFunction(ScalarFunction({input_type}, LogicalType::JSON(), StructureFunction, nullptr, nullptr, nullptr,
                                   JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetStructureFunction() {
    ScalarFunctionSet set("json_structure");
    GetStructureFunctionInternal(set, LogicalType::VARCHAR);
    GetStructureFunctionInternal(set, LogicalType::JSON());
    return set;
}
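
// Example usage (illustrative; the output shape follows the structure-to-JSON conversion above,
// where scalars become type names, lists become single-element arrays, and objects become objects):
//   SELECT json_structure('{"a": 42, "b": ["x", "y"]}');
//   -- {"a":"UBIGINT","b":["VARCHAR"]}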

static LogicalType StructureToTypeArray(ClientContext &context, const JSONStructureNode &node, const idx_t max_depth,
                                        const double field_appearance_threshold, const idx_t map_inference_threshold,
                                        const idx_t depth, const LogicalType &null_type) {
    D_ASSERT(node.descriptions.size() == 1 && node.descriptions[0].type == LogicalTypeId::LIST);
    const auto &desc = node.descriptions[0];
    D_ASSERT(desc.children.size() == 1);

    return LogicalType::LIST(JSONStructure::StructureToType(context, desc.children[0], max_depth,
                                                            field_appearance_threshold, map_inference_threshold,
                                                            depth + 1, null_type));
}

static void MergeNodeArray(JSONStructureNode &merged, const JSONStructureDescription &child_desc) {
    D_ASSERT(child_desc.type == LogicalTypeId::LIST);
    auto &merged_desc = merged.GetOrCreateDescription(LogicalTypeId::LIST);
    auto &merged_child = merged_desc.GetOrCreateChild();
    for (auto &list_child : child_desc.children) {
        JSONStructure::MergeNodes(merged_child, list_child);
    }
}

static void MergeNodeObject(JSONStructureNode &merged, const JSONStructureDescription &child_desc) {
    D_ASSERT(child_desc.type == LogicalTypeId::STRUCT);
    auto &merged_desc = merged.GetOrCreateDescription(LogicalTypeId::STRUCT);
    for (auto &struct_child : child_desc.children) {
        const auto &struct_child_key = *struct_child.key;
        auto &merged_child = merged_desc.GetOrCreateChild(struct_child_key.c_str(), struct_child_key.length());
        JSONStructure::MergeNodes(merged_child, struct_child);
    }
}

static void MergeNodeVal(JSONStructureNode &merged, const JSONStructureDescription &child_desc,
                         const bool node_initialized) {
    D_ASSERT(child_desc.type != LogicalTypeId::LIST && child_desc.type != LogicalTypeId::STRUCT);
    auto &merged_desc = merged.GetOrCreateDescription(child_desc.type);
    if (merged_desc.type != LogicalTypeId::VARCHAR || !node_initialized || merged.descriptions.size() != 1) {
        return;
    }
    if (!merged.initialized) {
        merged_desc.candidate_types = child_desc.candidate_types;
    } else if (merged_desc.candidate_types.empty() != child_desc.candidate_types.empty() // one empty, the other not
               || (!merged_desc.candidate_types.empty() &&
                   merged_desc.candidate_types.back() != child_desc.candidate_types.back())) { // non-empty: check type
        merged_desc.candidate_types.clear(); // Not the same, default to VARCHAR
    }

    merged.initialized = true;
}

void JSONStructure::MergeNodes(JSONStructureNode &merged, const JSONStructureNode &node) {
    merged.count += node.count;
    merged.null_count += node.null_count;
    for (const auto &child_desc : node.descriptions) {
        switch (child_desc.type) {
        case LogicalTypeId::LIST:
            MergeNodeArray(merged, child_desc);
            break;
        case LogicalTypeId::STRUCT:
            MergeNodeObject(merged, child_desc);
            break;
        default:
            MergeNodeVal(merged, child_desc, node.initialized);
            break;
        }
    }
}
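
// Type similarity below is a score in [0, 1]; a negative score marks incompatible types. It is used
// further down to decide whether a consistent-looking object should be typed as a MAP rather than a STRUCT.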

static double CalculateTypeSimilarity(const LogicalType &merged, const LogicalType &type, idx_t max_depth, idx_t depth);

static double CalculateMapAndStructSimilarity(const LogicalType &map_type, const LogicalType &struct_type,
                                              const bool swapped, const idx_t max_depth, const idx_t depth) {
    const auto &map_value_type = MapType::ValueType(map_type);
    const auto &struct_child_types = StructType::GetChildTypes(struct_type);
    double total_similarity = 0;
    for (const auto &struct_child_type : struct_child_types) {
        const auto similarity =
            swapped ? CalculateTypeSimilarity(struct_child_type.second, map_value_type, max_depth, depth + 1)
                    : CalculateTypeSimilarity(map_value_type, struct_child_type.second, max_depth, depth + 1);
        if (similarity < 0) {
            return similarity;
        }
        total_similarity += similarity;
    }
    return total_similarity / static_cast<double>(struct_child_types.size());
}

static double CalculateTypeSimilarity(const LogicalType &merged, const LogicalType &type, const idx_t max_depth,
                                      const idx_t depth) {
    if (depth >= max_depth || merged.id() == LogicalTypeId::SQLNULL || type.id() == LogicalTypeId::SQLNULL) {
        return 1;
    }
    if (merged.IsJSONType()) {
        // Incompatible types
        return -1;
    }
    if (type.IsJSONType() || merged == type) {
        return 1;
    }

    switch (merged.id()) {
    case LogicalTypeId::STRUCT: {
        if (type.id() == LogicalTypeId::MAP) {
            // This can happen for empty structs/maps ("{}"), or in rare cases where an inconsistent struct becomes
            // consistent when merged, but does not have enough children to be considered a map.
            return CalculateMapAndStructSimilarity(type, merged, true, max_depth, depth);
        } else if (type.id() != LogicalTypeId::STRUCT) {
            return -1;
        }

        // Only structs can be merged into a struct
        D_ASSERT(type.id() == LogicalTypeId::STRUCT);
        const auto &merged_child_types = StructType::GetChildTypes(merged);
        const auto &type_child_types = StructType::GetChildTypes(type);

        unordered_map<string, const LogicalType &> merged_child_types_map;
        for (const auto &merged_child : merged_child_types) {
            merged_child_types_map.emplace(merged_child.first, merged_child.second);
        }

        double total_similarity = 0;
        for (const auto &type_child_type : type_child_types) {
            const auto it = merged_child_types_map.find(type_child_type.first);
            if (it == merged_child_types_map.end()) {
                return -1;
            }
            const auto similarity = CalculateTypeSimilarity(it->second, type_child_type.second, max_depth, depth + 1);
            if (similarity < 0) {
                return similarity;
            }
            total_similarity += similarity;
        }
        return total_similarity / static_cast<double>(merged_child_types.size());
    }
    case LogicalTypeId::MAP: {
        if (type.id() == LogicalTypeId::MAP) {
            return CalculateTypeSimilarity(MapType::ValueType(merged), MapType::ValueType(type), max_depth, depth + 1);
        }

        // Only maps and structs can be merged into a map
        if (type.id() != LogicalTypeId::STRUCT) {
            return -1;
        }
        return CalculateMapAndStructSimilarity(merged, type, false, max_depth, depth);
    }
    case LogicalTypeId::LIST: {
        // Only lists can be merged into a list
        D_ASSERT(type.id() == LogicalTypeId::LIST);
        const auto &merged_child_type = ListType::GetChildType(merged);
        const auto &type_child_type = ListType::GetChildType(type);
        return CalculateTypeSimilarity(merged_child_type, type_child_type, max_depth, depth + 1);
    }
    default:
        // This is only reachable if type has been inferred using candidate_types, but candidate_types were not
        // consistent among all map values
        return 1;
    }
}
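
// A STRUCT candidate is considered inconsistent when its fields appear too rarely across the sampled
// objects (average occurrence below field_appearance_threshold); such objects are typed as MAPs instead.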

static bool IsStructureInconsistent(const JSONStructureDescription &desc, const idx_t sample_count,
                                    const idx_t null_count, const double field_appearance_threshold) {
    D_ASSERT(sample_count > null_count);
    double total_child_counts = 0;
    for (const auto &child : desc.children) {
        total_child_counts += static_cast<double>(child.count) / static_cast<double>(sample_count - null_count);
    }
    const auto avg_occurrence = total_child_counts / static_cast<double>(desc.children.size());
    return avg_occurrence < field_appearance_threshold;
}

static LogicalType GetMergedType(ClientContext &context, const JSONStructureNode &node, const idx_t max_depth,
                                 const double field_appearance_threshold, const idx_t map_inference_threshold,
                                 const idx_t depth, const LogicalType &null_type) {
    D_ASSERT(node.descriptions.size() == 1);
    auto &desc = node.descriptions[0];
    JSONStructureNode merged;
    for (const auto &child : desc.children) {
        JSONStructure::MergeNodes(merged, child);
    }
    return JSONStructure::StructureToType(context, merged, max_depth, field_appearance_threshold,
                                          map_inference_threshold, depth + 1, null_type);
}

static LogicalType StructureToTypeObject(ClientContext &context, const JSONStructureNode &node, const idx_t max_depth,
                                         const double field_appearance_threshold, const idx_t map_inference_threshold,
                                         const idx_t depth, const LogicalType &null_type) {
    D_ASSERT(node.descriptions.size() == 1 && node.descriptions[0].type == LogicalTypeId::STRUCT);
    auto &desc = node.descriptions[0];

    if (desc.children.empty()) {
        if (map_inference_threshold != DConstants::INVALID_INDEX) {
            // Empty struct - let's do MAP of JSON instead
            return LogicalType::MAP(LogicalType::VARCHAR, null_type);
        } else {
            return LogicalType::JSON();
        }
    }

    // If it's an inconsistent object we also just do MAP with the best-possible, recursively-merged value type
    if (map_inference_threshold != DConstants::INVALID_INDEX &&
        IsStructureInconsistent(desc, node.count, node.null_count, field_appearance_threshold)) {
        return LogicalType::MAP(LogicalType::VARCHAR,
                                GetMergedType(context, node, max_depth, field_appearance_threshold,
                                              map_inference_threshold, depth + 1, null_type));
    }

    // We have a consistent object
    child_list_t<LogicalType> child_types;
    child_types.reserve(desc.children.size());
    for (auto &child : desc.children) {
        D_ASSERT(child.key);
        child_types.emplace_back(*child.key,
                                 JSONStructure::StructureToType(context, child, max_depth, field_appearance_threshold,
                                                                map_inference_threshold, depth + 1, null_type));
    }

    // If we have many children and all children have similar-enough types we infer map
    if (desc.children.size() >= map_inference_threshold) {
        LogicalType map_value_type = GetMergedType(context, node, max_depth, field_appearance_threshold,
                                                   map_inference_threshold, depth + 1, LogicalTypeId::SQLNULL);

        double total_similarity = 0;
        for (const auto &child_type : child_types) {
            const auto similarity = CalculateTypeSimilarity(map_value_type, child_type.second, max_depth, depth + 1);
            if (similarity < 0) {
                total_similarity = similarity;
                break;
            }
            total_similarity += similarity;
        }
        const auto avg_similarity = total_similarity / static_cast<double>(child_types.size());
        if (avg_similarity >= 0.8) {
            if (null_type != LogicalTypeId::SQLNULL) {
                map_value_type = GetMergedType(context, node, max_depth, field_appearance_threshold,
                                               map_inference_threshold, depth + 1, null_type);
            }
            return LogicalType::MAP(LogicalType::VARCHAR, map_value_type);
        }
    }

    return LogicalType::STRUCT(child_types);
}
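
// Note: the 0.8 average-similarity cutoff above is a fixed heuristic; only map_inference_threshold
// (the minimum number of children) is exposed as a user-facing parameter.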

static LogicalType StructureToTypeString(const JSONStructureNode &node) {
    D_ASSERT(node.descriptions.size() == 1 && node.descriptions[0].type == LogicalTypeId::VARCHAR);
    auto &desc = node.descriptions[0];
    if (desc.candidate_types.empty()) {
        return LogicalTypeId::VARCHAR;
    }
    return desc.candidate_types.back();
}

LogicalType JSONStructure::StructureToType(ClientContext &context, const JSONStructureNode &node, const idx_t max_depth,
                                           const double field_appearance_threshold, const idx_t map_inference_threshold,
                                           const idx_t depth, const LogicalType &null_type) {
    if (depth >= max_depth) {
        return LogicalType::JSON();
    }
    if (node.descriptions.empty()) {
        return null_type;
    }
    if (node.descriptions.size() != 1) { // Inconsistent types, so we resort to JSON
        return LogicalType::JSON();
    }
    auto &desc = node.descriptions[0];
    D_ASSERT(desc.type != LogicalTypeId::INVALID);
    switch (desc.type) {
    case LogicalTypeId::LIST:
        return StructureToTypeArray(context, node, max_depth, field_appearance_threshold, map_inference_threshold,
                                    depth, null_type);
    case LogicalTypeId::STRUCT:
        return StructureToTypeObject(context, node, max_depth, field_appearance_threshold, map_inference_threshold,
                                     depth, null_type);
    case LogicalTypeId::VARCHAR:
        return StructureToTypeString(node);
    case LogicalTypeId::UBIGINT:
        return LogicalTypeId::BIGINT; // We prefer not to return UBIGINT in our type auto-detection
    case LogicalTypeId::SQLNULL:
        return null_type;
    default:
        return desc.type;
    }
}

} // namespace duckdb

414 external/duckdb/extension/json/json_functions/json_table_in_out.cpp vendored Normal file
@@ -0,0 +1,414 @@
#include "json_common.hpp"
#include "json_functions.hpp"
#include "duckdb/function/table_function.hpp"

namespace duckdb {

enum class JSONTableInOutType { EACH, TREE };

static unique_ptr<FunctionData> JSONTableInOutBind(ClientContext &, TableFunctionBindInput &input,
                                                   vector<LogicalType> &return_types, vector<string> &names) {
    const child_list_t<LogicalType> schema {
        {"key", LogicalType::VARCHAR},     {"value", LogicalType::JSON()}, {"type", LogicalType::VARCHAR},
        {"atom", LogicalType::JSON()},     {"id", LogicalType::UBIGINT},   {"parent", LogicalType::UBIGINT},
        {"fullkey", LogicalType::VARCHAR}, {"path", LogicalType::VARCHAR},
    };

    // Add all default columns
    names.reserve(schema.size());
    return_types.reserve(schema.size());
    for (const auto &col : schema) {
        names.emplace_back(col.first);
        return_types.emplace_back(col.second);
    }

    return nullptr;
}

struct JSONTableInOutGlobalState : GlobalTableFunctionState {
    JSONTableInOutGlobalState() {
    }

    //! Regular columns
    optional_idx key_column_index;
    optional_idx value_column_index;
    optional_idx type_column_index;
    optional_idx atom_column_index;
    optional_idx id_column_index;
    optional_idx parent_column_index;
    optional_idx fullkey_column_index;
    optional_idx path_column_index;

    //! Virtual columns
    optional_idx json_column_index;
    optional_idx root_column_index;
    optional_idx empty_column_index;
    optional_idx rowid_column_index;

    static constexpr idx_t JSON_COLUMN_OFFSET = 0;
    static constexpr idx_t ROOT_COLUMN_OFFSET = 1;
};

static unique_ptr<GlobalTableFunctionState> JSONTableInOutInitGlobal(ClientContext &, TableFunctionInitInput &input) {
    auto result = make_uniq<JSONTableInOutGlobalState>();
    for (idx_t i = 0; i < input.column_indexes.size(); i++) {
        const auto &col_idx = input.column_indexes[i];
        if (!col_idx.IsVirtualColumn()) {
            switch (col_idx.GetPrimaryIndex()) {
            case 0:
                result->key_column_index = i;
                break;
            case 1:
                result->value_column_index = i;
                break;
            case 2:
                result->type_column_index = i;
                break;
            case 3:
                result->atom_column_index = i;
                break;
            case 4:
                result->id_column_index = i;
                break;
            case 5:
                result->parent_column_index = i;
                break;
            case 6:
                result->fullkey_column_index = i;
                break;
            case 7:
                result->path_column_index = i;
                break;
            default:
                throw NotImplementedException("Column %llu for json_each/json_tree", col_idx.GetPrimaryIndex());
            }
        } else {
            if (col_idx.GetPrimaryIndex() == VIRTUAL_COLUMN_START + JSONTableInOutGlobalState::JSON_COLUMN_OFFSET) {
                result->json_column_index = i;
            } else if (col_idx.GetPrimaryIndex() ==
                       VIRTUAL_COLUMN_START + JSONTableInOutGlobalState::ROOT_COLUMN_OFFSET) {
                result->root_column_index = i;
            } else if (col_idx.IsEmptyColumn()) {
                result->empty_column_index = i;
            } else if (col_idx.IsRowIdColumn()) {
                result->rowid_column_index = i;
            } else {
                throw NotImplementedException("Virtual column %llu for json_each/json_tree", col_idx.GetPrimaryIndex());
            }
        }
    }
    return std::move(result);
}

struct JSONTableInOutRecursionNode {
    JSONTableInOutRecursionNode(string path_p, yyjson_val *parent_val_p)
        : path(std::move(path_p)), parent_val(parent_val_p), child_index(0) {
    }

    string path;
    yyjson_val *parent_val;
    idx_t child_index;
};

struct JSONTableInOutLocalState : LocalTableFunctionState {
    explicit JSONTableInOutLocalState(ClientContext &context)
        : json_allocator(BufferAllocator::Get(context)), alc(json_allocator.GetYYAlc()), len(DConstants::INVALID_INDEX),
          doc(nullptr), initialized(false), total_count(0) {
    }

    string GetPath() const {
        auto result = path;
        for (const auto &ri : recursion_nodes) {
            result += ri.path;
        }
        return result;
    }

    void AddRecursionNode(yyjson_val *val, optional_ptr<yyjson_val> vkey, const optional_idx arr_index) {
        string str;
        if (vkey) {
            str = "." + string(unsafe_yyjson_get_str(vkey.get()), unsafe_yyjson_get_len(vkey.get()));
        } else if (arr_index.IsValid()) {
            str = "[" + to_string(arr_index.GetIndex()) + "]";
        }
        recursion_nodes.emplace_back(str, val);
    }

    JSONAllocator json_allocator;
    yyjson_alc *alc;

    string path;
    idx_t len;
    yyjson_doc *doc;
    bool initialized;

    idx_t total_count;
    vector<JSONTableInOutRecursionNode> recursion_nodes;
};

static unique_ptr<LocalTableFunctionState> JSONTableInOutInitLocal(ExecutionContext &context, TableFunctionInitInput &,
                                                                   GlobalTableFunctionState *) {
    return make_uniq<JSONTableInOutLocalState>(context.client);
}

template <class T>
struct JSONTableInOutResultVector {
    explicit JSONTableInOutResultVector(DataChunk &output, const optional_idx &output_column_index)
        : enabled(output_column_index.IsValid()), vector(output.data[enabled ? output_column_index.GetIndex() : 0]),
          data(enabled ? FlatVector::GetData<T>(vector) : nullptr), validity(FlatVector::Validity(vector)) {
    }
    const bool enabled;
    Vector &vector;
    T *data;
    ValidityMask &validity;
};

struct JSONTableInOutResult {
    explicit JSONTableInOutResult(const JSONTableInOutGlobalState &gstate, DataChunk &output)
        : count(0), key(output, gstate.key_column_index), value(output, gstate.value_column_index),
          type(output, gstate.type_column_index), atom(output, gstate.atom_column_index),
          id(output, gstate.id_column_index), parent(output, gstate.parent_column_index),
          fullkey(output, gstate.fullkey_column_index), path(output, gstate.path_column_index),
          rowid(output, gstate.rowid_column_index) {
    }

    template <JSONTableInOutType TYPE>
    void AddRow(JSONTableInOutLocalState &lstate, optional_ptr<yyjson_val> vkey, yyjson_val *val) {
        const auto &recursion_nodes = lstate.recursion_nodes;
        const auto arr_el = !recursion_nodes.empty() && unsafe_yyjson_is_arr(recursion_nodes.back().parent_val);
        if (key.enabled) {
            if (vkey) { // Object field
                key.data[count] = string_t(unsafe_yyjson_get_str(vkey.get()), unsafe_yyjson_get_len(vkey.get()));
            } else if (arr_el) { // Array element
                key.data[count] = StringVector::AddString(key.vector, to_string(recursion_nodes.back().child_index));
            } else { // Other
                key.validity.SetInvalid(count);
            }
        }
        if (value.enabled) {
            value.data[count] = JSONCommon::WriteVal(val, lstate.alc);
        }
        if (type.enabled) {
            type.data[count] = JSONCommon::ValTypeToStringT(val);
        }
        if (atom.enabled) {
            atom.data[count] = JSONCommon::JSONValue(val, lstate.alc, atom.vector, atom.validity, count);
        }
        if (id.enabled) {
            id.data[count] = NumericCast<idx_t>(val - lstate.doc->root);
        }
        if (parent.enabled) {
            if (TYPE == JSONTableInOutType::EACH || recursion_nodes.empty()) {
                parent.validity.SetInvalid(count);
            } else {
                parent.data[count] = NumericCast<uint64_t>(recursion_nodes.back().parent_val - lstate.doc->root);
            }
        }
        const auto path_str = lstate.GetPath();
        if (fullkey.enabled) {
            if (vkey) { // Object field
                const auto vkey_str = string(unsafe_yyjson_get_str(vkey.get()), unsafe_yyjson_get_len(vkey.get()));
                fullkey.data[count] = StringVector::AddString(fullkey.vector, path_str + "." + vkey_str);
            } else if (arr_el) { // Array element
                const auto arr_path = "[" + to_string(recursion_nodes.back().child_index) + "]";
                fullkey.data[count] = StringVector::AddString(fullkey.vector, path_str + arr_path);
            } else { // Other
                fullkey.data[count] = StringVector::AddString(fullkey.vector, path_str);
            }
        }
        if (path.enabled) {
            path.data[count] = StringVector::AddString(path.vector, path_str);
        }
        if (rowid.enabled) {
            rowid.data[count] = NumericCast<int64_t>(lstate.total_count++);
        }
        count++;
    }

    idx_t count;
    JSONTableInOutResultVector<string_t> key;
    JSONTableInOutResultVector<string_t> value;
    JSONTableInOutResultVector<string_t> type;
    JSONTableInOutResultVector<string_t> atom;
    JSONTableInOutResultVector<uint64_t> id;
    JSONTableInOutResultVector<uint64_t> parent;
    JSONTableInOutResultVector<string_t> fullkey;
    JSONTableInOutResultVector<string_t> path;
    JSONTableInOutResultVector<int64_t> rowid;
};

template <JSONTableInOutType TYPE>
static void InitializeLocalState(JSONTableInOutLocalState &lstate, DataChunk &input, JSONTableInOutResult &result) {
    lstate.total_count = 0;

    // Parse path, default to root if not given
    Value path_value("$");
    if (input.data.size() > 1) {
        auto &path_vector = input.data[1];
        if (ConstantVector::IsNull(path_vector)) {
            return;
        }
        path_value = ConstantVector::GetData<string_t>(path_vector)[0];
    }

    if (JSONReadFunctionData::CheckPath(path_value, lstate.path, lstate.len) == JSONCommon::JSONPathType::WILDCARD) {
        throw BinderException("Wildcard JSON path not supported in json_each/json_tree");
    }

    if (lstate.path.c_str()[0] != '$') {
        throw BinderException("JSON path must start with '$' for json_each/json_tree");
    }

    // Parse document and get the value at the supplied path
    const auto &input_vector = input.data[0];
    if (ConstantVector::IsNull(input_vector)) {
        return;
    }
    const auto &input_data = FlatVector::GetData<string_t>(input_vector)[0];
    lstate.doc = JSONCommon::ReadDocument(input_data, JSONCommon::READ_FLAG, lstate.alc);
    const auto root = JSONCommon::GetUnsafe(lstate.doc->root, lstate.path.c_str(), lstate.len);

    if (!root) {
        return;
    }

    const auto is_container = unsafe_yyjson_is_arr(root) || unsafe_yyjson_is_obj(root);
    if (!is_container || TYPE == JSONTableInOutType::TREE) {
        result.AddRow<TYPE>(lstate, nullptr, root);
    }
    if (is_container) {
        lstate.AddRecursionNode(root, nullptr, optional_idx());
    }
}

template <JSONTableInOutType TYPE>
static bool JSONTableInOutHandleValue(JSONTableInOutLocalState &lstate, JSONTableInOutResult &result,
                                      idx_t &child_index, size_t &idx, yyjson_val *child_key, yyjson_val *child_val) {
    if (idx < child_index) {
        return false; // Continue: Get back to where we left off
    }
    result.AddRow<TYPE>(lstate, child_key, child_val);
    child_index++; // We finished processing the array element
    if (TYPE == JSONTableInOutType::TREE && (unsafe_yyjson_is_arr(child_val) || unsafe_yyjson_is_obj(child_val))) {
        lstate.AddRecursionNode(child_val, child_key, idx);
        return true; // Break: We added a recursion node, go depth-first
    }
    if (result.count == STANDARD_VECTOR_SIZE) {
        return true; // Break: Vector is full
    }
    return false; // Continue: Next element
}

template <JSONTableInOutType TYPE>
static OperatorResultType JSONTableInOutFunction(ExecutionContext &, TableFunctionInput &data_p, DataChunk &input,
                                                 DataChunk &output) {
    auto &gstate = data_p.global_state->Cast<JSONTableInOutGlobalState>();
    auto &lstate = data_p.local_state->Cast<JSONTableInOutLocalState>();

    JSONTableInOutResult result(gstate, output);
    if (!lstate.initialized) {
        InitializeLocalState<TYPE>(lstate, input, result);
        lstate.initialized = true;
    }

    // Traverse the JSON (keeping a stack to avoid recursion and save progress across calls)
    auto &recursion_nodes = lstate.recursion_nodes;
    while (!lstate.recursion_nodes.empty() && result.count != STANDARD_VECTOR_SIZE) {
        auto &parent_val = recursion_nodes.back().parent_val;
        auto &child_index = recursion_nodes.back().child_index;

        size_t idx, max;
        yyjson_val *child_key, *child_val;
        switch (yyjson_get_tag(parent_val)) {
        case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
            yyjson_arr_foreach(parent_val, idx, max, child_val) {
                if (JSONTableInOutHandleValue<TYPE>(lstate, result, child_index, idx, nullptr, child_val)) {
                    break;
                }
            }
            break;
        case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
            yyjson_obj_foreach(parent_val, idx, max, child_key, child_val) {
                if (JSONTableInOutHandleValue<TYPE>(lstate, result, child_index, idx, child_key, child_val)) {
                    break;
                }
            }
            break;
        default:
            throw InternalException("Non-object/array JSON added to recursion in json_each/json_tree");
        }
        if (idx == max) {
            lstate.recursion_nodes.pop_back(); // Array/object is done, remove
        }
    }
    output.SetCardinality(result.count);

    // Set constant virtual columns ("json", "root", and "empty")
    if (gstate.json_column_index.IsValid()) {
        auto &json_vector = output.data[gstate.json_column_index.GetIndex()];
        json_vector.Reference(input.data[0]);
    }
    if (gstate.root_column_index.IsValid()) {
        auto &root_vector = output.data[gstate.root_column_index.GetIndex()];
        root_vector.SetVectorType(VectorType::CONSTANT_VECTOR);
        FlatVector::GetData<string_t>(root_vector)[0] = string_t(lstate.path.c_str(), lstate.len);
    }
    if (gstate.empty_column_index.IsValid()) {
        auto &empty_vector = output.data[gstate.empty_column_index.GetIndex()];
        empty_vector.SetVectorType(VectorType::CONSTANT_VECTOR);
        ConstantVector::SetNull(empty_vector, true);
    }

    if (output.size() == 0) {
        D_ASSERT(recursion_nodes.empty());
        lstate.json_allocator.Reset();
        lstate.initialized = false;
        return OperatorResultType::NEED_MORE_INPUT;
    }
    return OperatorResultType::HAVE_MORE_OUTPUT;
}

virtual_column_map_t GetJSONTableInOutVirtualColumns(ClientContext &, optional_ptr<FunctionData>) {
    virtual_column_map_t result;
    result.insert(make_pair(VIRTUAL_COLUMN_START + JSONTableInOutGlobalState::JSON_COLUMN_OFFSET,
                            TableColumn("json", LogicalType::JSON())));
    result.insert(make_pair(VIRTUAL_COLUMN_START + JSONTableInOutGlobalState::ROOT_COLUMN_OFFSET,
                            TableColumn("root", LogicalType::VARCHAR)));
    result.insert(make_pair(COLUMN_IDENTIFIER_EMPTY, TableColumn("", LogicalType::BOOLEAN)));
    result.insert(make_pair(COLUMN_IDENTIFIER_ROW_ID, TableColumn("rowid", LogicalType::BIGINT)));
    return result;
}

template <JSONTableInOutType TYPE>
TableFunction GetJSONTableInOutFunction(const LogicalType &input_type, const bool &has_path_param) {
    vector<LogicalType> arguments = {input_type};
    if (has_path_param) {
        arguments.push_back(LogicalType::VARCHAR);
    }
    TableFunction function(arguments, nullptr, JSONTableInOutBind, JSONTableInOutInitGlobal, JSONTableInOutInitLocal);
    function.in_out_function = JSONTableInOutFunction<TYPE>;
    function.get_virtual_columns = GetJSONTableInOutVirtualColumns;
    function.projection_pushdown = true;
    return function;
}

TableFunctionSet JSONFunctions::GetJSONEachFunction() {
    TableFunctionSet set("json_each");
    set.AddFunction(GetJSONTableInOutFunction<JSONTableInOutType::EACH>(LogicalType::VARCHAR, false));
    set.AddFunction(GetJSONTableInOutFunction<JSONTableInOutType::EACH>(LogicalType::VARCHAR, true));
    set.AddFunction(GetJSONTableInOutFunction<JSONTableInOutType::EACH>(LogicalType::JSON(), false));
    set.AddFunction(GetJSONTableInOutFunction<JSONTableInOutType::EACH>(LogicalType::JSON(), true));
    return set;
}
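
// Example usage (illustrative): json_each emits one row per direct child of the value at the
// (optional) path; containers are not recursed into.
//   SELECT key, value, type FROM json_each('{"a": 1, "b": [2, 3]}');
//   -- a | 1     | UBIGINT
//   -- b | [2,3] | ARRAY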

TableFunctionSet JSONFunctions::GetJSONTreeFunction() {
    TableFunctionSet set("json_tree");
    set.AddFunction(GetJSONTableInOutFunction<JSONTableInOutType::TREE>(LogicalType::VARCHAR, false));
    set.AddFunction(GetJSONTableInOutFunction<JSONTableInOutType::TREE>(LogicalType::VARCHAR, true));
    set.AddFunction(GetJSONTableInOutFunction<JSONTableInOutType::TREE>(LogicalType::JSON(), false));
    set.AddFunction(GetJSONTableInOutFunction<JSONTableInOutType::TREE>(LogicalType::JSON(), true));
    return set;
}
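
// Example usage (illustrative): json_tree walks the whole document depth-first, also emitting the
// container values themselves; "parent" links each row to the id of its enclosing container.
//   SELECT id, parent, fullkey, type FROM json_tree('{"a": [1, 2]}');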

} // namespace duckdb

1064 external/duckdb/extension/json/json_functions/json_transform.cpp vendored Normal file
File diff suppressed because it is too large

38 external/duckdb/extension/json/json_functions/json_type.cpp vendored Normal file
@@ -0,0 +1,38 @@
#include "json_executors.hpp"

namespace duckdb {

static inline string_t GetType(yyjson_val *val, yyjson_alc *, Vector &, ValidityMask &, idx_t) {
    return JSONCommon::ValTypeToStringT(val);
}

static void UnaryTypeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
    JSONExecutors::UnaryExecute<string_t>(args, state, result, GetType);
}

static void BinaryTypeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
    JSONExecutors::BinaryExecute<string_t>(args, state, result, GetType);
}

static void ManyTypeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
    JSONExecutors::ExecuteMany<string_t>(args, state, result, GetType);
}

static void GetTypeFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
    set.AddFunction(ScalarFunction({input_type}, LogicalType::VARCHAR, UnaryTypeFunction, nullptr, nullptr, nullptr,
                                   JSONFunctionLocalState::Init));
    set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::VARCHAR, BinaryTypeFunction,
                                   JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
    set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
                                   LogicalType::LIST(LogicalType::VARCHAR), ManyTypeFunction,
                                   JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetTypeFunction() {
    ScalarFunctionSet set("json_type");
    GetTypeFunctionsInternal(set, LogicalType::VARCHAR);
    GetTypeFunctionsInternal(set, LogicalType::JSON());
    return set;
}
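
// Example usage (illustrative):
//   SELECT json_type('[1, 2, 3]');              -- ARRAY
//   SELECT json_type('{"a": 1}', '$.a');        -- UBIGINT
//   SELECT json_type('{"a": 1}', ['$.a', '$']); -- [UBIGINT, OBJECT]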

} // namespace duckdb

27 external/duckdb/extension/json/json_functions/json_valid.cpp vendored Normal file
@@ -0,0 +1,27 @@
#include "json_executors.hpp"

namespace duckdb {

static void ValidFunction(DataChunk &args, ExpressionState &state, Vector &result) {
    auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
    auto alc = lstate.json_allocator->GetYYAlc();
    auto &inputs = args.data[0];
    UnaryExecutor::Execute<string_t, bool>(inputs, result, args.size(), [&](string_t input) {
        return JSONCommon::ReadDocumentUnsafe(input, JSONCommon::READ_FLAG, alc);
    });
}

static void GetValidFunctionInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
    set.AddFunction(ScalarFunction("json_valid", {input_type}, LogicalType::BOOLEAN, ValidFunction, nullptr, nullptr,
                                   nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetValidFunction() {
    ScalarFunctionSet set("json_valid");
    GetValidFunctionInternal(set, LogicalType::VARCHAR);
    GetValidFunctionInternal(set, LogicalType::JSON());

    return set;
}
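
// Example usage (illustrative): a document is "valid" iff it parses.
//   SELECT json_valid('{"a": 1}'); -- true
//   SELECT json_valid('{"a": 1');  -- false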

} // namespace duckdb

31 external/duckdb/extension/json/json_functions/json_value.cpp vendored Normal file
@@ -0,0 +1,31 @@
#include "json_executors.hpp"

namespace duckdb {

static void ValueFunction(DataChunk &args, ExpressionState &state, Vector &result) {
    JSONExecutors::BinaryExecute<string_t>(args, state, result, JSONCommon::JSONValue);
}

static void ValueManyFunction(DataChunk &args, ExpressionState &state, Vector &result) {
    JSONExecutors::ExecuteMany<string_t>(args, state, result, JSONCommon::JSONValue);
}

static void GetValueFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
    set.AddFunction(ScalarFunction({input_type, LogicalType::BIGINT}, LogicalType::VARCHAR, ValueFunction,
                                   JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
    set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::VARCHAR, ValueFunction,
                                   JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
    set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
                                   LogicalType::LIST(LogicalType::VARCHAR), ValueManyFunction,
                                   JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
}

ScalarFunctionSet JSONFunctions::GetValueFunction() {
    // The value function is just like the extract function but returns NULL if the JSON is not a scalar value
    ScalarFunctionSet set("json_value");
    GetValueFunctionsInternal(set, LogicalType::VARCHAR);
    GetValueFunctionsInternal(set, LogicalType::JSON());
    return set;
}
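
// Example usage (illustrative):
//   SELECT json_value('{"a": {"b": 42}}', '$.a.b'); -- 42
//   SELECT json_value('{"a": {"b": 42}}', '$.a');   -- NULL (not a scalar value)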

} // namespace duckdb

288 external/duckdb/extension/json/json_functions/read_json.cpp vendored Normal file
@@ -0,0 +1,288 @@
#include "duckdb/common/helper.hpp"
#include "duckdb/common/multi_file/multi_file_reader.hpp"
#include "json_functions.hpp"
#include "json_scan.hpp"
#include "json_structure.hpp"
#include "json_transform.hpp"
#include "json_multi_file_info.hpp"
#include "duckdb/parallel/task_executor.hpp"

namespace duckdb {

static inline LogicalType RemoveDuplicateStructKeys(const LogicalType &type, const bool ignore_errors) {
    switch (type.id()) {
    case LogicalTypeId::STRUCT: {
        case_insensitive_set_t child_names;
        child_list_t<LogicalType> child_types;
        for (auto &child_type : StructType::GetChildTypes(type)) {
            auto insert_success = child_names.insert(child_type.first).second;
            if (!insert_success) {
                if (ignore_errors) {
                    continue;
                }
                throw NotImplementedException(
                    "Duplicate name \"%s\" in struct auto-detected in JSON, try ignore_errors=true", child_type.first);
            } else {
                child_types.emplace_back(child_type.first, RemoveDuplicateStructKeys(child_type.second, ignore_errors));
            }
        }
        return LogicalType::STRUCT(child_types);
    }
    case LogicalTypeId::MAP:
        return LogicalType::MAP(RemoveDuplicateStructKeys(MapType::KeyType(type), ignore_errors),
                                RemoveDuplicateStructKeys(MapType::ValueType(type), ignore_errors));
    case LogicalTypeId::LIST:
        return LogicalType::LIST(RemoveDuplicateStructKeys(ListType::GetChildType(type), ignore_errors));
    default:
        return type;
    }
}

struct AutoDetectState {
    AutoDetectState(ClientContext &context_p, MultiFileBindData &bind_data_p, const vector<OpenFileInfo> &files,
                    MutableDateFormatMap &date_format_map)
        : context(context_p), bind_data(bind_data_p), files(files), date_format_map(date_format_map), files_scanned(0),
          tuples_scanned(0), bytes_scanned(0), total_file_size(0) {
    }

    ClientContext &context;
    MultiFileBindData &bind_data;
    const vector<OpenFileInfo> &files;
    MutableDateFormatMap &date_format_map;
    atomic<idx_t> files_scanned;
    atomic<idx_t> tuples_scanned;
    atomic<idx_t> bytes_scanned;
    atomic<idx_t> total_file_size;
};

class JSONSchemaTask : public BaseExecutorTask {
public:
    JSONSchemaTask(TaskExecutor &executor, AutoDetectState &auto_detect_state, JSONStructureNode &node_p,
                   const idx_t file_idx_start_p, const idx_t file_idx_end_p)
        : BaseExecutorTask(executor), auto_detect_state(auto_detect_state), node(node_p),
          file_idx_start(file_idx_start_p), file_idx_end(file_idx_end_p),
          allocator(BufferAllocator::Get(auto_detect_state.context)), string_vector(LogicalType::VARCHAR) {
    }

    static idx_t ExecuteInternal(AutoDetectState &auto_detect_state, JSONStructureNode &node, const idx_t file_idx,
                                 ArenaAllocator &allocator, Vector &string_vector, idx_t remaining) {
        auto &context = auto_detect_state.context;
        auto &bind_data = auto_detect_state.bind_data;
        auto &files = auto_detect_state.files;
        auto &json_data = bind_data.bind_data->Cast<JSONScanData>();
        auto json_reader = make_shared_ptr<JSONReader>(context, json_data.options, files[file_idx].path);
        if (bind_data.union_readers[file_idx]) {
            throw InternalException("Union data already set");
        }
        auto &reader = *json_reader;
        auto union_data = make_uniq<BaseUnionData>(files[file_idx].path);
        union_data->reader = std::move(json_reader);
        bind_data.union_readers[file_idx] = std::move(union_data);

        auto &global_allocator = Allocator::Get(context);
        idx_t buffer_capacity = json_data.options.maximum_object_size * 2;
        JSONReaderScanState scan_state(context, global_allocator, buffer_capacity);
        auto &options = json_data.options;
        // Read and detect schema
        idx_t total_tuple_count = 0;
        idx_t total_read_size = 0;

        reader.Initialize(global_allocator, buffer_capacity);
        reader.InitializeScan(scan_state, JSONFileReadType::SCAN_ENTIRE_FILE);

        auto file_size = reader.GetFileHandle().GetHandle().GetFileSize();
        while (remaining != 0) {
            allocator.Reset();
            auto buffer_offset_before = scan_state.buffer_offset;
            auto read_count = reader.Scan(scan_state);
            if (read_count == 0) {
                break;
            }
            total_read_size += scan_state.buffer_offset - buffer_offset_before;
            total_tuple_count += read_count;

            const auto next = MinValue<idx_t>(read_count, remaining);
            for (idx_t i = 0; i < next; i++) {
                const auto &val = scan_state.values[i];
                if (val) {
                    JSONStructure::ExtractStructure(val, node, true);
                }
            }
            if (!node.ContainsVarchar()) { // Can't refine non-VARCHAR types
                continue;
            }
            node.InitializeCandidateTypes(options.max_depth, options.convert_strings_to_integers);
            node.RefineCandidateTypes(scan_state.values, next, string_vector, allocator,
                                      auto_detect_state.date_format_map);
            remaining -= next;
        }
        auto_detect_state.total_file_size += file_size;
        auto_detect_state.bytes_scanned += total_read_size;
        auto_detect_state.tuples_scanned += total_tuple_count;
        ++auto_detect_state.files_scanned;

        return remaining;
    }

    void ExecuteTask() override {
        auto &json_data = auto_detect_state.bind_data.bind_data->Cast<JSONScanData>();
        auto &options = json_data.options;
        for (idx_t file_idx = file_idx_start; file_idx < file_idx_end; file_idx++) {
            ExecuteInternal(auto_detect_state, node, file_idx, allocator, string_vector, options.sample_size);
        }
    }

    string TaskType() const override {
        return "JSONSchemaTask";
    }

private:
    AutoDetectState &auto_detect_state;
    JSONStructureNode &node;
    const idx_t file_idx_start;
    const idx_t file_idx_end;

    ArenaAllocator allocator;
    Vector string_vector;
};
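
//! AutoDetect samples up to sample_size tuples per file, scheduling one JSONSchemaTask per batch of
//! files, then merges the per-task structure nodes and converts the result to a logical type.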
void JSONScan::AutoDetect(ClientContext &context, MultiFileBindData &bind_data, vector<LogicalType> &return_types,
                          vector<string> &names) {
    auto &json_data = bind_data.bind_data->Cast<JSONScanData>();

    MutableDateFormatMap date_format_map(*json_data.date_format_map);
    JSONStructureNode node;
    auto &options = json_data.options;
    auto files = bind_data.file_list->GetAllFiles();
    auto file_count = bind_data.file_options.union_by_name
                          ? files.size()
                          : MinValue<idx_t>(options.maximum_sample_files, files.size());
    bind_data.union_readers.resize(files.empty() ? 0 : files.size());

    AutoDetectState auto_detect_state(context, bind_data, files, date_format_map);
    const auto num_threads = NumericCast<idx_t>(TaskScheduler::GetScheduler(context).NumberOfThreads());
    const auto files_per_task = (file_count + num_threads - 1) / num_threads;
    const auto num_tasks = (file_count + files_per_task - 1) / files_per_task;
    vector<JSONStructureNode> task_nodes(num_tasks);

    // Same idea as in union_by_name.hpp
    TaskExecutor executor(context);
    for (idx_t task_idx = 0; task_idx < num_tasks; task_idx++) {
        const auto file_idx_start = task_idx * files_per_task;
        const auto file_idx_end = MinValue(file_idx_start + files_per_task, file_count);
        auto task =
            make_uniq<JSONSchemaTask>(executor, auto_detect_state, task_nodes[task_idx], file_idx_start, file_idx_end);
        executor.ScheduleTask(std::move(task));
    }
    executor.WorkOnTasks();

    // Merge task nodes into one
    for (auto &task_node : task_nodes) {
        JSONStructure::MergeNodes(node, task_node);
    }

    // Set the max threads/estimated per-file cardinality
    if (auto_detect_state.files_scanned > 0 && auto_detect_state.tuples_scanned > 0) {
        auto average_tuple_size =
            MaxValue<idx_t>(auto_detect_state.bytes_scanned / auto_detect_state.tuples_scanned, 1);
        json_data.estimated_cardinality_per_file = auto_detect_state.total_file_size / average_tuple_size;
        if (auto_detect_state.files_scanned == 1) {
            json_data.max_threads =
                MaxValue<idx_t>(auto_detect_state.total_file_size / json_data.options.maximum_object_size, 1);
        }
    }

    // Convert structure to logical type
    auto type = JSONStructure::StructureToType(context, node, options.max_depth, options.field_appearance_threshold,
                                               options.map_inference_threshold);

    // Auto-detect record type
    if (json_data.options.record_type == JSONRecordType::AUTO_DETECT) {
        if (type.id() == LogicalTypeId::STRUCT) {
            json_data.options.record_type = JSONRecordType::RECORDS;
        } else {
            json_data.options.record_type = JSONRecordType::VALUES;
        }
    }

    if (!names.empty()) {
        // COPY - we already have names/types
        return;
    }

    // Auto-detect columns
    if (json_data.options.record_type == JSONRecordType::RECORDS) {
        if (type.id() == LogicalTypeId::STRUCT) {
            const auto &child_types = StructType::GetChildTypes(type);
            return_types.reserve(child_types.size());
            names.reserve(child_types.size());
            for (auto &child_type : child_types) {
                return_types.emplace_back(RemoveDuplicateStructKeys(child_type.second, options.ignore_errors));
                names.emplace_back(child_type.first);
            }
        } else {
            throw BinderException("read_json expected records, but got non-record JSON instead."
                                  "\n Try setting records='auto' or records='false'.");
        }
    } else {
        D_ASSERT(json_data.options.record_type == JSONRecordType::VALUES);
        return_types.emplace_back(RemoveDuplicateStructKeys(type, options.ignore_errors));
        names.emplace_back("json");
    }
}

TableFunction JSONFunctions::GetReadJSONTableFunction(shared_ptr<JSONScanInfo> function_info) {
    MultiFileFunction<JSONMultiFileInfo> table_function("read_json");

    JSONScan::TableFunctionDefaults(table_function);
    table_function.named_parameters["columns"] = LogicalType::ANY;
    table_function.named_parameters["auto_detect"] = LogicalType::BOOLEAN;
    table_function.named_parameters["sample_size"] = LogicalType::BIGINT;
    table_function.named_parameters["dateformat"] = LogicalType::VARCHAR;
    table_function.named_parameters["date_format"] = LogicalType::VARCHAR;
    table_function.named_parameters["timestampformat"] = LogicalType::VARCHAR;
    table_function.named_parameters["timestamp_format"] = LogicalType::VARCHAR;
    table_function.named_parameters["records"] = LogicalType::VARCHAR;
    table_function.named_parameters["maximum_sample_files"] = LogicalType::BIGINT;

    // TODO: might be able to do filter pushdown/prune?
    table_function.function_info = std::move(function_info);

    return static_cast<TableFunction>(table_function);
}

TableFunctionSet CreateJSONFunctionInfo(string name, shared_ptr<JSONScanInfo> info) {
    auto table_function = JSONFunctions::GetReadJSONTableFunction(std::move(info));
    table_function.name = std::move(name);
    table_function.named_parameters["maximum_depth"] = LogicalType::BIGINT;
    table_function.named_parameters["field_appearance_threshold"] = LogicalType::DOUBLE;
    table_function.named_parameters["convert_strings_to_integers"] = LogicalType::BOOLEAN;
    table_function.named_parameters["map_inference_threshold"] = LogicalType::BIGINT;
    return MultiFileReader::CreateFunctionSet(table_function);
}

TableFunctionSet JSONFunctions::GetReadJSONFunction() {
    auto info = make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT,
                                              JSONRecordType::AUTO_DETECT, true);
    return CreateJSONFunctionInfo("read_json", std::move(info));
}
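
// Example usage (illustrative; 'records.json' is a placeholder file name, and the named parameters
// shown are among those registered above):
//   SELECT * FROM read_json('records.json', sample_size = 1000, maximum_depth = 4);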

TableFunctionSet JSONFunctions::GetReadNDJSONFunction() {
    auto info = make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED,
                                              JSONRecordType::AUTO_DETECT, true);
    return CreateJSONFunctionInfo("read_ndjson", std::move(info));
}

TableFunctionSet JSONFunctions::GetReadJSONAutoFunction() {
    auto info = make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT,
                                              JSONRecordType::AUTO_DETECT, true);
    return CreateJSONFunctionInfo("read_json_auto", std::move(info));
}

TableFunctionSet JSONFunctions::GetReadNDJSONAutoFunction() {
    auto info = make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED,
                                              JSONRecordType::AUTO_DETECT, true);
    return CreateJSONFunctionInfo("read_ndjson_auto", std::move(info));
}

} // namespace duckdb

37 external/duckdb/extension/json/json_functions/read_json_objects.cpp vendored Normal file
@@ -0,0 +1,37 @@
#include "json_common.hpp"
#include "json_functions.hpp"
#include "json_scan.hpp"
#include "duckdb/common/helper.hpp"
#include "json_multi_file_info.hpp"

namespace duckdb {

TableFunction GetReadJSONObjectsTableFunction(string name, shared_ptr<JSONScanInfo> function_info) {
    MultiFileFunction<JSONMultiFileInfo> table_function(std::move(name));
    JSONScan::TableFunctionDefaults(table_function);
    table_function.function_info = std::move(function_info);
    return static_cast<TableFunction>(table_function);
}

TableFunctionSet JSONFunctions::GetReadJSONObjectsFunction() {
    auto function_info = make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::AUTO_DETECT,
                                                       JSONRecordType::RECORDS);
    auto table_function = GetReadJSONObjectsTableFunction("read_json_objects", std::move(function_info));
    return MultiFileReader::CreateFunctionSet(std::move(table_function));
}
|
||||
|
||||
TableFunctionSet JSONFunctions::GetReadNDJSONObjectsFunction() {
|
||||
auto function_info = make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::NEWLINE_DELIMITED,
|
||||
JSONRecordType::RECORDS);
|
||||
auto table_function = GetReadJSONObjectsTableFunction("read_ndjson_objects", std::move(function_info));
|
||||
return MultiFileReader::CreateFunctionSet(std::move(table_function));
|
||||
}
|
||||
|
||||
TableFunctionSet JSONFunctions::GetReadJSONObjectsAutoFunction() {
|
||||
auto function_info = make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::AUTO_DETECT,
|
||||
JSONRecordType::RECORDS);
|
||||
auto table_function = GetReadJSONObjectsTableFunction("read_json_objects_auto", std::move(function_info));
|
||||
return MultiFileReader::CreateFunctionSet(std::move(table_function));
|
||||
}
|
||||
|
||||
} // namespace duckdb
|
||||
585
external/duckdb/extension/json/json_multi_file_info.cpp
vendored
Normal file
@@ -0,0 +1,585 @@
#include "json_multi_file_info.hpp"
#include "json_scan.hpp"
#include "duckdb/common/types/value.hpp"

namespace duckdb {

unique_ptr<MultiFileReaderInterface> JSONMultiFileInfo::CreateInterface(ClientContext &context) {
    return make_uniq<JSONMultiFileInfo>();
}

unique_ptr<BaseFileReaderOptions> JSONMultiFileInfo::InitializeOptions(ClientContext &context,
                                                                       optional_ptr<TableFunctionInfo> info) {
    auto reader_options = make_uniq<JSONFileReaderOptions>();
    auto &options = reader_options->options;
    if (info) {
        auto &scan_info = info->Cast<JSONScanInfo>();
        options.type = scan_info.type;
        options.format = scan_info.format;
        options.record_type = scan_info.record_type;
        options.auto_detect = scan_info.auto_detect;
        if (scan_info.type == JSONScanType::READ_JSON_OBJECTS) {
            // read_json_objects always emits a single JSON column called "json"
            options.sql_type_list.push_back(LogicalType::JSON());
            options.name_list.emplace_back("json");
        }
    } else {
        // COPY does not pass table function info - use the defaults for COPY (FORMAT JSON)
        options.type = JSONScanType::READ_JSON;
        options.record_type = JSONRecordType::RECORDS;
        options.format = JSONFormat::AUTO_DETECT;
        options.auto_detect = false;
    }
    return std::move(reader_options);
}
bool JSONMultiFileInfo::ParseOption(ClientContext &context, const string &key, const Value &value, MultiFileOptions &,
                                    BaseFileReaderOptions &options_p) {
    auto &reader_options = options_p.Cast<JSONFileReaderOptions>();
    auto &options = reader_options.options;
    if (value.IsNull()) {
        throw BinderException("Cannot use NULL as argument to key %s", key);
    }
    auto loption = StringUtil::Lower(key);
    if (loption == "ignore_errors") {
        options.ignore_errors = BooleanValue::Get(value);
        return true;
    }
    if (loption == "maximum_object_size") {
        options.maximum_object_size = MaxValue<idx_t>(UIntegerValue::Get(value), options.maximum_object_size);
        return true;
    }
    if (loption == "format") {
        auto arg = StringUtil::Lower(StringValue::Get(value));
        static const auto FORMAT_OPTIONS =
            case_insensitive_map_t<JSONFormat> {{"auto", JSONFormat::AUTO_DETECT},
                                                {"unstructured", JSONFormat::UNSTRUCTURED},
                                                {"newline_delimited", JSONFormat::NEWLINE_DELIMITED},
                                                {"nd", JSONFormat::NEWLINE_DELIMITED},
                                                {"array", JSONFormat::ARRAY}};
        auto lookup = FORMAT_OPTIONS.find(arg);
        if (lookup == FORMAT_OPTIONS.end()) {
            vector<string> valid_options;
            for (auto &pair : FORMAT_OPTIONS) {
                valid_options.push_back(StringUtil::Format("'%s'", pair.first));
            }
            throw BinderException("format must be one of [%s], not '%s'", StringUtil::Join(valid_options, ", "), arg);
        }
        options.format = lookup->second;
        return true;
    }
    if (loption == "compression") {
        options.compression = EnumUtil::FromString<FileCompressionType>(StringUtil::Upper(StringValue::Get(value)));
        return true;
    }
    if (loption == "columns") {
        auto &child_type = value.type();
        if (child_type.id() != LogicalTypeId::STRUCT) {
            throw BinderException("read_json \"columns\" parameter requires a struct as input.");
        }
        auto &struct_children = StructValue::GetChildren(value);
        D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
        for (idx_t i = 0; i < struct_children.size(); i++) {
            auto &name = StructType::GetChildName(child_type, i);
            auto &val = struct_children[i];
            if (val.IsNull()) {
                throw BinderException("read_json \"columns\" parameter type specification cannot be NULL.");
            }
            options.name_list.push_back(name);
            if (val.type().id() != LogicalTypeId::VARCHAR) {
                throw BinderException("read_json \"columns\" parameter type specification must be VARCHAR.");
            }
            options.sql_type_list.emplace_back(TransformStringToLogicalType(StringValue::Get(val), context));
        }
        D_ASSERT(options.name_list.size() == options.sql_type_list.size());
        if (options.name_list.empty()) {
            throw BinderException("read_json \"columns\" parameter needs at least one column.");
        }
        return true;
    }
    if (loption == "auto_detect") {
        options.auto_detect = BooleanValue::Get(value);
        return true;
    }
    if (loption == "sample_size") {
        auto arg = BigIntValue::Get(value);
        if (arg == -1) {
            options.sample_size = NumericLimits<idx_t>::Maximum();
        } else if (arg > 0) {
            options.sample_size = arg;
        } else {
            throw BinderException("read_json \"sample_size\" parameter must be positive, or -1 to sample all input "
                                  "files entirely, up to \"maximum_sample_files\" files.");
        }
        return true;
    }
    if (loption == "maximum_depth") {
        auto arg = BigIntValue::Get(value);
        if (arg == -1) {
            options.max_depth = NumericLimits<idx_t>::Maximum();
        } else {
            options.max_depth = arg;
        }
        return true;
    }
    if (loption == "field_appearance_threshold") {
        auto arg = DoubleValue::Get(value);
        if (arg < 0 || arg > 1) {
            throw BinderException("read_json_auto \"field_appearance_threshold\" parameter must be between 0 and 1");
        }
        options.field_appearance_threshold = arg;
        return true;
    }
    if (loption == "map_inference_threshold") {
        auto arg = BigIntValue::Get(value);
        if (arg == -1) {
            options.map_inference_threshold = NumericLimits<idx_t>::Maximum();
        } else if (arg >= 0) {
            options.map_inference_threshold = arg;
        } else {
            throw BinderException("read_json_auto \"map_inference_threshold\" parameter must be 0 or positive, "
                                  "or -1 to disable map inference for consistent objects.");
        }
        return true;
    }
    if (loption == "dateformat" || loption == "date_format") {
        auto format_string = StringValue::Get(value);
        if (StringUtil::Lower(format_string) == "iso") {
            format_string = "%Y-%m-%d";
        }
        options.date_format = format_string;

        StrpTimeFormat format;
        auto error = StrTimeFormat::ParseFormatSpecifier(format_string, format);
        if (!error.empty()) {
            throw BinderException("read_json could not parse \"dateformat\": '%s'.", error.c_str());
        }
        return true;
    }
    if (loption == "timestampformat" || loption == "timestamp_format") {
        auto format_string = StringValue::Get(value);
        if (StringUtil::Lower(format_string) == "iso") {
            format_string = "%Y-%m-%dT%H:%M:%S.%fZ";
        }
        options.timestamp_format = format_string;

        StrpTimeFormat format;
        auto error = StrTimeFormat::ParseFormatSpecifier(format_string, format);
        if (!error.empty()) {
            throw BinderException("read_json could not parse \"timestampformat\": '%s'.", error.c_str());
        }
        return true;
    }
    if (loption == "records") {
        auto arg = StringValue::Get(value);
        if (arg == "auto") {
            options.record_type = JSONRecordType::AUTO_DETECT;
        } else if (arg == "true") {
            options.record_type = JSONRecordType::RECORDS;
        } else if (arg == "false") {
            options.record_type = JSONRecordType::VALUES;
        } else {
            throw BinderException("read_json requires \"records\" to be one of ['auto', 'true', 'false'].");
        }
        return true;
    }
    if (loption == "maximum_sample_files") {
        auto arg = BigIntValue::Get(value);
        if (arg == -1) {
            options.maximum_sample_files = NumericLimits<idx_t>::Maximum();
        } else if (arg > 0) {
            options.maximum_sample_files = arg;
        } else {
            throw BinderException("read_json \"maximum_sample_files\" parameter must be positive, or -1 to remove "
                                  "the limit on the number of files used to sample \"sample_size\" rows.");
        }
        return true;
    }
    if (loption == "convert_strings_to_integers") {
        options.convert_strings_to_integers = BooleanValue::Get(value);
        return true;
    }
    return false;
}
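Several of the numeric options above (sample_size, maximum_depth, map_inference_threshold, maximum_sample_files) share one convention: the user passes a signed BIGINT where -1 means "unlimited", which is mapped to the maximum idx_t. A minimal standalone sketch of that convention; the ParseLimitOption helper name is hypothetical and not part of the extension:

#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string>

using idx_t = uint64_t;

// Hypothetical helper mirroring the -1 sentinel handling above:
// -1 -> "unlimited" (max idx_t), positive -> the value itself, anything else -> error.
idx_t ParseLimitOption(int64_t arg, const char *name) {
    if (arg == -1) {
        return std::numeric_limits<idx_t>::max();
    }
    if (arg > 0) {
        return static_cast<idx_t>(arg);
    }
    throw std::invalid_argument(std::string(name) + " must be positive, or -1 for no limit");
}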
static void JSONCheckSingleParameter(const string &key, const vector<Value> &values) {
    if (values.size() == 1) {
        return;
    }
    throw BinderException("COPY (FORMAT JSON) parameter %s expects a single argument.", key);
}

bool JSONMultiFileInfo::ParseCopyOption(ClientContext &context, const string &key, const vector<Value> &values,
                                        BaseFileReaderOptions &options_p, vector<string> &expected_names,
                                        vector<LogicalType> &expected_types) {
    auto &reader_options = options_p.Cast<JSONFileReaderOptions>();
    auto &options = reader_options.options;
    const auto &loption = StringUtil::Lower(key);
    if (loption == "dateformat" || loption == "date_format") {
        JSONCheckSingleParameter(key, values);
        options.date_format = StringValue::Get(values.back());
        return true;
    }
    if (loption == "timestampformat" || loption == "timestamp_format") {
        JSONCheckSingleParameter(key, values);
        options.timestamp_format = StringValue::Get(values.back());
        return true;
    }
    if (loption == "auto_detect") {
        if (values.empty()) {
            options.auto_detect = true;
        } else {
            JSONCheckSingleParameter(key, values);
            options.auto_detect = BooleanValue::Get(values.back().DefaultCastAs(LogicalTypeId::BOOLEAN));
            options.format = JSONFormat::NEWLINE_DELIMITED;
        }
        return true;
    }
    if (loption == "compression") {
        JSONCheckSingleParameter(key, values);
        options.compression =
            EnumUtil::FromString<FileCompressionType>(StringUtil::Upper(StringValue::Get(values.back())));
        return true;
    }
    if (loption == "array") {
        if (values.empty()) {
            options.format = JSONFormat::ARRAY;
        } else {
            JSONCheckSingleParameter(key, values);
            if (BooleanValue::Get(values.back().DefaultCastAs(LogicalTypeId::BOOLEAN))) {
                options.format = JSONFormat::ARRAY;
            } else {
                // Default to newline-delimited otherwise
                options.format = JSONFormat::NEWLINE_DELIMITED;
            }
        }
        return true;
    }
    return false;
}
unique_ptr<TableFunctionData> JSONMultiFileInfo::InitializeBindData(MultiFileBindData &multi_file_data,
                                                                    unique_ptr<BaseFileReaderOptions> options) {
    auto &reader_options = options->Cast<JSONFileReaderOptions>();
    auto json_data = make_uniq<JSONScanData>();
    json_data->options = std::move(reader_options.options);
    return std::move(json_data);
}

void JSONMultiFileInfo::BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
                                   MultiFileBindData &bind_data) {
    auto &json_data = bind_data.bind_data->Cast<JSONScanData>();

    auto &options = json_data.options;
    names = options.name_list;
    return_types = options.sql_type_list;
    if (options.record_type == JSONRecordType::AUTO_DETECT && return_types.size() > 1) {
        // More than one specified column implies records
        options.record_type = JSONRecordType::RECORDS;
    }

    // Specifying column names overrides auto-detect
    if (!return_types.empty()) {
        options.auto_detect = false;
    }

    if (!options.auto_detect) {
        // Need to specify columns if RECORDS and not auto-detecting
        if (return_types.empty()) {
            throw BinderException("When auto_detect=false, read_json requires columns to be specified through the "
                                  "\"columns\" parameter.");
        }
        // If we are reading VALUES, we can only have one column
        if (json_data.options.record_type == JSONRecordType::VALUES && return_types.size() != 1) {
            throw BinderException("read_json requires a single column to be specified through the \"columns\" "
                                  "parameter when \"records\" is set to 'false'.");
        }
    }

    json_data.InitializeFormats();

    if (options.auto_detect || options.record_type == JSONRecordType::AUTO_DETECT) {
        JSONScan::AutoDetect(context, bind_data, return_types, names);
        D_ASSERT(return_types.size() == names.size());
    }
    json_data.key_names = names;

    bind_data.multi_file_reader->BindOptions(bind_data.file_options, *bind_data.file_list, return_types, names,
                                             bind_data.reader_bind);

    auto &transform_options = json_data.transform_options;
    transform_options.strict_cast = !options.ignore_errors;
    transform_options.error_duplicate_key = !options.ignore_errors;
    transform_options.error_missing_key = false;
    transform_options.error_unknown_key = options.auto_detect && !options.ignore_errors;
    transform_options.date_format_map = json_data.date_format_map.get();
    transform_options.delay_error = true;

    if (options.auto_detect) {
        // JSON may contain columns such as "id" and "Id", which are duplicates for us due to case-insensitivity.
        // We rename them so we can parse the file anyway. Note that we can't change json_data.key_names,
        // because the JSON reader gets columns by exact name, not position.
        case_insensitive_map_t<idx_t> name_collision_count;
        for (auto &col_name : names) {
            // Taken from CSV header_detection.cpp
            while (name_collision_count.find(col_name) != name_collision_count.end()) {
                name_collision_count[col_name] += 1;
                col_name = col_name + "_" + to_string(name_collision_count[col_name]);
            }
            name_collision_count[col_name] = 0;
        }
    }
    bool reuse_readers = true;
    for (auto &union_reader : bind_data.union_readers) {
        if (!union_reader || !union_reader->reader) {
            // not all readers have been initialized - don't re-use any
            reuse_readers = false;
            break;
        }
        auto &json_reader = union_reader->reader->Cast<JSONReader>();
        if (!json_reader.IsOpen()) {
            // no open file handle - don't re-use this reader
            reuse_readers = false;
        }
    }
    if (!reuse_readers) {
        bind_data.union_readers.clear();
    } else {
        // re-use the existing readers
        for (auto &union_reader : bind_data.union_readers) {
            auto &json_reader = union_reader->reader->Cast<JSONReader>();
            union_reader->names = names;
            union_reader->types = return_types;
            union_reader->reader->columns = MultiFileColumnDefinition::ColumnsFromNamesAndTypes(names, return_types);
            json_reader.Reset();
        }
    }
}
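The collision-handling loop above is subtle: renaming a column can itself collide with a later column, which is why it re-checks inside a while loop rather than a single if. A standalone sketch of the same scheme, with the case-insensitive map approximated by lowercased keys; the names and output are illustrative, not the extension's API:

#include <algorithm>
#include <cctype>
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

std::string Lower(std::string s) {
    std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return std::tolower(c); });
    return s;
}

int main() {
    // "id" and "Id" collide under case-insensitive comparison
    std::vector<std::string> names {"id", "Id", "value"};
    std::unordered_map<std::string, uint64_t> collision_count; // keyed by lowercased name
    for (auto &name : names) {
        while (collision_count.find(Lower(name)) != collision_count.end()) {
            collision_count[Lower(name)] += 1;
            name += "_" + std::to_string(collision_count[Lower(name)]);
        }
        collision_count[Lower(name)] = 0;
    }
    for (auto &name : names) {
        std::cout << name << "\n"; // prints: id, Id_1, value
    }
}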
void JSONMultiFileInfo::FinalizeCopyBind(ClientContext &context, BaseFileReaderOptions &options_p,
                                         const vector<string> &expected_names,
                                         const vector<LogicalType> &expected_types) {
    auto &reader_options = options_p.Cast<JSONFileReaderOptions>();
    auto &options = reader_options.options;
    options.name_list = expected_names;
    options.sql_type_list = expected_types;
    if (options.auto_detect && options.format != JSONFormat::ARRAY) {
        options.format = JSONFormat::AUTO_DETECT;
    }
}

unique_ptr<GlobalTableFunctionState> JSONMultiFileInfo::InitializeGlobalState(ClientContext &context,
                                                                              MultiFileBindData &bind_data,
                                                                              MultiFileGlobalState &global_state) {
    auto json_state = make_uniq<JSONGlobalTableFunctionState>(context, bind_data);
    auto &json_data = bind_data.bind_data->Cast<JSONScanData>();

    auto &gstate = json_state->state;
    // Perform projection pushdown
    for (idx_t col_idx = 0; col_idx < global_state.column_indexes.size(); col_idx++) {
        auto &column_index = global_state.column_indexes[col_idx];
        const auto &col_id = column_index.GetPrimaryIndex();

        // Skip any multi-file reader / row id stuff
        if (bind_data.reader_bind.filename_idx.IsValid() && col_id == bind_data.reader_bind.filename_idx.GetIndex()) {
            continue;
        }
        if (IsVirtualColumn(col_id)) {
            continue;
        }
        bool skip = false;
        for (const auto &hive_partitioning_index : bind_data.reader_bind.hive_partitioning_indexes) {
            if (col_id == hive_partitioning_index.index) {
                skip = true;
                break;
            }
        }
        if (skip) {
            continue;
        }

        gstate.names.push_back(json_data.key_names[col_id]);
        gstate.column_ids.push_back(col_idx);
        gstate.column_indices.push_back(column_index);
    }
    if (gstate.names.size() < json_data.key_names.size() || bind_data.file_options.union_by_name) {
        // If we are auto-detecting, but don't need all columns present in the file,
        // then we don't need to throw an error if we encounter an unseen column
        gstate.transform_options.error_unknown_key = false;
    }
    return std::move(json_state);
}
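Projection pushdown here amounts to translating the columns the query actually needs into reader-local key names, while skipping synthetic columns (filename, row id, hive partitions) that the multi-file layer produces itself. A toy version of that filtering, with illustrative names only:

#include <cstdint>
#include <string>
#include <unordered_set>
#include <vector>

struct Pushdown {
    std::vector<std::string> names;   // key names the reader must extract
    std::vector<uint64_t> column_ids; // positions in the query's output chunk
};

// Keep only real file columns; synthetic ones are filled in elsewhere.
Pushdown ComputePushdown(const std::vector<std::string> &key_names,
                         const std::vector<uint64_t> &requested,
                         const std::unordered_set<uint64_t> &synthetic) {
    Pushdown result;
    for (uint64_t out_idx = 0; out_idx < requested.size(); out_idx++) {
        auto col_id = requested[out_idx];
        if (synthetic.count(col_id)) {
            continue; // filename / row id / hive partition column
        }
        result.names.push_back(key_names[col_id]);
        result.column_ids.push_back(out_idx);
    }
    return result;
}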
unique_ptr<LocalTableFunctionState> JSONMultiFileInfo::InitializeLocalState(ExecutionContext &context,
                                                                            GlobalTableFunctionState &global_state) {
    auto &gstate = global_state.Cast<JSONGlobalTableFunctionState>();
    auto result = make_uniq<JSONLocalTableFunctionState>(context.client, gstate.state);

    // Copy the transform options / date format map because we need to do thread-local stuff
    result->state.transform_options = gstate.state.transform_options;

    return std::move(result);
}

double JSONReader::GetProgressInFile(ClientContext &context) {
    return GetProgress();
}

shared_ptr<BaseFileReader> JSONMultiFileInfo::CreateReader(ClientContext &context, GlobalTableFunctionState &gstate_p,
                                                           BaseUnionData &union_data,
                                                           const MultiFileBindData &bind_data_p) {
    auto &json_data = bind_data_p.bind_data->Cast<JSONScanData>();
    auto reader = make_shared_ptr<JSONReader>(context, json_data.options, union_data.GetFileName());
    reader->columns = MultiFileColumnDefinition::ColumnsFromNamesAndTypes(union_data.names, union_data.types);
    return std::move(reader);
}

shared_ptr<BaseFileReader> JSONMultiFileInfo::CreateReader(ClientContext &context, GlobalTableFunctionState &gstate_p,
                                                           const OpenFileInfo &file, idx_t file_idx,
                                                           const MultiFileBindData &bind_data) {
    auto &json_data = bind_data.bind_data->Cast<JSONScanData>();
    auto reader = make_shared_ptr<JSONReader>(context, json_data.options, file.path);
    reader->columns = MultiFileColumnDefinition::ColumnsFromNamesAndTypes(bind_data.names, bind_data.types);
    return std::move(reader);
}

void JSONReader::PrepareReader(ClientContext &context, GlobalTableFunctionState &gstate_p) {
    auto &gstate = gstate_p.Cast<JSONGlobalTableFunctionState>().state;
    if (gstate.enable_parallel_scans) {
        // if we are doing parallel scans we need to open the file here
        Initialize(gstate.allocator, gstate.buffer_capacity);
    }
}

bool JSONReader::TryInitializeScan(ClientContext &context, GlobalTableFunctionState &gstate_p,
                                   LocalTableFunctionState &lstate_p) {
    auto &gstate = gstate_p.Cast<JSONGlobalTableFunctionState>().state;
    auto &lstate = lstate_p.Cast<JSONLocalTableFunctionState>().state;

    lstate.GetScanState().ResetForNextBuffer();
    return lstate.TryInitializeScan(gstate, *this);
}
void ReadJSONFunction(ClientContext &context, JSONReader &json_reader, JSONScanGlobalState &gstate,
                      JSONScanLocalState &lstate, DataChunk &output) {
    auto &scan_state = lstate.GetScanState();
    D_ASSERT(RefersToSameObject(json_reader, *scan_state.current_reader));

    const auto count = lstate.Read();
    yyjson_val **values = scan_state.values;

    auto &column_ids = json_reader.column_ids;
    if (!gstate.names.empty()) {
        vector<Vector *> result_vectors;
        result_vectors.reserve(column_ids.size());
        for (idx_t i = 0; i < column_ids.size(); i++) {
            result_vectors.emplace_back(&output.data[i]);
        }

        D_ASSERT(gstate.json_data.options.record_type != JSONRecordType::AUTO_DETECT);
        bool success;
        if (gstate.json_data.options.record_type == JSONRecordType::RECORDS) {
            success = JSONTransform::TransformObject(values, scan_state.allocator.GetYYAlc(), count, gstate.names,
                                                     result_vectors, lstate.transform_options, gstate.column_indices,
                                                     lstate.transform_options.error_unknown_key);
        } else {
            D_ASSERT(gstate.json_data.options.record_type == JSONRecordType::VALUES);
            success = JSONTransform::Transform(values, scan_state.allocator.GetYYAlc(), *result_vectors[0], count,
                                               lstate.transform_options, gstate.column_indices[0]);
        }

        if (!success) {
            string hint =
                gstate.json_data.options.auto_detect
                    ? "\nTry increasing 'sample_size', reducing 'maximum_depth', specifying 'columns', 'format' or "
                      "'records' manually, setting 'ignore_errors' to true, or setting 'union_by_name' to true when "
                      "reading multiple files with a different structure."
                    : "\nTry setting 'auto_detect' to true, specifying 'format' or 'records' manually, or setting "
                      "'ignore_errors' to true.";
            lstate.AddTransformError(lstate.transform_options.object_index,
                                     lstate.transform_options.error_message + hint);
            return;
        }
    }
    output.SetCardinality(count);
}
void ReadJSONObjectsFunction(ClientContext &context, JSONReader &json_reader, JSONScanGlobalState &gstate,
                             JSONScanLocalState &lstate, DataChunk &output) {
    // Fetch the next batch of JSON units
    auto &scan_state = lstate.GetScanState();
    D_ASSERT(RefersToSameObject(json_reader, *scan_state.current_reader));

    const auto count = lstate.Read();
    const auto units = scan_state.units;
    const auto objects = scan_state.values;

    if (!gstate.names.empty()) {
        // Create the strings without copying them: each string_t points into the scan buffer
        auto strings = FlatVector::GetData<string_t>(output.data[0]);
        auto &validity = FlatVector::Validity(output.data[0]);
        for (idx_t i = 0; i < count; i++) {
            if (objects[i]) {
                strings[i] = string_t(units[i].pointer, units[i].size);
            } else {
                validity.SetInvalid(i);
            }
        }
    }

    output.SetCardinality(count);
}
void JSONReader::Scan(ClientContext &context, GlobalTableFunctionState &global_state,
                      LocalTableFunctionState &local_state, DataChunk &output) {
    auto &gstate = global_state.Cast<JSONGlobalTableFunctionState>().state;
    auto &lstate = local_state.Cast<JSONLocalTableFunctionState>().state;
    auto &json_data = gstate.bind_data.bind_data->Cast<JSONScanData>();
    switch (json_data.options.type) {
    case JSONScanType::READ_JSON:
        ReadJSONFunction(context, *this, gstate, lstate, output);
        break;
    case JSONScanType::READ_JSON_OBJECTS:
        ReadJSONObjectsFunction(context, *this, gstate, lstate, output);
        break;
    default:
        throw InternalException("Unsupported scan type for JSONMultiFileInfo::Scan");
    }
}

void JSONReader::FinishFile(ClientContext &context, GlobalTableFunctionState &global_state) {
    auto &gstate = global_state.Cast<JSONGlobalTableFunctionState>().state;
    gstate.file_is_assigned = false;
}

void JSONMultiFileInfo::FinishReading(ClientContext &context, GlobalTableFunctionState &global_state,
                                      LocalTableFunctionState &local_state) {
    auto &lstate = local_state.Cast<JSONLocalTableFunctionState>().state;
    lstate.GetScanState().ResetForNextBuffer();
}

unique_ptr<NodeStatistics> JSONMultiFileInfo::GetCardinality(const MultiFileBindData &bind_data, idx_t file_count) {
    auto &json_data = bind_data.bind_data->Cast<JSONScanData>();
    idx_t per_file_cardinality = 42;
    // get the average per-file cardinality from the bind data (if it is set)
    if (json_data.estimated_cardinality_per_file.IsValid()) {
        per_file_cardinality = json_data.estimated_cardinality_per_file.GetIndex();
    }
    return make_uniq<NodeStatistics>(per_file_cardinality * file_count);
}

optional_idx JSONMultiFileInfo::MaxThreads(const MultiFileBindData &bind_data, const MultiFileGlobalState &global_state,
                                           FileExpandResult expand_result) {
    if (expand_result == FileExpandResult::MULTIPLE_FILES) {
        return optional_idx();
    }
    // get the max threads from the bind data (if it is set)
    auto &json_data = bind_data.bind_data->Cast<JSONScanData>();
    return json_data.max_threads;
}

FileGlobInput JSONMultiFileInfo::GetGlobInput() {
    return FileGlobInput(FileGlobOptions::FALLBACK_GLOB, "json");
}

} // namespace duckdb
1101
external/duckdb/extension/json/json_reader.cpp
vendored
Normal file
File diff suppressed because it is too large
128
external/duckdb/extension/json/json_scan.cpp
vendored
Normal file
@@ -0,0 +1,128 @@
#include "json_scan.hpp"

#include "duckdb/common/enum_util.hpp"
#include "duckdb/common/multi_file/multi_file_reader.hpp"
#include "duckdb/common/serializer/deserializer.hpp"
#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/main/extension_helper.hpp"
#include "duckdb/parallel/task_scheduler.hpp"
#include "duckdb/storage/buffer_manager.hpp"
#include "json_multi_file_info.hpp"

namespace duckdb {

JSONScanData::JSONScanData() {
}

void JSONScanData::InitializeFormats() {
    InitializeFormats(options.auto_detect);
}

void JSONScanData::InitializeFormats(bool auto_detect_p) {
    type_id_map_t<vector<StrpTimeFormat>> candidate_formats;
    // Initialize date_format_map if anything was specified
    if (!options.date_format.empty()) {
        DateFormatMap::AddFormat(candidate_formats, LogicalTypeId::DATE, options.date_format);
    }
    if (!options.timestamp_format.empty()) {
        DateFormatMap::AddFormat(candidate_formats, LogicalTypeId::TIMESTAMP, options.timestamp_format);
    }

    if (auto_detect_p) {
        static const type_id_map_t<vector<const char *>> FORMAT_TEMPLATES = {
            {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
            {LogicalTypeId::TIMESTAMP,
             {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
              "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"}},
        };

        // Populate possible date/timestamp formats, assume this is consistent across columns
        for (auto &kv : FORMAT_TEMPLATES) {
            const auto &logical_type = kv.first;
            if (DateFormatMap::HasFormats(candidate_formats, logical_type)) {
                continue; // Already populated
            }
            const auto &format_strings = kv.second;
            for (auto &format_string : format_strings) {
                DateFormatMap::AddFormat(candidate_formats, logical_type, format_string);
            }
        }
    }
    date_format_map = make_uniq<DateFormatMap>(std::move(candidate_formats));
}
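The candidate formats above feed auto-detection: during sampling, formats that fail to parse a value can be eliminated, and a surviving format is used for the column. A minimal standalone illustration of that elimination idea; the helper is an illustrative assumption, not the extension's actual detection code, and it uses std::get_time as a stand-in for DuckDB's StrpTimeFormat (which, unlike get_time, also understands fractional seconds via %f):

#include <iomanip>
#include <sstream>
#include <string>
#include <vector>

// Keep only the formats that successfully parse every sampled value.
std::vector<std::string> EliminateCandidates(const std::vector<std::string> &formats,
                                             const std::vector<std::string> &samples) {
    std::vector<std::string> surviving;
    for (const auto &fmt : formats) {
        bool ok = true;
        for (const auto &sample : samples) {
            std::tm tm {};
            std::istringstream stream(sample);
            stream >> std::get_time(&tm, fmt.c_str());
            if (stream.fail()) { // format does not match this value
                ok = false;
                break;
            }
        }
        if (ok) {
            surviving.push_back(fmt);
        }
    }
    return surviving;
}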
JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, const MultiFileBindData &bind_data_p)
    : bind_data(bind_data_p), json_data(bind_data.bind_data->Cast<JSONScanData>()),
      transform_options(json_data.transform_options), allocator(BufferAllocator::Get(context)),
      buffer_capacity(json_data.options.maximum_object_size * 2),
      system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()),
      enable_parallel_scans(bind_data.file_list->GetTotalFileCount() < system_threads) {
}

JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
    : scan_state(context, gstate.allocator, gstate.buffer_capacity) {
}

JSONGlobalTableFunctionState::JSONGlobalTableFunctionState(ClientContext &context, const MultiFileBindData &bind_data)
    : state(context, bind_data) {
}

JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate)
    : state(context, gstate) {
}

idx_t JSONScanLocalState::Read() {
    return scan_state.current_reader->Scan(scan_state);
}

void JSONScanLocalState::ParseJSON(char *const json_start, const idx_t json_size, const idx_t remaining) {
    scan_state.current_reader->ParseJSON(scan_state, json_start, json_size, remaining);
}

bool JSONScanLocalState::TryInitializeScan(JSONScanGlobalState &gstate, JSONReader &reader) {
    // try to initialize a scan in the given reader
    // three scenarios:
    // scenario 1 - unseekable file   - read from the file and set up the buffers
    // scenario 2 - seekable file     - get the position in the file to read and return
    // scenario 3 - entire-file reads - if we are reading an entire file at once, do nothing here
    //                                  except setting up the basics
    auto read_type = JSONFileReadType::SCAN_PARTIAL;
    if (!gstate.enable_parallel_scans || reader.GetFormat() != JSONFormat::NEWLINE_DELIMITED) {
        read_type = JSONFileReadType::SCAN_ENTIRE_FILE;
    }
    if (read_type == JSONFileReadType::SCAN_ENTIRE_FILE) {
        if (gstate.file_is_assigned) {
            // only one thread may scan a file that is read in its entirety
            return false;
        }
        gstate.file_is_assigned = true;
    }
    return reader.InitializeScan(scan_state, read_type);
}

void JSONScanLocalState::AddTransformError(idx_t object_index, const string &error_message) {
    scan_state.current_reader->AddTransformError(scan_state, object_index, error_message);
}
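Two sizing decisions above are worth spelling out. The scan buffer is twice the maximum object size, presumably so that a complete object can still be assembled when it straddles a read boundary. And intra-file parallelism is only enabled when there are fewer files than threads; otherwise inter-file parallelism already keeps every thread busy. A worked check of the second rule, with illustrative values:

#include <cstdint>

// enable_parallel_scans = total_file_count < system_threads
bool EnableParallelScans(uint64_t total_file_count, uint64_t system_threads) {
    return total_file_count < system_threads;
}

// With 8 worker threads: 3 files -> true (spare threads can split files),
// 8 or more files -> false (one file per thread is already enough work).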
void JSONScan::Serialize(Serializer &serializer, const optional_ptr<FunctionData> bind_data_p, const TableFunction &) {
    throw NotImplementedException("JSONScan Serialize not implemented");
}

unique_ptr<FunctionData> JSONScan::Deserialize(Deserializer &deserializer, TableFunction &) {
    throw NotImplementedException("JSONScan Deserialize not implemented");
}

void JSONScan::TableFunctionDefaults(TableFunction &table_function) {
    table_function.named_parameters["maximum_object_size"] = LogicalType::UINTEGER;
    table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
    table_function.named_parameters["format"] = LogicalType::VARCHAR;
    table_function.named_parameters["compression"] = LogicalType::VARCHAR;

    table_function.serialize = Serialize;
    table_function.deserialize = Deserialize;

    table_function.projection_pushdown = true;
    table_function.filter_pushdown = false;
    table_function.filter_prune = false;
}

} // namespace duckdb
219
external/duckdb/extension/json/json_serializer.cpp
vendored
Normal file
@@ -0,0 +1,219 @@
#include "json_serializer.hpp"
#include "duckdb/common/types/blob.hpp"

namespace duckdb {

void JsonSerializer::PushValue(yyjson_mut_val *val) {
    auto current = Current();
    // Array case, just append the value
    if (yyjson_mut_is_arr(current)) {
        yyjson_mut_arr_append(current, val);
    }
    // Object case, use the currently set tag.
    else if (yyjson_mut_is_obj(current)) {
        yyjson_mut_obj_add(current, current_tag, val);
    }
    // Else throw
    else {
        throw InternalException("Cannot add value to non-array/object json value");
    }
}

void JsonSerializer::OnPropertyBegin(const field_id_t, const char *tag) {
    current_tag = yyjson_mut_strcpy(doc, tag);
}

void JsonSerializer::OnPropertyEnd() {
}

void JsonSerializer::OnOptionalPropertyBegin(const field_id_t, const char *tag, bool) {
    current_tag = yyjson_mut_strcpy(doc, tag);
}

void JsonSerializer::OnOptionalPropertyEnd(bool) {
}

//-------------------------------------------------------------------------
// Nested Types
//-------------------------------------------------------------------------
void JsonSerializer::OnNullableBegin(bool present) {
    if (!present && !skip_if_null) {
        WriteNull();
    }
}

void JsonSerializer::OnNullableEnd() {
}

void JsonSerializer::OnListBegin(idx_t count) {
    auto new_value = yyjson_mut_arr(doc);
    // We always push a value to the stack, we just don't add it as a child to the current value
    // if skipping empty. Even though it is "unnecessary" to create an empty value just to discard it,
    // this allows the rest of the code to proceed as normal.
    if (!(count == 0 && skip_if_empty)) {
        PushValue(new_value);
    }
    stack.push_back(new_value);
}

void JsonSerializer::OnListEnd() {
    stack.pop_back();
}

void JsonSerializer::OnObjectBegin() {
    auto new_value = yyjson_mut_obj(doc);
    PushValue(new_value);
    stack.push_back(new_value);
}

void JsonSerializer::OnObjectEnd() {
    auto obj = Current();
    auto count = yyjson_mut_obj_size(obj);

    stack.pop_back();

    if (count == 0 && skip_if_empty && !stack.empty()) {
        // remove obj from its parent since it was empty
        auto parent = Current();
        if (yyjson_mut_is_arr(parent)) {
            size_t idx;
            size_t max;
            yyjson_mut_val *item;
            size_t found = 0; // initialized defensively; obj was added to parent in OnObjectBegin
            yyjson_mut_arr_foreach(parent, idx, max, item) {
                if (item == obj) {
                    found = idx;
                }
            }
            yyjson_mut_arr_remove(parent, found);
        } else if (yyjson_mut_is_obj(parent)) {
            size_t idx;
            size_t max;
            yyjson_mut_val *item;
            yyjson_mut_val *key;
            const char *found = nullptr; // initialized defensively, as above
            yyjson_mut_obj_foreach(parent, idx, max, key, item) {
                if (item == obj) {
                    found = yyjson_mut_get_str(key);
                }
            }
            yyjson_mut_obj_remove_key(parent, found);
        }
    }
}
//-------------------------------------------------------------------------
// Primitive Types
//-------------------------------------------------------------------------
void JsonSerializer::WriteNull() {
    if (skip_if_null) {
        return;
    }
    auto val = yyjson_mut_null(doc);
    PushValue(val);
}

void JsonSerializer::WriteValue(uint8_t value) {
    auto val = yyjson_mut_uint(doc, value);
    PushValue(val);
}

void JsonSerializer::WriteValue(int8_t value) {
    auto val = yyjson_mut_sint(doc, value);
    PushValue(val);
}

void JsonSerializer::WriteValue(uint16_t value) {
    auto val = yyjson_mut_uint(doc, value);
    PushValue(val);
}

void JsonSerializer::WriteValue(int16_t value) {
    auto val = yyjson_mut_sint(doc, value);
    PushValue(val);
}

void JsonSerializer::WriteValue(uint32_t value) {
    auto val = yyjson_mut_uint(doc, value);
    PushValue(val);
}

void JsonSerializer::WriteValue(int32_t value) {
    auto val = yyjson_mut_sint(doc, value);
    PushValue(val);
}

void JsonSerializer::WriteValue(uint64_t value) {
    auto val = yyjson_mut_uint(doc, value);
    PushValue(val);
}

void JsonSerializer::WriteValue(int64_t value) {
    auto val = yyjson_mut_sint(doc, value);
    PushValue(val);
}

void JsonSerializer::WriteValue(hugeint_t value) {
    auto val = yyjson_mut_obj(doc);
    PushValue(val);
    stack.push_back(val);
    WriteProperty(100, "upper", value.upper);
    WriteProperty(101, "lower", value.lower);
    stack.pop_back();
}

void JsonSerializer::WriteValue(uhugeint_t value) {
    auto val = yyjson_mut_obj(doc);
    PushValue(val);
    stack.push_back(val);
    WriteProperty(100, "upper", value.upper);
    WriteProperty(101, "lower", value.lower);
    stack.pop_back();
}
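hugeint_t and uhugeint_t are 128-bit values, wider than any JSON number, so they are written as an object holding the two 64-bit halves; the value is reconstructed as upper * 2^64 + lower. A small sketch of the reassembly, under the assumption that the compiler provides __int128 (GCC and Clang do):

#include <cstdint>

// Reassemble a 128-bit value from the {"upper": ..., "lower": ...} halves
// written above. The unsigned result reinterprets the signed case bit-for-bit.
unsigned __int128 Reassemble(int64_t upper, uint64_t lower) {
    return (static_cast<unsigned __int128>(static_cast<uint64_t>(upper)) << 64) |
           static_cast<unsigned __int128>(lower);
}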
void JsonSerializer::WriteValue(float value) {
    auto val = yyjson_mut_real(doc, value);
    PushValue(val);
}

void JsonSerializer::WriteValue(double value) {
    auto val = yyjson_mut_real(doc, value);
    PushValue(val);
}

void JsonSerializer::WriteValue(const string &value) {
    if (skip_if_empty && value.empty()) {
        return;
    }
    auto val = yyjson_mut_strncpy(doc, value.c_str(), value.size());
    PushValue(val);
}

void JsonSerializer::WriteValue(const string_t value) {
    if (skip_if_empty && value.GetSize() == 0) {
        return;
    }
    auto val = yyjson_mut_strncpy(doc, value.GetData(), value.GetSize());
    PushValue(val);
}

void JsonSerializer::WriteValue(const char *value) {
    if (skip_if_empty && strlen(value) == 0) {
        return;
    }
    auto val = yyjson_mut_strcpy(doc, value);
    PushValue(val);
}

void JsonSerializer::WriteValue(bool value) {
    auto val = yyjson_mut_bool(doc, value);
    PushValue(val);
}

void JsonSerializer::WriteDataPtr(const_data_ptr_t ptr, idx_t count) {
    auto blob = Blob::ToString(string_t(const_char_ptr_cast(ptr), count));
    auto val = yyjson_mut_strcpy(doc, blob.c_str());
    PushValue(val);
}

} // namespace duckdb
30
external/duckdb/extension/json/serialize_json.cpp
vendored
Normal file
@@ -0,0 +1,30 @@
//===----------------------------------------------------------------------===//
// This file is automatically generated by scripts/generate_serialization.py
// Do not edit this file manually, your changes will be overwritten
//===----------------------------------------------------------------------===//

#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/common/serializer/deserializer.hpp"
#include "json_transform.hpp"

namespace duckdb {

void JSONTransformOptions::Serialize(Serializer &serializer) const {
    serializer.WritePropertyWithDefault<bool>(100, "strict_cast", strict_cast);
    serializer.WritePropertyWithDefault<bool>(101, "error_duplicate_key", error_duplicate_key);
    serializer.WritePropertyWithDefault<bool>(102, "error_missing_key", error_missing_key);
    serializer.WritePropertyWithDefault<bool>(103, "error_unknown_key", error_unknown_key);
    serializer.WritePropertyWithDefault<bool>(104, "delay_error", delay_error);
}

JSONTransformOptions JSONTransformOptions::Deserialize(Deserializer &deserializer) {
    JSONTransformOptions result;
    deserializer.ReadPropertyWithDefault<bool>(100, "strict_cast", result.strict_cast);
    deserializer.ReadPropertyWithDefault<bool>(101, "error_duplicate_key", result.error_duplicate_key);
    deserializer.ReadPropertyWithDefault<bool>(102, "error_missing_key", result.error_missing_key);
    deserializer.ReadPropertyWithDefault<bool>(103, "error_unknown_key", result.error_unknown_key);
    deserializer.ReadPropertyWithDefault<bool>(104, "delay_error", result.delay_error);
    return result;
}

} // namespace duckdb
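The generated pair above writes each member under a stable numeric field id (100 through 104, matching include/json.json), so fields can be added, reordered, or defaulted later without breaking previously serialized data. A minimal standalone sketch of the id-tagged key/value idea; this toy serializer is an illustration only, not DuckDB's Serializer API:

#include <cstdint>
#include <map>

using field_id_t = uint16_t;

// Toy id-keyed serializer: values are stored by field id, so a reader that
// knows about more (or fewer) fields than the writer can still round-trip.
struct ToySerializer {
    std::map<field_id_t, bool> fields;
    void WritePropertyWithDefault(field_id_t id, const char *, bool value) {
        fields[id] = value;
    }
    bool ReadPropertyWithDefault(field_id_t id, const char *, bool default_value) const {
        auto it = fields.find(id);
        return it == fields.end() ? default_value : it->second;
    }
};

int main() {
    ToySerializer s;
    s.WritePropertyWithDefault(100, "strict_cast", true);
    // Field 105 was never written, e.g. it was added in a newer version:
    bool strict_cast = s.ReadPropertyWithDefault(100, "strict_cast", false); // true
    bool new_field = s.ReadPropertyWithDefault(105, "new_field", false);     // falls back to default
    return strict_cast && !new_field ? 0 : 1;
}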
Block a user