389 lines
13 KiB
C++
389 lines
13 KiB
C++
//===----------------------------------------------------------------------===//
|
|
// DuckDB
|
|
//
|
|
// json_common.hpp
|
|
//
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#pragma once
|
|
|
|
#include "duckdb/common/operator/cast_operators.hpp"
|
|
#include "duckdb/common/operator/decimal_cast_operators.hpp"
|
|
#include "duckdb/common/operator/string_cast.hpp"
|
|
#include "duckdb/planner/expression/bound_function_expression.hpp"
|
|
#include "yyjson.hpp"
|
|
#include "duckdb/common/types/blob.hpp"
|
|
|
|
using namespace duckdb_yyjson; // NOLINT
|
|
|
|
namespace duckdb {
|
|
|
|
class JSONAllocator;
|
|
|
|
class JSONStringVectorBuffer : public VectorBuffer {
|
|
public:
|
|
explicit JSONStringVectorBuffer(shared_ptr<JSONAllocator> allocator_p)
|
|
: VectorBuffer(VectorBufferType::OPAQUE_BUFFER), allocator(std::move(allocator_p)) {
|
|
}
|
|
|
|
private:
|
|
shared_ptr<JSONAllocator> allocator;
|
|
};
|
|
|
|
//! JSON allocator is a custom allocator for yyjson that prevents many tiny allocations
|
|
class JSONAllocator : public enable_shared_from_this<JSONAllocator> {
public:
	//! Creates the arena on top of 'allocator' and wires the yyjson callbacks back to this object via 'ctx'
	explicit JSONAllocator(Allocator &allocator)
	    : arena_allocator(allocator), yyjson_allocator({Allocate, Reallocate, Free, this}) {
	}

	//! The yyjson_alc whose malloc/realloc/free callbacks are served by this object
	inline yyjson_alc *GetYYAlc() {
		return &yyjson_allocator;
	}

	//! Resets the underlying arena allocator
	void Reset() {
		arena_allocator.Reset();
	}

	//! Attaches a JSONStringVectorBuffer that references this allocator to 'vector'.
	//! Only vectors with physical type VARCHAR are touched, all others are left as-is.
	void AddBuffer(Vector &vector) {
		if (vector.GetType().InternalType() != PhysicalType::VARCHAR) {
			return;
		}
		StringVector::AddBuffer(vector, make_buffer<JSONStringVectorBuffer>(shared_from_this()));
	}

	//! Same as the member overload, with the JSONAllocator taken from 'alc->ctx'
	//! (assumes 'alc' was obtained from JSONAllocator::GetYYAlc)
	static void AddBuffer(Vector &vector, yyjson_alc *alc) {
		FromContext(alc->ctx)->AddBuffer(vector);
	}

private:
	//! Converts the opaque yyjson callback context back into the allocator that registered it
	static inline JSONAllocator *FromContext(void *ctx) {
		return static_cast<JSONAllocator *>(ctx);
	}

	//! yyjson 'malloc' callback
	static inline void *Allocate(void *ctx, size_t size) {
		auto &arena = FromContext(ctx)->arena_allocator;
		return arena.AllocateAligned(size);
	}

	//! yyjson 'realloc' callback
	static inline void *Reallocate(void *ctx, void *ptr, size_t old_size, size_t size) {
		auto &arena = FromContext(ctx)->arena_allocator;
		return arena.ReallocateAligned(data_ptr_cast(ptr), old_size, size);
	}

	//! yyjson 'free' callback
	static inline void Free(void *, void *) {
		// NOP because ArenaAllocator can't free
	}

private:
	//! Arena that backs all allocations made through the yyjson callbacks
	ArenaAllocator arena_allocator;
	//! Callback table handed to yyjson, 'ctx' points at this object
	yyjson_alc yyjson_allocator;
};
|
|
|
|
//! JSONKey / json_key_map_t speeds up mapping from JSON key to column ID
|
|
struct JSONKey {
	//! Pointer to the first byte of the key (not owned by this struct)
	const char *ptr;
	//! Length of the key in bytes
	size_t len;
};
|
|
|
|
struct JSONKeyHash {
|
|
inline std::size_t operator()(const JSONKey &k) const {
|
|
size_t result;
|
|
if (k.len >= sizeof(size_t)) {
|
|
memcpy(&result, k.ptr + k.len - sizeof(size_t), sizeof(size_t));
|
|
} else {
|
|
result = 0;
|
|
FastMemcpy(&result, k.ptr, k.len);
|
|
}
|
|
return result;
|
|
}
|
|
};
|
|
|
|
struct JSONKeyEquality {
|
|
inline bool operator()(const JSONKey &a, const JSONKey &b) const {
|
|
if (a.len != b.len) {
|
|
return false;
|
|
}
|
|
return FastMemcmp(a.ptr, b.ptr, a.len) == 0;
|
|
}
|
|
};
|
|
|
|
template <typename T>
|
|
using json_key_map_t = unordered_map<JSONKey, T, JSONKeyHash, JSONKeyEquality>;
|
|
//! Hash set of JSONKey, hashed with JSONKeyHash and compared with JSONKeyEquality
using json_key_set_t = unordered_set<JSONKey, JSONKeyHash, JSONKeyEquality>;
|
|
|
|
//! Common JSON functionality for most JSON functions
|
|
struct JSONCommon {
|
|
public:
|
|
//! Read/Write flags
|
|
static constexpr auto READ_FLAG =
|
|
YYJSON_READ_ALLOW_INF_AND_NAN | YYJSON_READ_ALLOW_TRAILING_COMMAS | YYJSON_READ_BIGNUM_AS_RAW;
|
|
static constexpr auto READ_STOP_FLAG = READ_FLAG | YYJSON_READ_STOP_WHEN_DONE;
|
|
static constexpr auto READ_INSITU_FLAG = READ_STOP_FLAG | YYJSON_READ_INSITU;
|
|
static constexpr auto WRITE_FLAG = YYJSON_WRITE_ALLOW_INF_AND_NAN;
|
|
static constexpr auto WRITE_PRETTY_FLAG = YYJSON_WRITE_ALLOW_INF_AND_NAN | YYJSON_WRITE_PRETTY;
|
|
|
|
public:
|
|
//! Constant JSON type strings
|
|
// Names returned by ValTypeToString (TYPE_STRING_HUGEINT is not returned by it)
static constexpr const char *TYPE_STRING_NULL = "NULL";
static constexpr const char *TYPE_STRING_BOOLEAN = "BOOLEAN";
static constexpr const char *TYPE_STRING_BIGINT = "BIGINT";
static constexpr const char *TYPE_STRING_UBIGINT = "UBIGINT";
static constexpr const char *TYPE_STRING_DOUBLE = "DOUBLE";
static constexpr const char *TYPE_STRING_HUGEINT = "HUGEINT";
static constexpr const char *TYPE_STRING_VARCHAR = "VARCHAR";
static constexpr const char *TYPE_STRING_ARRAY = "ARRAY";
static constexpr const char *TYPE_STRING_OBJECT = "OBJECT";
|
|
|
|
//! Maps the yyjson tag (type | subtype) of 'val' to one of the TYPE_STRING_* constants.
//! Throws an InternalException for any tag that is not handled below.
static inline const char *ValTypeToString(yyjson_val *val) {
	const char *type_name = nullptr;
	switch (yyjson_get_tag(val)) {
	case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
		type_name = TYPE_STRING_NULL;
		break;
	case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_TRUE:
	case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_FALSE:
		type_name = TYPE_STRING_BOOLEAN;
		break;
	case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_SINT:
		type_name = TYPE_STRING_BIGINT;
		break;
	case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_UINT:
		type_name = TYPE_STRING_UBIGINT;
		break;
	// Raw values are reported as DOUBLE, just like reals
	case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_REAL:
	case YYJSON_TYPE_RAW | YYJSON_SUBTYPE_NONE:
		type_name = TYPE_STRING_DOUBLE;
		break;
	case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NOESC:
	case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
		type_name = TYPE_STRING_VARCHAR;
		break;
	case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
		type_name = TYPE_STRING_ARRAY;
		break;
	case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
		type_name = TYPE_STRING_OBJECT;
		break;
	default:
		throw InternalException("Unexpected yyjson tag in ValTypeToString");
	}
	return type_name;
}
|
|
|
|
//! Same as ValTypeToString, with the type name wrapped in a string_t
static inline string_t ValTypeToStringT(yyjson_val *val) {
	const char *type_name = ValTypeToString(val);
	return string_t(type_name);
}
|
|
|
|
//! Maps the yyjson tag (type | subtype) of 'val' to a LogicalTypeId.
//! Throws an InternalException for any tag that is not handled below.
static inline LogicalTypeId ValTypeToLogicalTypeId(yyjson_val *val) {
	const auto tag = yyjson_get_tag(val);
	switch (tag) {
	// Nested types
	case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
		return LogicalTypeId::STRUCT;
	case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
		return LogicalTypeId::LIST;
	// Strings, escaped or not
	case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
	case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NOESC:
		return LogicalTypeId::VARCHAR;
	// Numbers; raw values are treated as DOUBLE, just like reals
	case YYJSON_TYPE_RAW | YYJSON_SUBTYPE_NONE:
	case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_REAL:
		return LogicalTypeId::DOUBLE;
	case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_SINT:
		return LogicalTypeId::BIGINT;
	case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_UINT:
		return LogicalTypeId::UBIGINT;
	// Booleans and null
	case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_FALSE:
	case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_TRUE:
		return LogicalTypeId::BOOLEAN;
	case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
		return LogicalTypeId::SQLNULL;
	default:
		throw InternalException("Unexpected yyjson tag in ValTypeToLogicalTypeId");
	}
}
|
|
|
|
public:
|
|
//===--------------------------------------------------------------------===//
|
|
// Document creation / reading / writing
|
|
//===--------------------------------------------------------------------===//
|
|
template <class T>
|
|
static T *AllocateArray(yyjson_alc *alc, idx_t count) {
|
|
return reinterpret_cast<T *>(alc->malloc(alc->ctx, sizeof(T) * count));
|
|
}
|
|
|
|
template <class T>
|
|
static T *AllocateArray(yyjson_mut_doc *doc, idx_t count) {
|
|
return AllocateArray<T>(&doc->alc, count);
|
|
}
|
|
|
|
//! Creates a new mutable yyjson document that allocates through 'alc' (must not be null)
static inline yyjson_mut_doc *CreateDocument(yyjson_alc *alc) {
	D_ASSERT(alc);
	auto *new_doc = yyjson_mut_doc_new(alc);
	return new_doc;
}
|
|
//! Parses 'size' bytes starting at 'data' with yyjson_read_opts and returns its result unchecked.
//! "Unsafe": nothing is thrown here, the caller has to inspect the result and/or 'err'.
static inline yyjson_doc *ReadDocumentUnsafe(char *data, idx_t size, const yyjson_read_flag flg, yyjson_alc *alc,
                                             yyjson_read_err *err = nullptr) {
	D_ASSERT(alc);
	auto *parsed = yyjson_read_opts(data, size, flg, alc, err);
	return parsed;
}
|
|
//! string_t convenience overload: forwards the string's data pointer and size to the char* overload
static inline yyjson_doc *ReadDocumentUnsafe(const string_t &input, const yyjson_read_flag flg, yyjson_alc *alc,
                                             yyjson_read_err *err = nullptr) {
	auto *json_data = input.GetDataWriteable();
	const auto json_size = input.GetSize();
	return ReadDocumentUnsafe(json_data, json_size, flg, alc, err);
}
|
|
//! Parses 'size' bytes starting at 'data'; if yyjson reports anything other than YYJSON_READ_SUCCESS,
//! the failure is raised through ThrowParseError
static inline yyjson_doc *ReadDocument(char *data, idx_t size, const yyjson_read_flag flg, yyjson_alc *alc) {
	yyjson_read_err error;
	auto *parsed = ReadDocumentUnsafe(data, size, flg, alc, &error);
	const bool parse_failed = error.code != YYJSON_READ_SUCCESS;
	if (parse_failed) {
		ThrowParseError(data, size, error);
	}
	return parsed;
}
|
|
//! string_t convenience overload: forwards the string's data pointer and size to the char* overload
static inline yyjson_doc *ReadDocument(const string_t &input, const yyjson_read_flag flg, yyjson_alc *alc) {
	auto *json_data = input.GetDataWriteable();
	const auto json_size = input.GetSize();
	return ReadDocument(json_data, json_size, flg, alc);
}
|
|
|
|
//! Builds the "Malformed JSON" message for a failed parse.
//! The message holds the error position and message from 'error', the caller-supplied 'extra' text,
//! and a preview of the input: the full input up to 50 bytes, otherwise its first 47 bytes plus "...".
static string FormatParseError(const char *data, idx_t length, yyjson_read_err &error, const string &extra = "") {
	D_ASSERT(error.code != YYJSON_READ_SUCCESS);
	// Only show a short preview, so we don't print megabytes worth of JSON
	string input;
	if (length > 50) {
		input = string(data, 47) + "...";
	} else {
		input = string(data, length);
	}
	// Carriage returns are escaped, otherwise the output is unreadable
	input = StringUtil::Replace(input, "\r", "\\r");
	return StringUtil::Format("Malformed JSON at byte %lld of input: %s. %s Input: \"%s\"", error.pos, error.msg,
	                          extra, input);
}
|
|
static void ThrowParseError(const char *data, idx_t length, yyjson_read_err &error, const string &extra = "") {
|
|
throw InvalidInputException(FormatParseError(data, length, error, extra));
|
|
}
|
|
|
|
template <class YYJSON_VAL_T>
|
|
static inline char *WriteVal(YYJSON_VAL_T *val, yyjson_alc *alc, idx_t &len) {
|
|
throw InternalException("Unknown yyjson val type");
|
|
}
|
|
template <class YYJSON_VAL_T>
|
|
static inline string_t WriteVal(YYJSON_VAL_T *val, yyjson_alc *alc) {
|
|
D_ASSERT(alc);
|
|
idx_t len;
|
|
auto data = WriteVal<YYJSON_VAL_T>(val, alc, len);
|
|
return string_t(data, len);
|
|
}
|
|
|
|
//! Slow and easy ToString for errors
|
|
static string ValToString(yyjson_val *val, idx_t max_len = DConstants::INVALID_INDEX);
|
|
//! Throw an error with the printed yyjson_val
|
|
static void ThrowValFormatError(string error_string, yyjson_val *val);
|
|
|
|
public:
|
|
//===--------------------------------------------------------------------===//
|
|
// JSON pointer / path
|
|
//===--------------------------------------------------------------------===//
|
|
//! Kind of JSON path, as returned by ValidatePath
enum class JSONPathType : uint8_t {
	//! The path selects a single value
	REGULAR = 0,
	//! The path contains a '*' wildcard, so it can select multiple values
	WILDCARD = 1
};
|
|
|
|
//! Get JSON value using JSON path query (safe, checks the path query)
|
|
static inline yyjson_val *Get(yyjson_val *val, const string_t &path_str, bool integral_argument) {
	const auto path_data = path_str.GetData();
	const auto path_size = path_str.GetSize();

	// Empty path: nothing to validate
	if (path_size == 0) {
		return GetUnsafe(val, path_data, path_size);
	}

	// Integral argument: the text is used as an array index, i.e. "$[<path_str>]"
	if (integral_argument) {
		const auto index_path = "$[" + path_str.GetString() + "]";
		return GetUnsafe(val, index_path.c_str(), index_path.length());
	}

	const char first_char = *path_data;
	if (first_char == '/') {
		// '/' notation must be '\0'-terminated, so go through a std::string copy
		const auto pointer = string(path_data, path_size);
		return GetUnsafe(val, pointer.c_str(), path_size);
	}
	if (first_char == '$') {
		// '$' notation is validated first; wildcards are rejected by this function
		if (ValidatePath(path_data, path_size, false) == JSONPathType::WILDCARD) {
			throw InvalidInputException(
			    "JSON path cannot contain wildcards if the path is not a constant parameter");
		}
		return GetUnsafe(val, path_data, path_size);
	}

	// Neither pointer nor path notation: treat the whole input as a single key
	string key_path;
	if (memchr(path_data, '"', path_size)) {
		// The key contains a double quote, so it cannot be wrapped in quotes: use '/' notation
		key_path = "/" + string(path_data, path_size);
	} else {
		key_path = "$.\"" + path_str.GetString() + "\"";
	}
	return GetUnsafe(val, key_path.c_str(), key_path.length());
}
|
|
|
|
//! Get JSON value using JSON path query (unsafe)
|
|
static inline yyjson_val *GetUnsafe(yyjson_val *val, const char *ptr, const idx_t &len) {
	// An empty query refers to the value itself
	if (len == 0) {
		return val;
	}
	// The first character selects the notation: '/' for JSON pointer, '$' for JSON path
	const char notation = *ptr;
	if (notation == '/') {
		return GetPointer(val, ptr, len);
	}
	if (notation == '$') {
		return GetPath(val, ptr, len);
	}
	throw InternalException("JSON pointer/path does not start with '/' or '$'");
}
|
|
|
|
//! Get JSON values using a JSON path query that may contain '*' wildcards (unsafe), results are written to 'vals'
|
|
static void GetWildcardPath(yyjson_val *val, const char *ptr, const idx_t &len, vector<yyjson_val *> &vals);
|
|
|
|
//! Validate JSON Path ($.field[index]... syntax), returns JSONPathType::WILDCARD if there are wildcards in the path
|
|
static JSONPathType ValidatePath(const char *ptr, const idx_t &len, const bool binder);
|
|
|
|
public:
|
|
//! Same as BigQuery json_value
|
|
static inline string_t JSONValue(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &mask, idx_t idx) {
	const auto tag = yyjson_get_tag(val);
	const bool yields_null = tag == (YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE) ||
	                         tag == (YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE) ||
	                         tag == (YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE);
	if (yields_null) {
		// JSON null, arrays and objects produce NULL: row 'idx' is marked invalid
		mask.SetInvalid(idx);
		return string_t {};
	}
	// Everything else is written out as JSON text
	return JSONCommon::WriteVal<yyjson_val>(val, alc);
}
|
|
|
|
private:
|
|
//! Get JSON pointer (/field/index/... syntax)
|
|
static inline yyjson_val *GetPointer(yyjson_val *val, const char *ptr, const idx_t &len) {
	// The error details are not inspected, only the lookup result is returned
	yyjson_ptr_err lookup_err;
	auto *found = unsafe_yyjson_ptr_getx(val, ptr, len, &lookup_err);
	return found;
}
|
|
//! Get JSON path ($.field[index]... syntax)
|
|
static yyjson_val *GetPath(yyjson_val *val, const char *ptr, const idx_t &len);
|
|
};
|
|
|
|
template <>
|
|
inline char *JSONCommon::WriteVal(yyjson_val *val, yyjson_alc *alc, idx_t &len) {
|
|
size_t len_size_t;
|
|
// yyjson_val_write_opts must not throw
|
|
auto ret = yyjson_val_write_opts(val, JSONCommon::WRITE_FLAG, alc, &len_size_t, nullptr);
|
|
len = len_size_t;
|
|
return ret;
|
|
}
|
|
template <>
|
|
inline char *JSONCommon::WriteVal(yyjson_mut_val *val, yyjson_alc *alc, idx_t &len) {
|
|
size_t len_size_t;
|
|
// yyjson_mut_val_write_opts must not throw
|
|
auto ret = yyjson_mut_val_write_opts(val, JSONCommon::WRITE_FLAG, alc, &len_size_t, nullptr);
|
|
len = len_size_t;
|
|
return ret;
|
|
}
|
|
|
|
struct yyjson_doc_deleter {
|
|
void operator()(yyjson_doc *doc) {
|
|
if (doc) {
|
|
yyjson_doc_free(doc);
|
|
}
|
|
}
|
|
};
|
|
|
|
struct yyjson_mut_doc_deleter {
|
|
void operator()(yyjson_mut_doc *doc) {
|
|
if (doc) {
|
|
yyjson_mut_doc_free(doc);
|
|
}
|
|
}
|
|
};
|
|
|
|
//! unique_ptr to a yyjson_doc that is released through yyjson_doc_deleter
using yyjson_doc_ptr = unique_ptr<yyjson_doc, yyjson_doc_deleter>;
|
|
//! unique_ptr to a yyjson_mut_doc that is released through yyjson_mut_doc_deleter
using yyjson_mut_doc_ptr = unique_ptr<yyjson_mut_doc, yyjson_mut_doc_deleter>;
|
|
|
|
} // namespace duckdb
|