should be it
This commit is contained in:
36
external/duckdb/extension/json/include/json.json
vendored
Normal file
36
external/duckdb/extension/json/include/json.json
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
[
|
||||
{
|
||||
"class": "JSONTransformOptions",
|
||||
"includes": [
|
||||
"json_transform.hpp"
|
||||
],
|
||||
"members": [
|
||||
{
|
||||
"id": 100,
|
||||
"name": "strict_cast",
|
||||
"type": "bool"
|
||||
},
|
||||
{
|
||||
"id": 101,
|
||||
"name": "error_duplicate_key",
|
||||
"type": "bool"
|
||||
},
|
||||
{
|
||||
"id": 102,
|
||||
"name": "error_missing_key",
|
||||
"type": "bool"
|
||||
},
|
||||
{
|
||||
"id": 103,
|
||||
"name": "error_unknown_key",
|
||||
"type": "bool"
|
||||
},
|
||||
{
|
||||
"id": 104,
|
||||
"name": "delay_error",
|
||||
"type": "bool"
|
||||
}
|
||||
],
|
||||
"pointer_type": "none"
|
||||
}
|
||||
]
|
||||
388
external/duckdb/extension/json/include/json_common.hpp
vendored
Normal file
388
external/duckdb/extension/json/include/json_common.hpp
vendored
Normal file
@@ -0,0 +1,388 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// json_common.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "duckdb/common/operator/cast_operators.hpp"
|
||||
#include "duckdb/common/operator/decimal_cast_operators.hpp"
|
||||
#include "duckdb/common/operator/string_cast.hpp"
|
||||
#include "duckdb/planner/expression/bound_function_expression.hpp"
|
||||
#include "yyjson.hpp"
|
||||
#include "duckdb/common/types/blob.hpp"
|
||||
|
||||
using namespace duckdb_yyjson; // NOLINT
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
class JSONAllocator;
|
||||
|
||||
class JSONStringVectorBuffer : public VectorBuffer {
|
||||
public:
|
||||
explicit JSONStringVectorBuffer(shared_ptr<JSONAllocator> allocator_p)
|
||||
: VectorBuffer(VectorBufferType::OPAQUE_BUFFER), allocator(std::move(allocator_p)) {
|
||||
}
|
||||
|
||||
private:
|
||||
shared_ptr<JSONAllocator> allocator;
|
||||
};
|
||||
|
||||
//! JSON allocator is a custom allocator for yyjson that prevents many tiny allocations
|
||||
class JSONAllocator : public enable_shared_from_this<JSONAllocator> {
|
||||
public:
|
||||
explicit JSONAllocator(Allocator &allocator)
|
||||
: arena_allocator(allocator), yyjson_allocator({Allocate, Reallocate, Free, this}) {
|
||||
}
|
||||
|
||||
inline yyjson_alc *GetYYAlc() {
|
||||
return &yyjson_allocator;
|
||||
}
|
||||
|
||||
void Reset() {
|
||||
arena_allocator.Reset();
|
||||
}
|
||||
|
||||
void AddBuffer(Vector &vector) {
|
||||
if (vector.GetType().InternalType() == PhysicalType::VARCHAR) {
|
||||
StringVector::AddBuffer(vector, make_buffer<JSONStringVectorBuffer>(shared_from_this()));
|
||||
}
|
||||
}
|
||||
|
||||
static void AddBuffer(Vector &vector, yyjson_alc *alc) {
|
||||
auto alloc = (JSONAllocator *)alc->ctx; // NOLINT
|
||||
alloc->AddBuffer(vector);
|
||||
}
|
||||
|
||||
private:
|
||||
static inline void *Allocate(void *ctx, size_t size) {
|
||||
auto alloc = (JSONAllocator *)ctx; // NOLINT
|
||||
return alloc->arena_allocator.AllocateAligned(size);
|
||||
}
|
||||
|
||||
static inline void *Reallocate(void *ctx, void *ptr, size_t old_size, size_t size) {
|
||||
auto alloc = (JSONAllocator *)ctx; // NOLINT
|
||||
return alloc->arena_allocator.ReallocateAligned(data_ptr_cast(ptr), old_size, size);
|
||||
}
|
||||
|
||||
static inline void Free(void *ctx, void *ptr) {
|
||||
// NOP because ArenaAllocator can't free
|
||||
}
|
||||
|
||||
private:
|
||||
ArenaAllocator arena_allocator;
|
||||
yyjson_alc yyjson_allocator;
|
||||
};
|
||||
|
||||
//! JSONKey / json_key_map_t speeds up mapping from JSON key to column ID
|
||||
struct JSONKey {
|
||||
const char *ptr;
|
||||
size_t len;
|
||||
};
|
||||
|
||||
struct JSONKeyHash {
|
||||
inline std::size_t operator()(const JSONKey &k) const {
|
||||
size_t result;
|
||||
if (k.len >= sizeof(size_t)) {
|
||||
memcpy(&result, k.ptr + k.len - sizeof(size_t), sizeof(size_t));
|
||||
} else {
|
||||
result = 0;
|
||||
FastMemcpy(&result, k.ptr, k.len);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
struct JSONKeyEquality {
|
||||
inline bool operator()(const JSONKey &a, const JSONKey &b) const {
|
||||
if (a.len != b.len) {
|
||||
return false;
|
||||
}
|
||||
return FastMemcmp(a.ptr, b.ptr, a.len) == 0;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
using json_key_map_t = unordered_map<JSONKey, T, JSONKeyHash, JSONKeyEquality>;
|
||||
using json_key_set_t = unordered_set<JSONKey, JSONKeyHash, JSONKeyEquality>;
|
||||
|
||||
//! Common JSON functionality for most JSON functions
|
||||
struct JSONCommon {
|
||||
public:
|
||||
//! Read/Write flags
|
||||
static constexpr auto READ_FLAG =
|
||||
YYJSON_READ_ALLOW_INF_AND_NAN | YYJSON_READ_ALLOW_TRAILING_COMMAS | YYJSON_READ_BIGNUM_AS_RAW;
|
||||
static constexpr auto READ_STOP_FLAG = READ_FLAG | YYJSON_READ_STOP_WHEN_DONE;
|
||||
static constexpr auto READ_INSITU_FLAG = READ_STOP_FLAG | YYJSON_READ_INSITU;
|
||||
static constexpr auto WRITE_FLAG = YYJSON_WRITE_ALLOW_INF_AND_NAN;
|
||||
static constexpr auto WRITE_PRETTY_FLAG = YYJSON_WRITE_ALLOW_INF_AND_NAN | YYJSON_WRITE_PRETTY;
|
||||
|
||||
public:
|
||||
//! Constant JSON type strings
|
||||
static constexpr char const *TYPE_STRING_NULL = "NULL";
|
||||
static constexpr char const *TYPE_STRING_BOOLEAN = "BOOLEAN";
|
||||
static constexpr char const *TYPE_STRING_BIGINT = "BIGINT";
|
||||
static constexpr char const *TYPE_STRING_UBIGINT = "UBIGINT";
|
||||
static constexpr char const *TYPE_STRING_DOUBLE = "DOUBLE";
|
||||
static constexpr char const *TYPE_STRING_HUGEINT = "HUGEINT";
|
||||
static constexpr char const *TYPE_STRING_VARCHAR = "VARCHAR";
|
||||
static constexpr char const *TYPE_STRING_ARRAY = "ARRAY";
|
||||
static constexpr char const *TYPE_STRING_OBJECT = "OBJECT";
|
||||
|
||||
static inline const char *ValTypeToString(yyjson_val *val) {
|
||||
switch (yyjson_get_tag(val)) {
|
||||
case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
|
||||
return TYPE_STRING_NULL;
|
||||
case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NOESC:
|
||||
case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
|
||||
return TYPE_STRING_VARCHAR;
|
||||
case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
|
||||
return TYPE_STRING_ARRAY;
|
||||
case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
|
||||
return TYPE_STRING_OBJECT;
|
||||
case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_TRUE:
|
||||
case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_FALSE:
|
||||
return TYPE_STRING_BOOLEAN;
|
||||
case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_UINT:
|
||||
return TYPE_STRING_UBIGINT;
|
||||
case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_SINT:
|
||||
return TYPE_STRING_BIGINT;
|
||||
case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_REAL:
|
||||
case YYJSON_TYPE_RAW | YYJSON_SUBTYPE_NONE:
|
||||
return TYPE_STRING_DOUBLE;
|
||||
default:
|
||||
throw InternalException("Unexpected yyjson tag in ValTypeToString");
|
||||
}
|
||||
}
|
||||
|
||||
static inline string_t ValTypeToStringT(yyjson_val *val) {
|
||||
return string_t(ValTypeToString(val));
|
||||
}
|
||||
|
||||
static inline LogicalTypeId ValTypeToLogicalTypeId(yyjson_val *val) {
|
||||
switch (yyjson_get_tag(val)) {
|
||||
case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
|
||||
return LogicalTypeId::SQLNULL;
|
||||
case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NOESC:
|
||||
case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
|
||||
return LogicalTypeId::VARCHAR;
|
||||
case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
|
||||
return LogicalTypeId::LIST;
|
||||
case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
|
||||
return LogicalTypeId::STRUCT;
|
||||
case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_TRUE:
|
||||
case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_FALSE:
|
||||
return LogicalTypeId::BOOLEAN;
|
||||
case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_UINT:
|
||||
return LogicalTypeId::UBIGINT;
|
||||
case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_SINT:
|
||||
return LogicalTypeId::BIGINT;
|
||||
case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_REAL:
|
||||
case YYJSON_TYPE_RAW | YYJSON_SUBTYPE_NONE:
|
||||
return LogicalTypeId::DOUBLE;
|
||||
default:
|
||||
throw InternalException("Unexpected yyjson tag in ValTypeToLogicalTypeId");
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
//===--------------------------------------------------------------------===//
|
||||
// Document creation / reading / writing
|
||||
//===--------------------------------------------------------------------===//
|
||||
template <class T>
|
||||
static T *AllocateArray(yyjson_alc *alc, idx_t count) {
|
||||
return reinterpret_cast<T *>(alc->malloc(alc->ctx, sizeof(T) * count));
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static T *AllocateArray(yyjson_mut_doc *doc, idx_t count) {
|
||||
return AllocateArray<T>(&doc->alc, count);
|
||||
}
|
||||
|
||||
static inline yyjson_mut_doc *CreateDocument(yyjson_alc *alc) {
|
||||
D_ASSERT(alc);
|
||||
return yyjson_mut_doc_new(alc);
|
||||
}
|
||||
static inline yyjson_doc *ReadDocumentUnsafe(char *data, idx_t size, const yyjson_read_flag flg, yyjson_alc *alc,
|
||||
yyjson_read_err *err = nullptr) {
|
||||
D_ASSERT(alc);
|
||||
return yyjson_read_opts(data, size, flg, alc, err);
|
||||
}
|
||||
static inline yyjson_doc *ReadDocumentUnsafe(const string_t &input, const yyjson_read_flag flg, yyjson_alc *alc,
|
||||
yyjson_read_err *err = nullptr) {
|
||||
return ReadDocumentUnsafe(input.GetDataWriteable(), input.GetSize(), flg, alc, err);
|
||||
}
|
||||
static inline yyjson_doc *ReadDocument(char *data, idx_t size, const yyjson_read_flag flg, yyjson_alc *alc) {
|
||||
yyjson_read_err error;
|
||||
auto result = ReadDocumentUnsafe(data, size, flg, alc, &error);
|
||||
if (error.code != YYJSON_READ_SUCCESS) {
|
||||
ThrowParseError(data, size, error);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
static inline yyjson_doc *ReadDocument(const string_t &input, const yyjson_read_flag flg, yyjson_alc *alc) {
|
||||
return ReadDocument(input.GetDataWriteable(), input.GetSize(), flg, alc);
|
||||
}
|
||||
|
||||
static string FormatParseError(const char *data, idx_t length, yyjson_read_err &error, const string &extra = "") {
|
||||
D_ASSERT(error.code != YYJSON_READ_SUCCESS);
|
||||
// Truncate, so we don't print megabytes worth of JSON
|
||||
auto input = length > 50 ? string(data, 47) + "..." : string(data, length);
|
||||
// Have to replace \r, otherwise output is unreadable
|
||||
input = StringUtil::Replace(input, "\r", "\\r");
|
||||
return StringUtil::Format("Malformed JSON at byte %lld of input: %s. %s Input: \"%s\"", error.pos, error.msg,
|
||||
extra, input);
|
||||
}
|
||||
static void ThrowParseError(const char *data, idx_t length, yyjson_read_err &error, const string &extra = "") {
|
||||
throw InvalidInputException(FormatParseError(data, length, error, extra));
|
||||
}
|
||||
|
||||
template <class YYJSON_VAL_T>
|
||||
static inline char *WriteVal(YYJSON_VAL_T *val, yyjson_alc *alc, idx_t &len) {
|
||||
throw InternalException("Unknown yyjson val type");
|
||||
}
|
||||
template <class YYJSON_VAL_T>
|
||||
static inline string_t WriteVal(YYJSON_VAL_T *val, yyjson_alc *alc) {
|
||||
D_ASSERT(alc);
|
||||
idx_t len;
|
||||
auto data = WriteVal<YYJSON_VAL_T>(val, alc, len);
|
||||
return string_t(data, len);
|
||||
}
|
||||
|
||||
//! Slow and easy ToString for errors
|
||||
static string ValToString(yyjson_val *val, idx_t max_len = DConstants::INVALID_INDEX);
|
||||
//! Throw an error with the printed yyjson_val
|
||||
static void ThrowValFormatError(string error_string, yyjson_val *val);
|
||||
|
||||
public:
|
||||
//===--------------------------------------------------------------------===//
|
||||
// JSON pointer / path
|
||||
//===--------------------------------------------------------------------===//
|
||||
enum class JSONPathType : uint8_t {
|
||||
//! Extract a single value
|
||||
REGULAR = 0,
|
||||
//! Extract multiple values (when we have a '*' wildcard in the JSON Path)
|
||||
WILDCARD = 1,
|
||||
};
|
||||
|
||||
//! Get JSON value using JSON path query (safe, checks the path query)
|
||||
static inline yyjson_val *Get(yyjson_val *val, const string_t &path_str, bool integral_argument) {
|
||||
auto ptr = path_str.GetData();
|
||||
auto len = path_str.GetSize();
|
||||
if (len == 0) {
|
||||
return GetUnsafe(val, ptr, len);
|
||||
}
|
||||
if (integral_argument) {
|
||||
auto str = "$[" + path_str.GetString() + "]";
|
||||
return GetUnsafe(val, str.c_str(), str.length());
|
||||
}
|
||||
switch (*ptr) {
|
||||
case '/': {
|
||||
// '/' notation must be '\0'-terminated
|
||||
auto str = string(ptr, len);
|
||||
return GetUnsafe(val, str.c_str(), len);
|
||||
}
|
||||
case '$': {
|
||||
if (ValidatePath(ptr, len, false) == JSONPathType::WILDCARD) {
|
||||
throw InvalidInputException(
|
||||
"JSON path cannot contain wildcards if the path is not a constant parameter");
|
||||
}
|
||||
return GetUnsafe(val, ptr, len);
|
||||
}
|
||||
default: {
|
||||
string path;
|
||||
if (memchr(ptr, '"', len)) {
|
||||
path = "/" + string(ptr, len);
|
||||
} else {
|
||||
path = "$.\"" + path_str.GetString() + "\"";
|
||||
}
|
||||
return GetUnsafe(val, path.c_str(), path.length());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//! Get JSON value using JSON path query (unsafe)
|
||||
static inline yyjson_val *GetUnsafe(yyjson_val *val, const char *ptr, const idx_t &len) {
|
||||
if (len == 0) {
|
||||
return val;
|
||||
}
|
||||
switch (*ptr) {
|
||||
case '/':
|
||||
return GetPointer(val, ptr, len);
|
||||
case '$':
|
||||
return GetPath(val, ptr, len);
|
||||
default:
|
||||
throw InternalException("JSON pointer/path does not start with '/' or '$'");
|
||||
}
|
||||
}
|
||||
|
||||
//! Get JSON value using JSON path query (unsafe)
|
||||
static void GetWildcardPath(yyjson_val *val, const char *ptr, const idx_t &len, vector<yyjson_val *> &vals);
|
||||
|
||||
//! Validate JSON Path ($.field[index]... syntax), returns true if there are wildcards in the path
|
||||
static JSONPathType ValidatePath(const char *ptr, const idx_t &len, const bool binder);
|
||||
|
||||
public:
|
||||
//! Same as BigQuery json_value
|
||||
static inline string_t JSONValue(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &mask, idx_t idx) {
|
||||
switch (yyjson_get_tag(val)) {
|
||||
case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
|
||||
case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
|
||||
case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
|
||||
mask.SetInvalid(idx);
|
||||
return string_t {};
|
||||
default:
|
||||
return JSONCommon::WriteVal<yyjson_val>(val, alc);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
//! Get JSON pointer (/field/index/... syntax)
|
||||
static inline yyjson_val *GetPointer(yyjson_val *val, const char *ptr, const idx_t &len) {
|
||||
yyjson_ptr_err err;
|
||||
return unsafe_yyjson_ptr_getx(val, ptr, len, &err);
|
||||
}
|
||||
//! Get JSON path ($.field[index]... syntax)
|
||||
static yyjson_val *GetPath(yyjson_val *val, const char *ptr, const idx_t &len);
|
||||
};
|
||||
|
||||
template <>
|
||||
inline char *JSONCommon::WriteVal(yyjson_val *val, yyjson_alc *alc, idx_t &len) {
|
||||
size_t len_size_t;
|
||||
// yyjson_val_write_opts must not throw
|
||||
auto ret = yyjson_val_write_opts(val, JSONCommon::WRITE_FLAG, alc, &len_size_t, nullptr);
|
||||
len = len_size_t;
|
||||
return ret;
|
||||
}
|
||||
template <>
|
||||
inline char *JSONCommon::WriteVal(yyjson_mut_val *val, yyjson_alc *alc, idx_t &len) {
|
||||
size_t len_size_t;
|
||||
// yyjson_mut_val_write_opts must not throw
|
||||
auto ret = yyjson_mut_val_write_opts(val, JSONCommon::WRITE_FLAG, alc, &len_size_t, nullptr);
|
||||
len = len_size_t;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct yyjson_doc_deleter {
|
||||
void operator()(yyjson_doc *doc) {
|
||||
if (doc) {
|
||||
yyjson_doc_free(doc);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct yyjson_mut_doc_deleter {
|
||||
void operator()(yyjson_mut_doc *doc) {
|
||||
if (doc) {
|
||||
yyjson_mut_doc_free(doc);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
using yyjson_doc_ptr = unique_ptr<yyjson_doc, yyjson_doc_deleter>;
|
||||
using yyjson_mut_doc_ptr = unique_ptr<yyjson_mut_doc, yyjson_mut_doc_deleter>;
|
||||
|
||||
} // namespace duckdb
|
||||
82
external/duckdb/extension/json/include/json_deserializer.hpp
vendored
Normal file
82
external/duckdb/extension/json/include/json_deserializer.hpp
vendored
Normal file
@@ -0,0 +1,82 @@
|
||||
#pragma once
|
||||
#include "json_common.hpp"
|
||||
#include "duckdb/common/serializer/deserializer.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
class JsonDeserializer : public Deserializer {
|
||||
public:
|
||||
JsonDeserializer(yyjson_val *val, const yyjson_doc_ptr &doc) : doc(doc.get()) {
|
||||
deserialize_enum_from_string = true;
|
||||
stack.emplace_back(val);
|
||||
}
|
||||
|
||||
private:
|
||||
struct StackFrame {
|
||||
yyjson_val *val;
|
||||
yyjson_arr_iter arr_iter;
|
||||
explicit StackFrame(yyjson_val *val) : val(val) {
|
||||
yyjson_arr_iter_init(val, &arr_iter);
|
||||
}
|
||||
};
|
||||
|
||||
yyjson_doc *doc;
|
||||
const char *current_tag = nullptr;
|
||||
vector<StackFrame> stack;
|
||||
|
||||
void DumpDoc();
|
||||
void DumpCurrent();
|
||||
void Dump(yyjson_mut_val *val);
|
||||
void Dump(yyjson_val *val);
|
||||
|
||||
// Get the current json value
|
||||
inline StackFrame &Current() {
|
||||
return stack.back();
|
||||
};
|
||||
|
||||
inline void Push(yyjson_val *val) {
|
||||
stack.emplace_back(val);
|
||||
}
|
||||
inline void Pop() {
|
||||
stack.pop_back();
|
||||
}
|
||||
yyjson_val *GetNextValue();
|
||||
|
||||
void ThrowTypeError(yyjson_val *val, const char *expected);
|
||||
|
||||
//===--------------------------------------------------------------------===//
|
||||
// Nested Types Hooks
|
||||
//===--------------------------------------------------------------------===//
|
||||
void OnPropertyBegin(const field_id_t field_id, const char *tag) final;
|
||||
void OnPropertyEnd() final;
|
||||
bool OnOptionalPropertyBegin(const field_id_t field_id, const char *tag) final;
|
||||
void OnOptionalPropertyEnd(bool present) final;
|
||||
|
||||
void OnObjectBegin() final;
|
||||
void OnObjectEnd() final;
|
||||
idx_t OnListBegin() final;
|
||||
void OnListEnd() final;
|
||||
bool OnNullableBegin() final;
|
||||
void OnNullableEnd() final;
|
||||
|
||||
//===--------------------------------------------------------------------===//
|
||||
// Primitive Types
|
||||
//===--------------------------------------------------------------------===//
|
||||
bool ReadBool() final;
|
||||
int8_t ReadSignedInt8() final;
|
||||
uint8_t ReadUnsignedInt8() final;
|
||||
int16_t ReadSignedInt16() final;
|
||||
uint16_t ReadUnsignedInt16() final;
|
||||
int32_t ReadSignedInt32() final;
|
||||
uint32_t ReadUnsignedInt32() final;
|
||||
int64_t ReadSignedInt64() final;
|
||||
uint64_t ReadUnsignedInt64() final;
|
||||
float ReadFloat() final;
|
||||
double ReadDouble() final;
|
||||
string ReadString() final;
|
||||
hugeint_t ReadHugeInt() final;
|
||||
uhugeint_t ReadUhugeInt() final;
|
||||
void ReadDataPtr(data_ptr_t &ptr, idx_t count) final;
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
60
external/duckdb/extension/json/include/json_enums.hpp
vendored
Normal file
60
external/duckdb/extension/json/include/json_enums.hpp
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// This file is automatically generated by scripts/generate_enums.py
|
||||
// Do not edit this file manually, your changes will be overwritten
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "duckdb/common/constants.hpp"
|
||||
#include "duckdb/common/enum_util.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
enum class JSONScanType : uint8_t {
|
||||
INVALID = 0,
|
||||
//! Read JSON straight to columnar data
|
||||
READ_JSON = 1,
|
||||
//! Read JSON values as strings
|
||||
READ_JSON_OBJECTS = 2,
|
||||
//! Sample run for schema detection
|
||||
SAMPLE = 3,
|
||||
};
|
||||
|
||||
enum class JSONRecordType : uint8_t {
|
||||
AUTO_DETECT = 0,
|
||||
//! Sequential objects that are unpacked
|
||||
RECORDS = 1,
|
||||
//! Any other JSON type, e.g., ARRAY
|
||||
VALUES = 2,
|
||||
};
|
||||
|
||||
enum class JSONFormat : uint8_t {
|
||||
//! Auto-detect format (UNSTRUCTURED / NEWLINE_DELIMITED)
|
||||
AUTO_DETECT = 0,
|
||||
//! One unit after another, newlines can be anywhere
|
||||
UNSTRUCTURED = 1,
|
||||
//! Units are separated by newlines, newlines do not occur within Units (NDJSON)
|
||||
NEWLINE_DELIMITED = 2,
|
||||
//! File is one big array of units
|
||||
ARRAY = 3,
|
||||
};
|
||||
|
||||
template<>
|
||||
const char* EnumUtil::ToChars<JSONScanType>(JSONScanType value);
|
||||
|
||||
template<>
|
||||
JSONScanType EnumUtil::FromString<JSONScanType>(const char *value);
|
||||
|
||||
template<>
|
||||
const char* EnumUtil::ToChars<JSONRecordType>(JSONRecordType value);
|
||||
|
||||
template<>
|
||||
JSONRecordType EnumUtil::FromString<JSONRecordType>(const char *value);
|
||||
|
||||
template<>
|
||||
const char* EnumUtil::ToChars<JSONFormat>(JSONFormat value);
|
||||
|
||||
template<>
|
||||
JSONFormat EnumUtil::FromString<JSONFormat>(const char *value);
|
||||
|
||||
} // namespace duckdb
|
||||
55
external/duckdb/extension/json/include/json_enums.json
vendored
Normal file
55
external/duckdb/extension/json/include/json_enums.json
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
[
|
||||
{
|
||||
"name": "JSONScanType",
|
||||
"values": [
|
||||
"INVALID",
|
||||
{
|
||||
"name": "READ_JSON",
|
||||
"comment": "Read JSON straight to columnar data"
|
||||
},
|
||||
{
|
||||
"name": "READ_JSON_OBJECTS",
|
||||
"comment": "Read JSON values as strings"
|
||||
},
|
||||
{
|
||||
"name": "SAMPLE",
|
||||
"comment": "Sample run for schema detection"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "JSONRecordType",
|
||||
"values": [
|
||||
"AUTO_DETECT",
|
||||
{
|
||||
"name": "RECORDS",
|
||||
"comment": "Sequential objects that are unpacked"
|
||||
},
|
||||
{
|
||||
"name": "VALUES",
|
||||
"comment": "Any other JSON type, e.g., ARRAY"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "JSONFormat",
|
||||
"values": [
|
||||
{
|
||||
"name": "AUTO_DETECT",
|
||||
"comment": "Auto-detect format (UNSTRUCTURED / NEWLINE_DELIMITED)"
|
||||
},
|
||||
{
|
||||
"name": "UNSTRUCTURED",
|
||||
"comment": "One unit after another, newlines can be anywhere"
|
||||
},
|
||||
{
|
||||
"name": "NEWLINE_DELIMITED",
|
||||
"comment": "Units are separated by newlines, newlines do not occur within Units (NDJSON)"
|
||||
},
|
||||
{
|
||||
"name": "ARRAY",
|
||||
"comment": "File is one big array of units"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
180
external/duckdb/extension/json/include/json_executors.hpp
vendored
Normal file
180
external/duckdb/extension/json/include/json_executors.hpp
vendored
Normal file
@@ -0,0 +1,180 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// json_executors.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "duckdb/common/vector_operations/vector_operations.hpp"
|
||||
#include "duckdb/execution/expression_executor.hpp"
|
||||
#include "json_functions.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
template <class T>
|
||||
using json_function_t = std::function<T(yyjson_val *, yyjson_alc *, Vector &, ValidityMask &, idx_t)>;
|
||||
|
||||
struct JSONExecutors {
|
||||
public:
|
||||
//! Single-argument JSON read function, i.e. json_type('[1, 2, 3]')
|
||||
template <class T>
|
||||
static void UnaryExecute(DataChunk &args, ExpressionState &state, Vector &result, const json_function_t<T> fun) {
|
||||
auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
|
||||
auto alc = lstate.json_allocator->GetYYAlc();
|
||||
|
||||
auto &inputs = args.data[0];
|
||||
UnaryExecutor::ExecuteWithNulls<string_t, T>(
|
||||
inputs, result, args.size(), [&](string_t input, ValidityMask &mask, idx_t idx) {
|
||||
auto doc = JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc);
|
||||
return fun(doc->root, alc, result, mask, idx);
|
||||
});
|
||||
|
||||
JSONAllocator::AddBuffer(result, alc);
|
||||
}
|
||||
|
||||
//! Two-argument JSON read function (with path query), i.e. json_type('[1, 2, 3]', '$[0]')
|
||||
template <class T, bool SET_NULL_IF_NOT_FOUND = true>
|
||||
static void BinaryExecute(DataChunk &args, ExpressionState &state, Vector &result, const json_function_t<T> fun) {
|
||||
auto &func_expr = state.expr.Cast<BoundFunctionExpression>();
|
||||
const auto &info = func_expr.bind_info->Cast<JSONReadFunctionData>();
|
||||
auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
|
||||
auto alc = lstate.json_allocator->GetYYAlc();
|
||||
|
||||
auto &inputs = args.data[0];
|
||||
if (info.constant) { // Constant path
|
||||
const char *ptr = info.ptr;
|
||||
const idx_t &len = info.len;
|
||||
if (info.path_type == JSONCommon::JSONPathType::REGULAR) {
|
||||
UnaryExecutor::ExecuteWithNulls<string_t, T>(
|
||||
inputs, result, args.size(), [&](string_t input, ValidityMask &mask, idx_t idx) {
|
||||
auto doc = JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc);
|
||||
auto val = JSONCommon::GetUnsafe(doc->root, ptr, len);
|
||||
if (SET_NULL_IF_NOT_FOUND && !val) {
|
||||
mask.SetInvalid(idx);
|
||||
return T {};
|
||||
} else {
|
||||
return fun(val, alc, result, mask, idx);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
D_ASSERT(info.path_type == JSONCommon::JSONPathType::WILDCARD);
|
||||
vector<yyjson_val *> vals;
|
||||
UnaryExecutor::Execute<string_t, list_entry_t>(inputs, result, args.size(), [&](string_t input) {
|
||||
vals.clear();
|
||||
|
||||
auto doc = JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc);
|
||||
JSONCommon::GetWildcardPath(doc->root, ptr, len, vals);
|
||||
|
||||
auto current_size = ListVector::GetListSize(result);
|
||||
auto new_size = current_size + vals.size();
|
||||
if (ListVector::GetListCapacity(result) < new_size) {
|
||||
ListVector::Reserve(result, new_size);
|
||||
}
|
||||
|
||||
auto &child_entry = ListVector::GetEntry(result);
|
||||
auto child_vals = FlatVector::GetData<T>(child_entry);
|
||||
auto &child_validity = FlatVector::Validity(child_entry);
|
||||
for (idx_t i = 0; i < vals.size(); i++) {
|
||||
auto &val = vals[i];
|
||||
D_ASSERT(val != nullptr); // Wildcard extract shouldn't give back nullptrs
|
||||
child_vals[current_size + i] = fun(val, alc, result, child_validity, current_size + i);
|
||||
}
|
||||
|
||||
ListVector::SetListSize(result, new_size);
|
||||
|
||||
return list_entry_t {current_size, vals.size()};
|
||||
});
|
||||
}
|
||||
} else { // Columnref path
|
||||
D_ASSERT(info.path_type == JSONCommon::JSONPathType::REGULAR);
|
||||
unique_ptr<Vector> casted_paths;
|
||||
if (args.data[1].GetType().id() == LogicalTypeId::VARCHAR) {
|
||||
casted_paths = make_uniq<Vector>(args.data[1]);
|
||||
} else {
|
||||
casted_paths = make_uniq<Vector>(LogicalTypeId::VARCHAR);
|
||||
VectorOperations::DefaultCast(args.data[1], *casted_paths, args.size(), true);
|
||||
}
|
||||
BinaryExecutor::ExecuteWithNulls<string_t, string_t, T>(
|
||||
inputs, *casted_paths, result, args.size(),
|
||||
[&](string_t input, string_t path, ValidityMask &mask, idx_t idx) {
|
||||
auto doc = JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc);
|
||||
auto val = JSONCommon::Get(doc->root, path, args.data[1].GetType().IsIntegral());
|
||||
if (SET_NULL_IF_NOT_FOUND && !val) {
|
||||
mask.SetInvalid(idx);
|
||||
return T {};
|
||||
} else {
|
||||
return fun(val, alc, result, mask, idx);
|
||||
}
|
||||
});
|
||||
}
|
||||
if (args.AllConstant()) {
|
||||
result.SetVectorType(VectorType::CONSTANT_VECTOR);
|
||||
}
|
||||
|
||||
JSONAllocator::AddBuffer(result, alc);
|
||||
}
|
||||
|
||||
//! JSON read function with list of path queries, i.e. json_type('[1, 2, 3]', ['$[0]', '$[1]'])
|
||||
template <class T, bool SET_NULL_IF_NOT_FOUND = true>
|
||||
static void ExecuteMany(DataChunk &args, ExpressionState &state, Vector &result, const json_function_t<T> fun) {
|
||||
auto &func_expr = state.expr.Cast<BoundFunctionExpression>();
|
||||
const auto &info = func_expr.bind_info->Cast<JSONReadManyFunctionData>();
|
||||
auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
|
||||
auto alc = lstate.json_allocator->GetYYAlc();
|
||||
D_ASSERT(info.ptrs.size() == info.lens.size());
|
||||
|
||||
const auto count = args.size();
|
||||
const idx_t num_paths = info.ptrs.size();
|
||||
const idx_t list_size = count * num_paths;
|
||||
|
||||
UnifiedVectorFormat input_data;
|
||||
auto &input_vector = args.data[0];
|
||||
input_vector.ToUnifiedFormat(count, input_data);
|
||||
auto inputs = UnifiedVectorFormat::GetData<string_t>(input_data);
|
||||
|
||||
ListVector::Reserve(result, list_size);
|
||||
auto list_entries = FlatVector::GetData<list_entry_t>(result);
|
||||
auto &list_validity = FlatVector::Validity(result);
|
||||
|
||||
auto &child = ListVector::GetEntry(result);
|
||||
auto child_data = FlatVector::GetData<T>(child);
|
||||
auto &child_validity = FlatVector::Validity(child);
|
||||
|
||||
idx_t offset = 0;
|
||||
yyjson_val *val;
|
||||
for (idx_t i = 0; i < count; i++) {
|
||||
auto idx = input_data.sel->get_index(i);
|
||||
if (!input_data.validity.RowIsValid(idx)) {
|
||||
list_validity.SetInvalid(i);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto doc = JSONCommon::ReadDocument(inputs[idx], JSONCommon::READ_FLAG, alc);
|
||||
for (idx_t path_i = 0; path_i < num_paths; path_i++) {
|
||||
auto child_idx = offset + path_i;
|
||||
val = JSONCommon::GetUnsafe(doc->root, info.ptrs[path_i], info.lens[path_i]);
|
||||
if (SET_NULL_IF_NOT_FOUND && !val) {
|
||||
child_validity.SetInvalid(child_idx);
|
||||
} else {
|
||||
child_data[child_idx] = fun(val, alc, child, child_validity, child_idx);
|
||||
}
|
||||
}
|
||||
|
||||
list_entries[i].offset = offset;
|
||||
list_entries[i].length = num_paths;
|
||||
offset += num_paths;
|
||||
}
|
||||
ListVector::SetListSize(result, offset);
|
||||
|
||||
if (args.AllConstant()) {
|
||||
result.SetVectorType(VectorType::CONSTANT_VECTOR);
|
||||
}
|
||||
|
||||
JSONAllocator::AddBuffer(result, alc);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
22
external/duckdb/extension/json/include/json_extension.hpp
vendored
Normal file
22
external/duckdb/extension/json/include/json_extension.hpp
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// json_extension.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "duckdb.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
class JsonExtension : public Extension {
|
||||
public:
|
||||
void Load(ExtensionLoader &db) override;
|
||||
std::string Name() override;
|
||||
std::string Version() const override;
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
138
external/duckdb/extension/json/include/json_functions.hpp
vendored
Normal file
138
external/duckdb/extension/json/include/json_functions.hpp
vendored
Normal file
@@ -0,0 +1,138 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// json_functions.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "duckdb/main/extension/extension_loader.hpp"
|
||||
#include "json_common.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
class TableRef;
|
||||
struct ReplacementScanData;
|
||||
class CastFunctionSet;
|
||||
struct CastParameters;
|
||||
struct CastLocalStateParameters;
|
||||
struct JSONScanInfo;
|
||||
class BuiltinFunctions;
|
||||
|
||||
// Scalar function stuff
|
||||
//! Bind data for JSON scalar functions that take a single JSON path argument.
//! Stores the path string and whether it was a constant at bind time.
struct JSONReadFunctionData : public FunctionData {
public:
	JSONReadFunctionData(bool constant, string path_p, idx_t len, JSONCommon::JSONPathType path_type);
	unique_ptr<FunctionData> Copy() const override;
	bool Equals(const FunctionData &other_p) const override;
	//! Validates the path value, extracts the path string and its length, and
	//! returns the detected path type.
	static JSONCommon::JSONPathType CheckPath(const Value &path_val, string &path, idx_t &len);
	static unique_ptr<FunctionData> Bind(ClientContext &context, ScalarFunction &bound_function,
	                                     vector<unique_ptr<Expression>> &arguments);

public:
	//! Whether the path argument was a constant (allows parsing the path once at bind time)
	const bool constant;
	//! The JSON path string
	const string path;
	//! The kind of path (as detected by CheckPath)
	const JSONCommon::JSONPathType path_type;
	//! Raw pointer/length view of the path — presumably points into `path`; verify at the ctor
	const char *ptr;
	const idx_t len;
};
|
||||
|
||||
//! Bind data for JSON scalar functions that take a list of JSON paths,
//! producing one result per path.
struct JSONReadManyFunctionData : public FunctionData {
public:
	JSONReadManyFunctionData(vector<string> paths_p, vector<idx_t> lens_p);
	unique_ptr<FunctionData> Copy() const override;
	bool Equals(const FunctionData &other_p) const override;
	static unique_ptr<FunctionData> Bind(ClientContext &context, ScalarFunction &bound_function,
	                                     vector<unique_ptr<Expression>> &arguments);

public:
	//! The JSON path strings
	const vector<string> paths;
	//! Raw pointers paired with `lens` — presumably into `paths`; verify at the ctor
	vector<const char *> ptrs;
	const vector<idx_t> lens;
};
|
||||
|
||||
//! Per-execution-state local state for JSON functions: holds the JSON
//! allocator used for parsing, which can be reset between invocations.
struct JSONFunctionLocalState : public FunctionLocalState {
public:
	explicit JSONFunctionLocalState(Allocator &allocator);
	explicit JSONFunctionLocalState(ClientContext &context);
	//! Initializer for scalar function execution
	static unique_ptr<FunctionLocalState> Init(ExpressionState &state, const BoundFunctionExpression &expr,
	                                           FunctionData *bind_data);
	//! Initializer for cast execution
	static unique_ptr<FunctionLocalState> InitCastLocalState(CastLocalStateParameters &parameters);
	//! Fetches the local state from `state` (resetting it for reuse)
	static JSONFunctionLocalState &ResetAndGet(ExpressionState &state);

public:
	//! Allocator backing the yyjson documents parsed by this state
	shared_ptr<JSONAllocator> json_allocator;
};
|
||||
|
||||
//! Central registry of the JSON extension: enumerates every scalar, pragma,
//! table, copy, and cast function that the extension provides.
class JSONFunctions {
public:
	static vector<ScalarFunctionSet> GetScalarFunctions();
	static vector<PragmaFunctionSet> GetPragmaFunctions();
	static vector<TableFunctionSet> GetTableFunctions();
	//! Replacement scan callback so JSON file paths can be queried directly
	static unique_ptr<TableRef> ReadJSONReplacement(ClientContext &context, ReplacementScanInput &input,
	                                                optional_ptr<ReplacementScanData> data);
	static TableFunction GetReadJSONTableFunction(shared_ptr<JSONScanInfo> function_info);
	static CopyFunction GetJSONCopyFunction();
	//! Cast registration, split by the complexity of the cast
	static void RegisterSimpleCastFunctions(ExtensionLoader &loader);
	static void RegisterJSONCreateCastFunctions(ExtensionLoader &loader);
	static void RegisterJSONTransformCastFunctions(ExtensionLoader &loader);

private:
	// Scalar functions
	static ScalarFunctionSet GetExtractFunction();
	static ScalarFunctionSet GetExtractStringFunction();

	static ScalarFunctionSet GetArrayFunction();
	static ScalarFunctionSet GetObjectFunction();
	static ScalarFunctionSet GetToJSONFunction();
	static ScalarFunctionSet GetArrayToJSONFunction();
	static ScalarFunctionSet GetRowToJSONFunction();
	static ScalarFunctionSet GetMergePatchFunction();

	static ScalarFunctionSet GetStructureFunction();
	static ScalarFunctionSet GetTransformFunction();
	static ScalarFunctionSet GetTransformStrictFunction();

	static ScalarFunctionSet GetArrayLengthFunction();
	static ScalarFunctionSet GetContainsFunction();
	static ScalarFunctionSet GetExistsFunction();
	static ScalarFunctionSet GetKeysFunction();
	static ScalarFunctionSet GetTypeFunction();
	static ScalarFunctionSet GetValidFunction();
	static ScalarFunctionSet GetValueFunction();
	static ScalarFunctionSet GetSerializeSqlFunction();
	static ScalarFunctionSet GetDeserializeSqlFunction();
	static ScalarFunctionSet GetSerializePlanFunction();

	static ScalarFunctionSet GetPrettyPrintFunction();

	static PragmaFunctionSet GetExecuteJsonSerializedSqlPragmaFunction();

	//! Registers a copy of `fun` under each name in `names`
	template <class FUNCTION_INFO>
	static void AddAliases(const vector<string> &names, FUNCTION_INFO fun, vector<FUNCTION_INFO> &functions) {
		for (auto &name : names) {
			fun.name = name;
			functions.push_back(fun);
		}
	}

private:
	// Table functions
	static TableFunctionSet GetReadJSONObjectsFunction();
	static TableFunctionSet GetReadNDJSONObjectsFunction();
	static TableFunctionSet GetReadJSONObjectsAutoFunction();

	static TableFunctionSet GetReadJSONFunction();
	static TableFunctionSet GetReadNDJSONFunction();
	static TableFunctionSet GetReadJSONAutoFunction();
	static TableFunctionSet GetReadNDJSONAutoFunction();

	static TableFunctionSet GetJSONEachFunction();
	static TableFunctionSet GetJSONTreeFunction();

	static TableFunctionSet GetExecuteJsonSerializedSqlFunction();
};
|
||||
|
||||
} // namespace duckdb
|
||||
54
external/duckdb/extension/json/include/json_multi_file_info.hpp
vendored
Normal file
54
external/duckdb/extension/json/include/json_multi_file_info.hpp
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// json_multi_file_info.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "duckdb/common/multi_file/multi_file_function.hpp"
|
||||
#include "json_reader_options.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
//! Thin adapter exposing JSONReaderOptions through the generic
//! BaseFileReaderOptions interface used by the multi-file reader machinery.
class JSONFileReaderOptions : public BaseFileReaderOptions {
public:
	JSONReaderOptions options;
};
|
||||
|
||||
//! JSON implementation of DuckDB's MultiFileReaderInterface: wires JSON file
//! reading (options parsing, bind, global/local state, reader creation) into
//! the generic multi-file scan framework.
struct JSONMultiFileInfo : MultiFileReaderInterface {
	static unique_ptr<MultiFileReaderInterface> CreateInterface(ClientContext &context);

	unique_ptr<BaseFileReaderOptions> InitializeOptions(ClientContext &context,
	                                                    optional_ptr<TableFunctionInfo> info) override;
	//! Option parsing for COPY ... (FORMAT JSON); returns whether `key` was recognized
	bool ParseCopyOption(ClientContext &context, const string &key, const vector<Value> &values,
	                     BaseFileReaderOptions &options, vector<string> &expected_names,
	                     vector<LogicalType> &expected_types) override;
	//! Option parsing for read_json-style table functions; returns whether `key` was recognized
	bool ParseOption(ClientContext &context, const string &key, const Value &val, MultiFileOptions &file_options,
	                 BaseFileReaderOptions &options) override;
	void FinalizeCopyBind(ClientContext &context, BaseFileReaderOptions &options, const vector<string> &expected_names,
	                      const vector<LogicalType> &expected_types) override;
	unique_ptr<TableFunctionData> InitializeBindData(MultiFileBindData &multi_file_data,
	                                                 unique_ptr<BaseFileReaderOptions> options) override;
	//! Determines the result schema (return_types/names) for the scan
	void BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
	                MultiFileBindData &bind_data) override;
	optional_idx MaxThreads(const MultiFileBindData &bind_data, const MultiFileGlobalState &global_state,
	                        FileExpandResult expand_result) override;
	unique_ptr<GlobalTableFunctionState> InitializeGlobalState(ClientContext &context, MultiFileBindData &bind_data,
	                                                           MultiFileGlobalState &global_state) override;
	unique_ptr<LocalTableFunctionState> InitializeLocalState(ExecutionContext &context,
	                                                         GlobalTableFunctionState &global_state) override;
	//! Reader creation from union-by-name data or from a plain file
	shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
	                                        BaseUnionData &union_data, const MultiFileBindData &bind_data_p) override;
	shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
	                                        const OpenFileInfo &file, idx_t file_idx,
	                                        const MultiFileBindData &bind_data) override;
	void FinishReading(ClientContext &context, GlobalTableFunctionState &global_state,
	                   LocalTableFunctionState &local_state) override;
	unique_ptr<NodeStatistics> GetCardinality(const MultiFileBindData &bind_data, idx_t file_count) override;
	FileGlobInput GetGlobInput() override;
};
|
||||
|
||||
} // namespace duckdb
|
||||
302
external/duckdb/extension/json/include/json_reader.hpp
vendored
Normal file
302
external/duckdb/extension/json/include/json_reader.hpp
vendored
Normal file
@@ -0,0 +1,302 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// json_reader.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "duckdb/common/atomic.hpp"
|
||||
#include "duckdb/common/enum_util.hpp"
|
||||
#include "duckdb/common/enums/file_compression_type.hpp"
|
||||
#include "duckdb/common/file_system.hpp"
|
||||
#include "duckdb/common/multi_file/base_file_reader.hpp"
|
||||
#include "duckdb/common/multi_file/multi_file_reader.hpp"
|
||||
#include "json_reader_options.hpp"
|
||||
#include "duckdb/common/mutex.hpp"
|
||||
#include "json_common.hpp"
|
||||
#include "json_enums.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
struct JSONScanGlobalState;
|
||||
class JSONReader;
|
||||
|
||||
//! A reference-counted handle to one read buffer of a JSON file. Multiple
//! scanning threads can hold the same buffer; `readers` tracks how many.
struct JSONBufferHandle {
public:
	JSONBufferHandle(JSONReader &reader, idx_t buffer_index, idx_t readers, AllocatedData &&buffer, idx_t buffer_size,
	                 idx_t buffer_start);

public:
	//! The reader this buffer comes from
	JSONReader &reader;
	//! Buffer index (within same file)
	const idx_t buffer_index;

	//! Number of readers for this buffer
	atomic<idx_t> readers;
	//! The buffer
	AllocatedData buffer;
	//! The size of the data in the buffer (can be less than buffer.GetSize())
	const idx_t buffer_size;
	//! The start position in the buffer
	idx_t buffer_start;
};
|
||||
|
||||
//! Owns the underlying FileHandle for a JSON file and coordinates reads from
//! multiple threads: tracks read position and outstanding read requests
//! (atomically), and caches buffers so non-seekable streams can be reset.
struct JSONFileHandle {
public:
	JSONFileHandle(QueryContext context, unique_ptr<FileHandle> file_handle, Allocator &allocator);

	bool IsOpen() const;
	void Close();

	//! Resets the read state (see cached_buffers below for stream re-reads)
	void Reset();
	bool RequestedReadsComplete();
	bool LastReadRequested() const;

	idx_t FileSize() const;
	//! Bytes not yet read
	idx_t Remaining() const;

	bool CanSeek() const;
	bool IsPipe() const;

	FileHandle &GetHandle();

	//! The next two functions return whether the read was successful
	bool GetPositionAndSize(idx_t &position, idx_t &size, idx_t requested_size);
	bool Read(char *pointer, idx_t &read_size, idx_t requested_size);
	//! Read at position optionally allows passing a custom handle to read from, otherwise the default one is used
	void ReadAtPosition(char *pointer, idx_t size, idx_t position, optional_ptr<FileHandle> override_handle = nullptr);

private:
	idx_t ReadInternal(char *pointer, const idx_t requested_size);
	idx_t ReadFromCache(char *&pointer, idx_t &size, atomic<idx_t> &position);

private:
	QueryContext context;

	//! The JSON file handle
	unique_ptr<FileHandle> file_handle;
	Allocator &allocator;

	//! File properties (fixed at construction)
	const bool can_seek;
	const idx_t file_size;

	//! Read properties (atomic: updated by concurrent scanning threads)
	atomic<idx_t> read_position;
	atomic<idx_t> requested_reads;
	atomic<idx_t> actual_reads;
	atomic<bool> last_read_requested;

	//! Cached buffers for resetting when reading stream
	vector<AllocatedData> cached_buffers;
	idx_t cached_size;
};
|
||||
|
||||
//! A non-owning (pointer, size) view of a JSON value inside a read buffer.
//! NOTE(review): the default constructor leaves `pointer`/`size` uninitialized —
//! presumably deliberate to keep the units[STANDARD_VECTOR_SIZE] array cheap;
//! callers must not read a default-constructed JSONString.
struct JSONString {
public:
	JSONString() {
	}
	JSONString(const char *pointer_p, idx_t size_p) : pointer(pointer_p), size(size_p) {
	}

	const char *pointer;
	idx_t size;

public:
	//! Materializes the view into an owning string (copies the bytes)
	string ToString() {
		return string(pointer, size);
	}

	//! Unchecked byte access into the viewed data
	const char &operator[](size_t i) const {
		return pointer[i];
	}
};
|
||||
|
||||
//! How a thread consumes a file: read it entirely itself, or share buffers of
//! the file with other threads (partial scan).
enum class JSONFileReadType { SCAN_ENTIRE_FILE, SCAN_PARTIAL };
|
||||
|
||||
struct JSONReaderScanState {
|
||||
explicit JSONReaderScanState(ClientContext &context, Allocator &global_allocator,
|
||||
idx_t reconstruct_buffer_capacity);
|
||||
|
||||
FileSystem &fs;
|
||||
Allocator &global_allocator;
|
||||
//! Thread-local allocator
|
||||
JSONAllocator allocator;
|
||||
idx_t buffer_capacity;
|
||||
bool initialized = false;
|
||||
// if we have a buffer already - this is our buffer index
|
||||
optional_idx buffer_index;
|
||||
//! Whether or not we are scanning the entire file
|
||||
//! If we are scanning the entire file we don't share reads between threads and just read the file until we are done
|
||||
JSONFileReadType file_read_type = JSONFileReadType::SCAN_PARTIAL;
|
||||
// Data for reading (if we have postponed reading)
|
||||
//! Buffer (if we have one)
|
||||
AllocatedData read_buffer;
|
||||
bool needs_to_read = false;
|
||||
idx_t request_size;
|
||||
idx_t read_position;
|
||||
idx_t read_size;
|
||||
//! Current scan data
|
||||
idx_t scan_count = 0;
|
||||
JSONString units[STANDARD_VECTOR_SIZE];
|
||||
yyjson_val *values[STANDARD_VECTOR_SIZE];
|
||||
optional_ptr<JSONBufferHandle> current_buffer_handle;
|
||||
//! Current buffer read info
|
||||
optional_ptr<JSONReader> current_reader;
|
||||
char *buffer_ptr = nullptr;
|
||||
idx_t buffer_size = 0;
|
||||
idx_t buffer_offset = 0;
|
||||
idx_t prev_buffer_remainder = 0;
|
||||
idx_t prev_buffer_offset = 0;
|
||||
idx_t lines_or_objects_in_buffer = 0;
|
||||
//! Whether this is the first time scanning this buffer
|
||||
bool is_first_scan = false;
|
||||
//! Whether this is the last batch of the file
|
||||
bool is_last = false;
|
||||
//! Buffer to reconstruct split values
|
||||
optional_idx batch_index;
|
||||
|
||||
//! For some filesystems (e.g. S3), using a filehandle per thread increases performance
|
||||
unique_ptr<FileHandle> thread_local_filehandle;
|
||||
|
||||
public:
|
||||
//! Reset for parsing the next batch of JSON from the current buffer
|
||||
void ResetForNextParse();
|
||||
//! Reset state for reading the next buffer
|
||||
void ResetForNextBuffer();
|
||||
//! Clear the buffer handle (if any)
|
||||
void ClearBufferHandle();
|
||||
};
|
||||
|
||||
//! A recorded error, located by buffer index and the line/object number
//! within that buffer (so the absolute line number can be computed later).
struct JSONError {
	idx_t buf_index;
	idx_t line_or_object_in_buf;
	string error_msg;
};
|
||||
|
||||
//! Reads a single JSON file for the multi-file scan framework: manages the
//! file handle, hands out buffers to scanning threads, parses JSON values,
//! and collects per-buffer line counts so errors can report line numbers.
class JSONReader : public BaseFileReader {
public:
	JSONReader(ClientContext &context, JSONReaderOptions options, OpenFileInfo file);

	void OpenJSONFile();
	void CloseHandle();
	void Reset();

	bool HasFileHandle() const;
	bool IsOpen() const;
	bool IsInitialized() const {
		return initialized;
	}

	JSONReaderOptions &GetOptions();

	//! Format (e.g. newline-delimited vs array) and record type accessors
	JSONFormat GetFormat() const;
	void SetFormat(JSONFormat format);

	JSONRecordType GetRecordType() const;
	void SetRecordType(JSONRecordType type);

	const string &GetFileName() const;
	JSONFileHandle &GetFileHandle() const;

public:
	string GetReaderType() const override {
		return "JSON";
	}

	void PrepareReader(ClientContext &context, GlobalTableFunctionState &) override;
	bool TryInitializeScan(ClientContext &context, GlobalTableFunctionState &gstate,
	                       LocalTableFunctionState &lstate) override;
	void Scan(ClientContext &context, GlobalTableFunctionState &global_state, LocalTableFunctionState &local_state,
	          DataChunk &chunk) override;
	void FinishFile(ClientContext &context, GlobalTableFunctionState &gstate_p) override;
	double GetProgressInFile(ClientContext &context) override;

public:
	//! Get a new buffer index (must hold the lock)
	idx_t GetBufferIndex();
	//! Set line count for a buffer that is done (grabs the lock)
	void SetBufferLineOrObjectCount(JSONBufferHandle &handle, idx_t count);
	//! Records a parse error in the specified buffer
	void AddParseError(JSONReaderScanState &scan_state, idx_t line_or_object_in_buf, yyjson_read_err &err,
	                   const string &extra = "");
	//! Records a transform error in the specified buffer
	void AddTransformError(JSONReaderScanState &scan_state, idx_t object_index, const string &error_message);
	//! Whether this reader has thrown if an error has occurred
	bool HasThrown();

	void Initialize(Allocator &allocator, idx_t buffer_size);
	bool InitializeScan(JSONReaderScanState &state, JSONFileReadType file_read_type);
	void ParseJSON(JSONReaderScanState &scan_state, char *const json_start, const idx_t json_size,
	               const idx_t remaining);
	void ParseNextChunk(JSONReaderScanState &scan_state);
	idx_t Scan(JSONReaderScanState &scan_state);
	bool ReadNextBuffer(JSONReaderScanState &scan_state);
	bool PrepareBufferForRead(JSONReaderScanState &scan_state);

	//! Scan progress
	double GetProgress() const;

	void DecrementBufferUsage(JSONBufferHandle &handle, idx_t lines_or_object_in_buffer, AllocatedData &buffer);

private:
	void SkipOverArrayStart(JSONReaderScanState &scan_state);
	void AutoDetect(Allocator &allocator, idx_t buffer_size);
	bool CopyRemainderFromPreviousBuffer(JSONReaderScanState &scan_state);
	void FinalizeBufferInternal(JSONReaderScanState &scan_state, AllocatedData &buffer, idx_t buffer_index);
	void PrepareForReadInternal(JSONReaderScanState &scan_state);
	void PrepareForScan(JSONReaderScanState &scan_state);
	bool PrepareBufferSeek(JSONReaderScanState &scan_state);
	void ReadNextBufferSeek(JSONReaderScanState &scan_state);
	bool ReadNextBufferNoSeek(JSONReaderScanState &scan_state);
	void FinalizeBuffer(JSONReaderScanState &scan_state);

	//! Insert/get/remove buffer (grabs the lock)
	void InsertBuffer(idx_t buffer_idx, unique_ptr<JSONBufferHandle> &&buffer);
	optional_ptr<JSONBufferHandle> GetBuffer(idx_t buffer_idx);
	AllocatedData RemoveBuffer(JSONBufferHandle &handle);

	void ThrowObjectSizeError(const idx_t object_size);

private:
	//! Add an error to the buffer - requires the lock to be held
	void AddError(idx_t buf_index, idx_t line_or_object_in_buf, const string &error_msg);
	//! Throw errors if possible - requires the lock to be held
	void ThrowErrorsIfPossible();
	//! Try to get the line number - requires the lock to be held
	optional_idx TryGetLineNumber(idx_t buf_index, idx_t line_or_object_in_buf);

private:
	ClientContext &context;
	JSONReaderOptions options;

	//! File handle
	unique_ptr<JSONFileHandle> file_handle;

	//! Whether or not the reader has been initialized
	bool initialized;
	//! Next buffer index within the file
	idx_t next_buffer_index;
	//! Mapping from batch index to currently held buffers
	unordered_map<idx_t, unique_ptr<JSONBufferHandle>> buffer_map;

	//! Line count per buffer
	vector<int64_t> buffer_line_or_object_counts;
	//! Whether any of the reading threads has thrown an error
	bool thrown;

	//! If we have auto-detected, this is the buffer read by the auto-detection
	AllocatedData auto_detect_data;
	idx_t auto_detect_data_size = 0;

	//! The first error we found in the file (if any)
	unique_ptr<JSONError> error;

public:
	//! Guards the mutable state above (buffer map, counts, errors)
	mutable mutex lock;
};
|
||||
|
||||
} // namespace duckdb
|
||||
129
external/duckdb/extension/json/include/json_reader_options.hpp
vendored
Normal file
129
external/duckdb/extension/json/include/json_reader_options.hpp
vendored
Normal file
@@ -0,0 +1,129 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// json_reader_options.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "json_common.hpp"
|
||||
#include "json_enums.hpp"
|
||||
#include "duckdb/common/types/type_map.hpp"
|
||||
#include "duckdb/function/scalar/strftime_format.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
struct DateFormatMap {
|
||||
friend class MutableDateFormatMap;
|
||||
|
||||
public:
|
||||
explicit DateFormatMap(type_id_map_t<vector<StrpTimeFormat>> candidate_formats_p)
|
||||
: candidate_formats(std::move(candidate_formats_p)) {
|
||||
}
|
||||
|
||||
bool HasFormats(LogicalTypeId type) const {
|
||||
return HasFormats(candidate_formats, type);
|
||||
}
|
||||
|
||||
const StrpTimeFormat &GetFormat(LogicalTypeId type) const {
|
||||
D_ASSERT(candidate_formats.find(type) != candidate_formats.end());
|
||||
return candidate_formats.find(type)->second.back();
|
||||
}
|
||||
|
||||
public:
|
||||
static void AddFormat(type_id_map_t<vector<StrpTimeFormat>> &candidate_formats, LogicalTypeId type,
|
||||
const string &format_string) {
|
||||
auto &formats = candidate_formats[type];
|
||||
formats.emplace_back();
|
||||
formats.back().format_specifier = format_string;
|
||||
StrpTimeFormat::ParseFormatSpecifier(formats.back().format_specifier, formats.back());
|
||||
}
|
||||
|
||||
static bool HasFormats(const type_id_map_t<vector<StrpTimeFormat>> &candidate_formats, LogicalTypeId type) {
|
||||
return candidate_formats.find(type) != candidate_formats.end();
|
||||
}
|
||||
|
||||
private:
|
||||
type_id_map_t<vector<StrpTimeFormat>> candidate_formats;
|
||||
};
|
||||
|
||||
//! Thread-safe mutating view over a DateFormatMap: every accessor takes
//! `format_lock`, so candidate formats can be pruned concurrently during
//! schema auto-detection.
class MutableDateFormatMap {
public:
	explicit MutableDateFormatMap(DateFormatMap &date_format_map) : date_format_map(date_format_map) {
	}

	//! Whether any candidate formats remain for the given type
	bool HasFormats(LogicalTypeId type) {
		lock_guard<mutex> lock(format_lock);
		return date_format_map.HasFormats(type);
	}

	//! Number of remaining candidate formats for the given type (must exist)
	idx_t NumberOfFormats(LogicalTypeId type) {
		lock_guard<mutex> lock(format_lock);
		return date_format_map.candidate_formats.at(type).size();
	}

	//! Copies the format at `index` into `format`; returns false if out of range
	bool GetFormatAtIndex(LogicalTypeId type, idx_t index, StrpTimeFormat &format) {
		lock_guard<mutex> lock(format_lock);
		auto &formats = date_format_map.candidate_formats.at(type);
		if (index >= formats.size()) {
			return false;
		}
		format = formats[index];
		return true;
	}

	//! Drops candidate formats from the back until at most `size` remain
	void ShrinkFormatsToSize(LogicalTypeId type, idx_t size) {
		lock_guard<mutex> lock(format_lock);
		auto &formats = date_format_map.candidate_formats[type];
		while (formats.size() > size) {
			formats.pop_back();
		}
	}

private:
	//! Guards all access to the wrapped map
	mutex format_lock;
	DateFormatMap &date_format_map;
};
|
||||
|
||||
//! All user-configurable options for reading JSON files (read_json and
//! friends), including schema auto-detection tuning knobs.
struct JSONReaderOptions {
	//! Scan type
	JSONScanType type = JSONScanType::READ_JSON;
	//! The format of the JSON
	JSONFormat format = JSONFormat::AUTO_DETECT;
	//! Whether record types in the JSON
	JSONRecordType record_type = JSONRecordType::AUTO_DETECT;
	//! Whether file is compressed or not, and if so which compression type
	FileCompressionType compression = FileCompressionType::AUTO_DETECT;
	//! Whether or not we should ignore malformed JSON (default to NULL)
	bool ignore_errors = false;
	//! Maximum JSON object size (defaults to 16MB minimum)
	idx_t maximum_object_size = 16777216;
	//! Whether we auto-detect a schema
	bool auto_detect = false;
	//! Sample size for detecting schema
	idx_t sample_size = idx_t(STANDARD_VECTOR_SIZE) * 10;
	//! Max depth we go to detect nested JSON schema (defaults to unlimited)
	idx_t max_depth = NumericLimits<idx_t>::Maximum();
	//! We divide the number of appearances of each JSON field by the auto-detection sample size
	//! If the average over the fields of an object is less than this threshold,
	//! we default to the MAP type with value type of merged field types
	double field_appearance_threshold = 0.1;
	//! The maximum number of files we sample to sample sample_size rows
	idx_t maximum_sample_files = 32;
	//! Whether we auto-detect and convert JSON strings to integers
	bool convert_strings_to_integers = false;
	//! If a struct contains more fields than this threshold with at least 80% similar types,
	//! we infer it as MAP type
	idx_t map_inference_threshold = 200;
	//! User-provided list of names (in order)
	vector<string> name_list;
	//! User-provided list of types (in order)
	vector<LogicalType> sql_type_list;
	//! Forced date/timestamp formats
	string date_format;
	string timestamp_format;
};
|
||||
|
||||
} // namespace duckdb
|
||||
148
external/duckdb/extension/json/include/json_scan.hpp
vendored
Normal file
148
external/duckdb/extension/json/include/json_scan.hpp
vendored
Normal file
@@ -0,0 +1,148 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// json_scan.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "json_reader.hpp"
|
||||
#include "duckdb/common/multi_file/multi_file_reader.hpp"
|
||||
#include "duckdb/common/mutex.hpp"
|
||||
#include "duckdb/common/pair.hpp"
|
||||
#include "duckdb/common/types/type_map.hpp"
|
||||
#include "duckdb/function/scalar/strftime_format.hpp"
|
||||
#include "duckdb/function/table_function.hpp"
|
||||
#include "json_enums.hpp"
|
||||
#include "json_transform.hpp"
|
||||
#include "json_reader_options.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
//! Bind data for the JSON table functions: the reader options plus everything
//! derived at bind time (keys to extract, date formats, transform options).
struct JSONScanData : public TableFunctionData {
public:
	JSONScanData();

	//! Builds the date/timestamp format candidates from the options
	void InitializeFormats();
	void InitializeFormats(bool auto_detect);

public:
	//! JSON reader options
	JSONReaderOptions options;

	//! The set of keys to extract (case sensitive)
	vector<string> key_names;

	//! The date format map
	unique_ptr<DateFormatMap> date_format_map;
	//! Options when transforming the JSON to columnar data
	JSONTransformOptions transform_options;

	//! Optional caps on parallelism and per-file cardinality estimate
	optional_idx max_threads;
	optional_idx estimated_cardinality_per_file;
};
|
||||
|
||||
//! Static configuration attached to each registered JSON table function
//! variant (read_json, read_ndjson, *_auto, ...), fixing its scan type,
//! format, record type, and whether it auto-detects the schema.
struct JSONScanInfo : public TableFunctionInfo {
public:
	explicit JSONScanInfo(JSONScanType type_p = JSONScanType::INVALID, JSONFormat format_p = JSONFormat::AUTO_DETECT,
	                      JSONRecordType record_type_p = JSONRecordType::AUTO_DETECT, bool auto_detect_p = false)
	    : type(type_p), format(format_p), record_type(record_type_p), auto_detect(auto_detect_p) {
	}

	JSONScanType type;
	JSONFormat format;
	JSONRecordType record_type;
	bool auto_detect;
};
|
||||
|
||||
//! Global (shared-across-threads) state of a JSON scan: bound data, the
//! projected columns, the shared allocator, and parallelism bookkeeping.
struct JSONScanGlobalState {
public:
	JSONScanGlobalState(ClientContext &context, const MultiFileBindData &bind_data);

public:
	//! Bound data
	const MultiFileBindData &bind_data;
	const JSONScanData &json_data;
	//! Options when transforming the JSON to columnar data
	JSONTransformOptions transform_options;

	//! Column names that we're actually reading (after projection pushdown)
	vector<string> names;
	vector<column_t> column_ids;
	vector<ColumnIndex> column_indices;

	//! Buffer manager allocator
	Allocator &allocator;
	//! The current buffer capacity
	idx_t buffer_capacity;

	//! Current number of threads active
	idx_t system_threads;
	//! Whether we enable parallel scans (only if less files than threads)
	bool enable_parallel_scans;

	//! Scan progress flags
	bool file_is_assigned = false;
	bool initialized = false;
};
|
||||
|
||||
//! Per-thread state of a JSON scan; wraps a JSONReaderScanState and the
//! thread-local transform options.
struct JSONScanLocalState {
public:
	JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate);

public:
	//! Reads the next batch; returns the number of values read
	idx_t Read();
	//! Records a transform error for the value at `object_index` in the current batch
	void AddTransformError(idx_t object_index, const string &error_message);

	JSONReaderScanState &GetScanState() {
		return scan_state;
	}

	const JSONReaderScanState &GetScanState() const {
		return scan_state;
	}

	//! Tries to claim work from `reader`; returns whether a scan was initialized
	bool TryInitializeScan(JSONScanGlobalState &gstate, JSONReader &reader);

public:
	//! Options when transforming the JSON to columnar data
	JSONTransformOptions transform_options;

private:
	void ParseJSON(char *const json_start, const idx_t json_size, const idx_t remaining);

private:
	//! Scan state
	JSONReaderScanState scan_state;
};
|
||||
|
||||
//! Adapter exposing JSONScanGlobalState through DuckDB's generic
//! GlobalTableFunctionState interface.
struct JSONGlobalTableFunctionState : public GlobalTableFunctionState {
public:
	JSONGlobalTableFunctionState(ClientContext &context, const MultiFileBindData &bind_data);

public:
	JSONScanGlobalState state;
};
|
||||
|
||||
//! Adapter exposing JSONScanLocalState through DuckDB's generic
//! LocalTableFunctionState interface.
struct JSONLocalTableFunctionState : public LocalTableFunctionState {
public:
	JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate);

public:
	JSONScanLocalState state;
};
|
||||
|
||||
//! Static helpers shared by the JSON table functions: schema auto-detection,
//! bind-data (de)serialization for plan caching, and common function defaults.
struct JSONScan {
public:
	//! Runs schema auto-detection, filling in `return_types`/`names`
	static void AutoDetect(ClientContext &context, MultiFileBindData &bind_data, vector<LogicalType> &return_types,
	                       vector<string> &names);

	//! (De)serialization of the bind data (e.g. for prepared statements/plans)
	static void Serialize(Serializer &serializer, const optional_ptr<FunctionData> bind_data,
	                      const TableFunction &function);
	static unique_ptr<FunctionData> Deserialize(Deserializer &deserializer, TableFunction &function);

	//! Applies the option defaults common to every JSON table function
	static void TableFunctionDefaults(TableFunction &table_function);
};
|
||||
|
||||
} // namespace duckdb
|
||||
97
external/duckdb/extension/json/include/json_serializer.hpp
vendored
Normal file
97
external/duckdb/extension/json/include/json_serializer.hpp
vendored
Normal file
@@ -0,0 +1,97 @@
|
||||
#pragma once
|
||||
|
||||
#include "json_common.hpp"
|
||||
#include "duckdb/common/serializer/serializer.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
struct JsonSerializer : Serializer {
|
||||
private:
|
||||
yyjson_mut_doc *doc;
|
||||
yyjson_mut_val *current_tag;
|
||||
vector<yyjson_mut_val *> stack;
|
||||
|
||||
// Skip writing property if null
|
||||
bool skip_if_null = false;
|
||||
// Skip writing property if empty string, empty list or empty map.
|
||||
bool skip_if_empty = false;
|
||||
|
||||
// Get the current json value
|
||||
inline yyjson_mut_val *Current() {
|
||||
return stack.back();
|
||||
};
|
||||
|
||||
// Either adds a value to the current object with the current tag, or appends it to the current array
|
||||
void PushValue(yyjson_mut_val *val);
|
||||
|
||||
public:
|
||||
explicit JsonSerializer(yyjson_mut_doc *doc, bool skip_if_null, bool skip_if_empty, bool skip_if_default)
|
||||
: doc(doc), stack({yyjson_mut_obj(doc)}), skip_if_null(skip_if_null), skip_if_empty(skip_if_empty) {
|
||||
options.serialize_enum_as_string = true;
|
||||
options.serialize_default_values = !skip_if_default;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static yyjson_mut_val *Serialize(T &value, yyjson_mut_doc *doc, bool skip_if_null, bool skip_if_empty,
|
||||
bool skip_if_default) {
|
||||
JsonSerializer serializer(doc, skip_if_null, skip_if_empty, skip_if_default);
|
||||
value.Serialize(serializer);
|
||||
return serializer.GetRootObject();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static string SerializeToString(T &value) {
|
||||
auto doc = yyjson_mut_doc_new(nullptr);
|
||||
JsonSerializer serializer(doc, false, false, false);
|
||||
value.Serialize(serializer);
|
||||
auto result_obj = serializer.GetRootObject();
|
||||
idx_t len = 0;
|
||||
auto data = yyjson_mut_val_write_opts(result_obj, JSONCommon::WRITE_PRETTY_FLAG, nullptr,
|
||||
reinterpret_cast<size_t *>(&len), nullptr);
|
||||
return string(data, len);
|
||||
}
|
||||
|
||||
yyjson_mut_val *GetRootObject() {
|
||||
D_ASSERT(stack.size() == 1); // or we forgot to pop somewhere
|
||||
return stack.front();
|
||||
};
|
||||
|
||||
//===--------------------------------------------------------------------===//
|
||||
// Nested Types Hooks
|
||||
//===--------------------------------------------------------------------===//
|
||||
void OnPropertyBegin(const field_id_t field_id, const char *tag) final;
|
||||
void OnPropertyEnd() final;
|
||||
void OnOptionalPropertyBegin(const field_id_t field_id, const char *tag, bool present) final;
|
||||
void OnOptionalPropertyEnd(bool present) final;
|
||||
|
||||
void OnListBegin(idx_t count) final;
|
||||
void OnListEnd() final;
|
||||
void OnObjectBegin() final;
|
||||
void OnObjectEnd() final;
|
||||
void OnNullableBegin(bool present) final;
|
||||
void OnNullableEnd() final;
|
||||
|
||||
//===--------------------------------------------------------------------===//
|
||||
// Primitive Types
|
||||
//===--------------------------------------------------------------------===//
|
||||
void WriteNull() final;
|
||||
void WriteValue(uint8_t value) final;
|
||||
void WriteValue(int8_t value) final;
|
||||
void WriteValue(uint16_t value) final;
|
||||
void WriteValue(int16_t value) final;
|
||||
void WriteValue(uint32_t value) final;
|
||||
void WriteValue(int32_t value) final;
|
||||
void WriteValue(uint64_t value) final;
|
||||
void WriteValue(int64_t value) final;
|
||||
void WriteValue(hugeint_t value) final;
|
||||
void WriteValue(uhugeint_t value) final;
|
||||
void WriteValue(float value) final;
|
||||
void WriteValue(double value) final;
|
||||
void WriteValue(const string_t value) final;
|
||||
void WriteValue(const string &value) final;
|
||||
void WriteValue(const char *value) final;
|
||||
void WriteValue(bool value) final;
|
||||
void WriteDataPtr(const_data_ptr_t ptr, idx_t count) final;
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
94
external/duckdb/extension/json/include/json_structure.hpp
vendored
Normal file
94
external/duckdb/extension/json/include/json_structure.hpp
vendored
Normal file
@@ -0,0 +1,94 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// json_structure.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "json_common.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
struct JSONStructureDescription;
|
||||
struct DateFormatMap;
|
||||
struct StrpTimeFormat;
|
||||
class MutableDateFormatMap;
|
||||
|
||||
//! A node in the structure tree inferred from JSON documents, used for schema auto-detection.
//! Tracks the candidate type descriptions observed at this position, plus occurrence counts.
struct JSONStructureNode {
public:
	JSONStructureNode();
	//! Construct a node for a keyed position from a raw key pointer/length
	JSONStructureNode(const char *key_ptr, const size_t key_len);
	//! Construct a node from a yyjson key/value pair
	JSONStructureNode(yyjson_val *key_p, yyjson_val *val_p, bool ignore_errors);

	//! Disable copy constructors
	JSONStructureNode(const JSONStructureNode &other) = delete;
	JSONStructureNode &operator=(const JSONStructureNode &) = delete;
	//! Enable move constructors
	JSONStructureNode(JSONStructureNode &&other) noexcept;
	JSONStructureNode &operator=(JSONStructureNode &&) noexcept;

	//! Get the description for the given type, creating it if not present yet
	JSONStructureDescription &GetOrCreateDescription(LogicalTypeId type);

	//! Whether this node (or, presumably, any descendant — confirm in json_structure.cpp)
	//! contains a VARCHAR description
	bool ContainsVarchar() const;
	//! Set up candidate types for auto-detection, up to max_depth levels deep
	void InitializeCandidateTypes(idx_t max_depth, bool convert_strings_to_integers, idx_t depth = 0);
	//! Narrow the candidate types by inspecting a batch of values
	void RefineCandidateTypes(yyjson_val *vals[], idx_t val_count, Vector &string_vector, ArenaAllocator &allocator,
	                          MutableDateFormatMap &date_format_map);

private:
	// Per-kind refinement helpers invoked by RefineCandidateTypes
	void RefineCandidateTypesArray(yyjson_val *vals[], idx_t val_count, Vector &string_vector,
	                               ArenaAllocator &allocator, MutableDateFormatMap &date_format_map);
	void RefineCandidateTypesObject(yyjson_val *vals[], idx_t val_count, Vector &string_vector,
	                                ArenaAllocator &allocator, MutableDateFormatMap &date_format_map);
	void RefineCandidateTypesString(yyjson_val *vals[], idx_t val_count, Vector &string_vector,
	                                MutableDateFormatMap &date_format_map);
	void EliminateCandidateTypes(idx_t vec_count, Vector &string_vector, MutableDateFormatMap &date_format_map);
	bool EliminateCandidateFormats(idx_t vec_count, Vector &string_vector, const Vector &result_vector,
	                               MutableDateFormatMap &date_format_map);

public:
	//! Key under which this node appears (null for unkeyed positions, e.g. the root)
	unique_ptr<string> key;
	//! Whether candidate types have been initialized for this node
	bool initialized = false;
	//! Candidate type descriptions observed at this position
	vector<JSONStructureDescription> descriptions;
	//! Number of times this node was seen
	idx_t count;
	//! Number of times this node was seen as JSON null
	idx_t null_count;
};
|
||||
|
||||
struct JSONStructureDescription {
|
||||
public:
|
||||
explicit JSONStructureDescription(LogicalTypeId type_p);
|
||||
//! Disable copy constructors
|
||||
JSONStructureDescription(const JSONStructureDescription &other) = delete;
|
||||
JSONStructureDescription &operator=(const JSONStructureDescription &) = delete;
|
||||
//! Enable move constructors
|
||||
JSONStructureDescription(JSONStructureDescription &&other) noexcept;
|
||||
JSONStructureDescription &operator=(JSONStructureDescription &&) noexcept;
|
||||
|
||||
JSONStructureNode &GetOrCreateChild();
|
||||
JSONStructureNode &GetOrCreateChild(const char *key_ptr, size_t key_size);
|
||||
JSONStructureNode &GetOrCreateChild(yyjson_val *key, yyjson_val *val, bool ignore_errors);
|
||||
|
||||
public:
|
||||
//! Type of this description
|
||||
LogicalTypeId type = LogicalTypeId::INVALID;
|
||||
|
||||
//! Map to children and children
|
||||
json_key_map_t<idx_t> key_map;
|
||||
vector<JSONStructureNode> children;
|
||||
|
||||
//! Candidate types (if auto-detecting and type == LogicalTypeId::VARCHAR)
|
||||
vector<LogicalTypeId> candidate_types;
|
||||
};
|
||||
|
||||
//! Static entry points for JSON schema auto-detection.
struct JSONStructure {
public:
	//! Extract the structure of a single JSON value into the given node (merging with what is there)
	static void ExtractStructure(yyjson_val *val, JSONStructureNode &node, bool ignore_errors);
	//! Merge the structure of "node" into "merged"
	static void MergeNodes(JSONStructureNode &merged, const JSONStructureNode &node);
	//! Convert a detected structure into a DuckDB LogicalType.
	//! field_appearance_threshold / map_inference_threshold steer struct-vs-map inference;
	//! null_type is the type assigned to all-null fields.
	static LogicalType StructureToType(ClientContext &context, const JSONStructureNode &node, idx_t max_depth,
	                                   double field_appearance_threshold, idx_t map_inference_threshold,
	                                   idx_t depth = 0, const LogicalType &null_type = LogicalType::JSON());
};
|
||||
|
||||
} // namespace duckdb
|
||||
79
external/duckdb/extension/json/include/json_transform.hpp
vendored
Normal file
79
external/duckdb/extension/json/include/json_transform.hpp
vendored
Normal file
@@ -0,0 +1,79 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// json_functions.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "duckdb/common/column_index.hpp"
|
||||
#include "duckdb/common/optional_ptr.hpp"
|
||||
#include "duckdb/function/scalar/strftime_format.hpp"
|
||||
#include "json_common.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
struct DateFormatMap;
|
||||
class JSONReader;
|
||||
|
||||
//! Options for error handling while transforming JSON
|
||||
struct JSONTransformOptions {
|
||||
public:
|
||||
JSONTransformOptions();
|
||||
JSONTransformOptions(bool strict_cast, bool error_duplicate_key, bool error_missing_key, bool error_unkown_key);
|
||||
|
||||
public:
|
||||
//! Throws an error if the cast doesn't work (instead of NULL-ing it)
|
||||
bool strict_cast = false;
|
||||
//! Throws an error if there is a duplicate key (instead of ignoring it)
|
||||
bool error_duplicate_key = false;
|
||||
//! Throws an error if a key is missing (instead of NULL-ing it)
|
||||
bool error_missing_key = false;
|
||||
//! Throws an error if an object has a key we didn't know about
|
||||
bool error_unknown_key = false;
|
||||
|
||||
//! Whether to delay the error when transforming (e.g., when non-strict casting or reading from file)
|
||||
bool delay_error = false;
|
||||
//! Date format used for parsing (can be NULL)
|
||||
optional_ptr<const DateFormatMap> date_format_map = nullptr;
|
||||
//! String to store errors in
|
||||
string error_message;
|
||||
//! Index of the object where the error occurred
|
||||
idx_t object_index = DConstants::INVALID_INDEX;
|
||||
//! Cast parameters
|
||||
CastParameters parameters;
|
||||
|
||||
public:
|
||||
void Serialize(Serializer &serializer) const;
|
||||
static JSONTransformOptions Deserialize(Deserializer &deserializer);
|
||||
};
|
||||
|
||||
//! Functor adapting StrpTimeFormat::TryParseDate to the generic Operation(...) shape
//! expected by the transform's templated parsing code.
struct TryParseDate {
	template <class T>
	static inline bool Operation(const StrpTimeFormat &format, const string_t &input, T &result,
	                             string &error_message) {
		// Forwards directly to the format's date parser; returns false on failure
		// (error details presumably written to error_message — confirm in strftime_format)
		return format.TryParseDate(input, result, error_message);
	}
};
|
||||
|
||||
//! Functor adapting StrpTimeFormat::TryParseTimestamp to the generic Operation(...) shape
//! expected by the transform's templated parsing code.
struct TryParseTimeStamp {
	template <class T>
	static inline bool Operation(const StrpTimeFormat &format, const string_t &input, T &result,
	                             string &error_message) {
		// Forwards directly to the format's timestamp parser; returns false on failure
		// (error details presumably written to error_message — confirm in strftime_format)
		return format.TryParseTimestamp(input, result, error_message);
	}
};
|
||||
|
||||
//! Static entry points for transforming parsed JSON values into DuckDB vectors.
//! All functions return false (rather than throwing) when an error occurred and
//! options.delay_error applies — see JSONTransformOptions.
struct JSONTransform {
	//! Transform "count" JSON values into "result", honoring the given options
	static bool Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count,
	                      JSONTransformOptions &options, optional_ptr<const ColumnIndex> column_index);
	//! Transform JSON objects into one vector per named field
	static bool TransformObject(yyjson_val *objects[], yyjson_alc *alc, const idx_t count, const vector<string> &names,
	                            const vector<Vector *> &result_vectors, JSONTransformOptions &options,
	                            optional_ptr<const vector<ColumnIndex>> column_indices, bool error_unknown_key);
	//! Stringify JSON values into "string_vector" in preparation for casting to "target"
	static bool GetStringVector(yyjson_val *vals[], const idx_t count, const LogicalType &target, Vector &string_vector,
	                            JSONTransformOptions &options);
};
|
||||
|
||||
} // namespace duckdb
|
||||
Reference in New Issue
Block a user