394 lines
9.5 KiB
C++
394 lines
9.5 KiB
C++
#include "json_common.hpp"
|
|
|
|
#include "duckdb/common/exception/binder_exception.hpp"
|
|
|
|
namespace duckdb {
|
|
|
|
using JSONPathType = JSONCommon::JSONPathType;
|
|
|
|
// Serializes "val" to a JSON string using a temporary allocator.
// When the serialized text is longer than "max_len" bytes, only the first "max_len" bytes are kept
// and "..." is appended.
string JSONCommon::ValToString(yyjson_val *val, idx_t max_len) {
	JSONAllocator scratch_allocator(Allocator::DefaultAllocator());
	idx_t written;
	auto serialized = JSONCommon::WriteVal<yyjson_val>(val, scratch_allocator.GetYYAlc(), written);
	// Short enough: return the serialized text unchanged
	if (written <= max_len) {
		return string(serialized, written);
	}
	// Too long: truncate and mark the truncation
	return string(serialized, max_len) + "...";
}
|
|
|
|
// Formats "error_string" with the string rendering of "val" as its single format argument
// and throws the result as an InvalidInputException.
void JSONCommon::ThrowValFormatError(string error_string, yyjson_val *val) {
	const auto rendered_val = JSONCommon::ValToString(val);
	throw InvalidInputException(StringUtil::Format(error_string, rendered_val));
}
|
|
|
|
// Always throws a "JSON path error near '...'" error and never returns normally.
// The quoted text runs from one character before "ptr" up to "end".
// Throws a BinderException when "binder" is true, an InvalidInputException otherwise.
string ThrowPathError(const char *ptr, const char *end, const bool binder) {
	// Start one character earlier (presumably so the character the caller just consumed is shown)
	const char *const start = ptr - 1;
	auto msg = StringUtil::Format("JSON path error near '%s'", string(start, end - start));
	if (!binder) {
		throw InvalidInputException(msg);
	}
	throw BinderException(msg);
}
|
|
|
|
// Result of reading one key from a JSON path.
// The struct stays a plain aggregate so it can be brace-initialized as {chars_read, recursive, key}.
struct JSONKeyReadResult {
public:
	// Failed read: zero characters consumed, so IsValid() is false
	static inline JSONKeyReadResult Empty() {
		return {idx_t(0), false, string()};
	}

	// Wildcard key "*" that consumes one character
	static inline JSONKeyReadResult WildCard() {
		return {1, false, "*"};
	}

	// Recursive wildcard that consumes two characters
	static inline JSONKeyReadResult RecWildCard() {
		return {2, true, "*"};
	}

	// Recursive wildcard that consumes only one character
	static inline JSONKeyReadResult RecWildCardShortcut() {
		return {1, true, "*"};
	}

	// True when at least one character was consumed.
	// Const-qualified so it can also be called on const results.
	inline bool IsValid() const {
		return (chars_read != 0);
	}

	// True when the key is exactly "*"
	inline bool IsWildCard() const {
		return key == "*";
	}

public:
	// Number of path characters consumed by this key
	idx_t chars_read;
	// Whether the key was read as a recursive key
	bool recursive;
	// The key text ("*" for wildcards)
	string key;
};
|
|
|
|
// Reads a key starting at "ptr".
// Unescaped keys run until the first '.' or '[' (or "end"); an empty unescaped key has chars_read == 0.
// Escaped keys ("ptr" is just past the opening quote) run until the first unescaped '"'.
// Inside them, \" and \\ are unescaped, and any other backslash sequence is kept verbatim.
// For escaped keys chars_read counts the characters before the closing quote; the quotes are not included.
// Returns an empty (invalid) result when an escaped key has no closing quote.
static inline JSONKeyReadResult ReadString(const char *ptr, const char *const end, const bool escaped) {
	const char *const start = ptr;
	if (!escaped) {
		// Plain key: scan up to the next path separator
		while (ptr != end && *ptr != '.' && *ptr != '[') {
			ptr++;
		}
		return {idx_t(ptr - start), false, string(start, ptr - start)};
	}

	// The unescaped key can never be longer than the remaining input
	auto buffer = make_unsafe_uniq_array_uninitialized<char>(end - ptr);
	idx_t buffer_len = 0;
	bool pending_escape = false;
	for (; ptr != end; ptr++) {
		const char c = *ptr;
		if (pending_escape) {
			// Only \" and \\ drop the backslash, every other sequence keeps it
			if (c != '"' && c != '\\') {
				buffer[buffer_len++] = '\\';
			}
			pending_escape = false;
		} else if (c == '"') {
			// Closing quote found
			break;
		} else if (c == '\\') {
			// Decide what to do with the backslash once the next character is known
			pending_escape = true;
			continue;
		}
		buffer[buffer_len++] = c;
	}

	const bool terminated = ptr != end && !pending_escape;
	if (!terminated) {
		return JSONKeyReadResult::Empty();
	}
	return {idx_t(ptr - start), false, string(buffer.get(), buffer_len)};
}
|
|
|
|
// Parses the unsigned decimal array index that starts at "ptr" and must be directly followed by ']'.
// On success the value is stored in "idx" and the number of digits consumed is returned
// (the ']' itself is not counted).
// Returns 0 when the index is invalid: no digits, a non-digit character, more than IDX_T_SAFE_DIG digits,
// or no closing ']' before "end". "idx" is overwritten in every case.
static inline idx_t ReadInteger(const char *ptr, const char *const end, idx_t &idx) {
	// Every number of at most 19 decimal digits is below 2^64 - 1, so the accumulation below cannot
	// overflow (or reach the all-ones value) as long as idx_t is at least 64 bits wide
	static constexpr idx_t IDX_T_SAFE_DIG = 19;
	static_assert(sizeof(idx_t) >= 8, "ReadInteger relies on idx_t holding any 19-digit decimal number");

	const char *const before = ptr;
	idx = 0;
	for (idx_t i = 0; i < IDX_T_SAFE_DIG; i++) {
		if (ptr == end || *ptr == ']') {
			break;
		}
		// Unsigned wrap-around maps every non-digit character to a value above 9
		const uint8_t add = static_cast<uint8_t>(*ptr - '0');
		if (add > 9) {
			// Not a digit
			return 0;
		}
		idx = add + idx * 10;
		ptr++;
	}
	// The digits must be terminated by ']'. Without this check an index that hits the digit limit
	// (or the end of the path) would be reported as valid, and the caller would skip past a
	// closing bracket that is not there.
	if (ptr == end || *ptr != ']') {
		return 0;
	}
	return static_cast<idx_t>(ptr - before);
}
|
|
|
|
// Reads the object key that starts at "ptr" (the character after a '.' in the path).
// Requires ptr != end. Recognized forms:
//   "*"            -> wildcard (1 character)
//   "**" and ".*"  -> recursive wildcard (2 characters)
//   ".[": only the '.' is consumed and reported as a recursive wildcard (1 character)
//   ".key"         -> recursive key; chars_read includes the extra '.'
//   "key" or "\"key\"" -> regular key; chars_read includes the surrounding quotes if present
// Returns an invalid result (chars_read == 0) when no key can be read.
// The lookahead never reads at or beyond "end".
static inline JSONKeyReadResult ReadKey(const char *ptr, const char *const end) {
	D_ASSERT(ptr != end);
	// ptr != end, so "next" is at most one-past-the-end and must be compared against "end" before it is read
	const char *const next = ptr + 1;
	if (*ptr == '*') { // Wildcard
		if (next != end && *next == '*') {
			return JSONKeyReadResult::RecWildCard();
		}
		return JSONKeyReadResult::WildCard();
	}
	bool recursive = false;
	if (*ptr == '.') {
		if (next == end) {
			// The path ends right after the second '.': there is no key left to read
			return JSONKeyReadResult::Empty();
		}
		if (*next == '*') {
			return JSONKeyReadResult::RecWildCard();
		}
		if (*next == '[') {
			return JSONKeyReadResult::RecWildCardShortcut();
		}
		ptr = next; // Skip past the second '.', ptr != end still holds
		recursive = true;
	}
	bool escaped = false;
	if (*ptr == '"') {
		ptr++; // Skip past opening '"'
		escaped = true;
	}
	auto result = ReadString(ptr, end, escaped);
	if (!result.IsValid()) {
		return result;
	}
	if (escaped) {
		result.chars_read += 2; // Account for surrounding quotes
	}
	if (recursive) {
		result.chars_read += 1; // Account for the second '.'
		result.recursive = true;
	}
	return result;
}
|
|
|
|
static inline bool ReadArrayIndex(const char *&ptr, const char *const end, idx_t &array_index, bool &from_back) {
|
|
D_ASSERT(ptr != end);
|
|
from_back = false;
|
|
if (*ptr == '*') { // Wildcard
|
|
ptr++;
|
|
if (ptr == end || *ptr != ']') {
|
|
return false;
|
|
}
|
|
array_index = DConstants::INVALID_INDEX;
|
|
} else {
|
|
if (*ptr == '#') { // SQLite syntax to index from back of array
|
|
ptr++; // Skip over '#'
|
|
if (ptr == end) {
|
|
return false;
|
|
}
|
|
if (*ptr == ']') {
|
|
// [#] always returns NULL in SQLite, so we return an array index that will do the same
|
|
array_index = NumericLimits<uint32_t>::Maximum();
|
|
ptr++;
|
|
return true;
|
|
}
|
|
if (*ptr != '-') {
|
|
return false;
|
|
}
|
|
from_back = true;
|
|
}
|
|
if (*ptr == '-') {
|
|
ptr++; // Skip over '-'
|
|
from_back = true;
|
|
}
|
|
auto idx_len = ReadInteger(ptr, end, array_index);
|
|
if (idx_len == 0) {
|
|
return false;
|
|
}
|
|
ptr += idx_len;
|
|
}
|
|
ptr++; // Skip past closing ']'
|
|
return true;
|
|
}
|
|
|
|
// Checks that the "len" characters at "ptr" form a valid JSON path (which must start with '$').
// Returns JSONPathType::WILDCARD when any step is a wildcard key, a recursive key or a wildcard array index,
// and JSONPathType::REGULAR otherwise.
// Malformed paths are reported through ThrowPathError, which throws a BinderException when "binder" is true
// and an InvalidInputException otherwise.
JSONPathType JSONCommon::ValidatePath(const char *ptr, const idx_t &len, const bool binder) {
	D_ASSERT(len >= 1 && *ptr == '$');
	const char *const end = ptr + len;
	auto path_type = JSONPathType::REGULAR;

	ptr++; // Skip past '$'
	while (ptr != end) {
		const char step = *ptr++;
		// Every step character must be followed by at least one more character
		if (ptr == end) {
			ThrowPathError(ptr, end, binder);
		}

		if (step == '.') { // Object field
			auto key = ReadKey(ptr, end);
			if (!key.IsValid()) {
				ThrowPathError(ptr, end, binder);
			}
			if (key.IsWildCard() || key.recursive) {
				path_type = JSONPathType::WILDCARD;
			}
			ptr += key.chars_read;
		} else if (step == '[') { // Array index
			idx_t array_index;
			bool from_back;
			if (!ReadArrayIndex(ptr, end, array_index, from_back)) {
				ThrowPathError(ptr, end, binder);
			}
			if (array_index == DConstants::INVALID_INDEX) {
				path_type = JSONPathType::WILDCARD;
			}
		} else {
			// Anything other than '.' or '[' cannot start a path step
			ThrowPathError(ptr, end, binder);
		}
	}
	return path_type;
}
|
|
|
|
// Follows a (non-wildcard) JSON path of "len" characters starting at "ptr" from "val".
// Returns the value found, or nullptr when a step does not match
// (object step on a non-object, array step on a non-array, or a failed lookup).
// The path is expected to have been checked with JSONCommon::ValidatePath beforehand.
yyjson_val *JSONCommon::GetPath(yyjson_val *val, const char *ptr, const idx_t &len) {
	// Path has been validated at this point
	const char *const end = ptr + len;
	ptr++; // Skip past '$'

	while (val != nullptr && ptr != end) {
		const char step = *ptr++;
		D_ASSERT(ptr != end);

		if (step == '.') { // Object field
			if (!unsafe_yyjson_is_obj(val)) {
				return nullptr;
			}
			auto key_result = ReadKey(ptr, end);
			D_ASSERT(key_result.IsValid());
			ptr += key_result.chars_read;
			val = yyjson_obj_getn(val, key_result.key.c_str(), key_result.key.size());
		} else if (step == '[') { // Array index
			if (!unsafe_yyjson_is_arr(val)) {
				return nullptr;
			}
			idx_t array_index;
			bool from_back;
			// The result is only captured (and asserted) in debug builds
#ifdef DEBUG
			bool success =
#endif
			    ReadArrayIndex(ptr, end, array_index, from_back);
#ifdef DEBUG
			D_ASSERT(success);
#endif
			// A non-zero index counted from the back is converted to an offset from the front
			if (from_back && array_index != 0) {
				array_index = unsafe_yyjson_get_len(val) - array_index;
			}
			val = yyjson_arr_get(val, array_index);
		} else { // LCOV_EXCL_START
			throw InternalException(
			    "Invalid JSON Path encountered in JSONCommon::GetPath, call JSONCommon::ValidatePath first!");
		} // LCOV_EXCL_STOP
	}
	return val;
}
|
|
|
|
// Follows the path in [ptr, end) from "val" and appends every matching value to "vals".
// "ptr" points at the first step character ('.' or '['); the leading '$' has already been skipped.
// Wildcard and recursive steps fan out by calling this function again for each candidate value
// with the remainder of the path.
void GetWildcardPathInternal(yyjson_val *val, const char *ptr, const char *const end, vector<yyjson_val *> &vals) {
	while (val != nullptr && ptr != end) {
		const char step = *ptr++;
		D_ASSERT(ptr != end);

		if (step == '.') { // Object field
			auto key_result = ReadKey(ptr, end);
			D_ASSERT(key_result.IsValid());

			if (key_result.recursive) {
				// Only a recursive wildcard is consumed here. For a recursive key the path is left
				// pointing at ".key", so that each recursive call below reads it as a regular key step.
				if (key_result.IsWildCard()) {
					ptr += key_result.chars_read;
				}
				// Worklist of "val" and all of its descendants; it grows while it is being traversed
				vector<yyjson_val *> pending;
				pending.emplace_back(val);
				for (idx_t i = 0; i < pending.size(); i++) {
					yyjson_val *current = pending[i];
					size_t idx, max;
					yyjson_val *child_key, *child;
					if (yyjson_is_arr(current)) {
						yyjson_arr_foreach(current, idx, max, child) {
							pending.emplace_back(child);
						}
					} else if (yyjson_is_obj(current)) {
						yyjson_obj_foreach(current, idx, max, child_key, child) {
							pending.emplace_back(child);
						}
					}
					// The starting value itself (i == 0) is skipped when the path ends here
					if (i > 0 || ptr != end) {
						GetWildcardPathInternal(current, ptr, end, vals);
					}
				}
				return;
			}

			ptr += key_result.chars_read;
			if (!unsafe_yyjson_is_obj(val)) {
				return;
			}
			if (key_result.IsWildCard()) { // Wildcard
				// Continue with the rest of the path for every value of the object
				size_t idx, max;
				yyjson_val *member_key, *member_val;
				yyjson_obj_foreach(val, idx, max, member_key, member_val) {
					GetWildcardPathInternal(member_val, ptr, end, vals);
				}
				return;
			}
			val = yyjson_obj_getn(val, key_result.key.c_str(), key_result.key.size());
		} else if (step == '[') { // Array index
			if (!unsafe_yyjson_is_arr(val)) {
				return;
			}
			idx_t array_index;
			bool from_back;
			// The result is only captured (and asserted) in debug builds
#ifdef DEBUG
			bool success =
#endif
			    ReadArrayIndex(ptr, end, array_index, from_back);
#ifdef DEBUG
			D_ASSERT(success);
#endif

			if (array_index == DConstants::INVALID_INDEX) { // Wildcard
				// Continue with the rest of the path for every element of the array
				size_t idx, max;
				yyjson_val *element;
				yyjson_arr_foreach(val, idx, max, element) {
					GetWildcardPathInternal(element, ptr, end, vals);
				}
				return;
			}
			// A non-zero index counted from the back is converted to an offset from the front
			if (from_back && array_index != 0) {
				array_index = unsafe_yyjson_get_len(val) - array_index;
			}
			val = yyjson_arr_get(val, array_index);
		} else { // LCOV_EXCL_START
			throw InternalException(
			    "Invalid JSON Path encountered in GetWildcardPathInternal, call JSONCommon::ValidatePath first!");
		} // LCOV_EXCL_STOP
	}

	// The whole path was consumed (or a lookup failed): keep the value if there is one
	if (val != nullptr) {
		vals.emplace_back(val);
	}
	return;
}
|
|
|
|
// Collects into "vals" every value reachable from "val" through the wildcard path
// of "len" characters starting at "ptr".
void JSONCommon::GetWildcardPath(yyjson_val *val, const char *ptr, const idx_t &len, vector<yyjson_val *> &vals) {
	// Path has been validated at this point
	const char *const path_end = ptr + len;
	const char *const first_step = ptr + 1; // Skip past '$'
	GetWildcardPathInternal(val, first_step, path_end, vals);
}
|
|
|
|
} // namespace duckdb
|