#include "json_reader.hpp"
#include "duckdb/common/file_opener.hpp"
#include "duckdb/common/serializer/deserializer.hpp"
#include "duckdb/common/serializer/serializer.hpp"
#include "json_scan.hpp"
#include <utility>
namespace duckdb {
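// A JSONBufferHandle ties a buffer of file data to its index within the file and to the number of
// readers that still reference it (during partial scans a buffer can be shared by two scan states)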
JSONBufferHandle::JSONBufferHandle(JSONReader &reader, idx_t buffer_index_p, idx_t readers_p, AllocatedData &&buffer_p,
idx_t buffer_size_p, idx_t buffer_start_p)
: reader(reader), buffer_index(buffer_index_p), readers(readers_p), buffer(std::move(buffer_p)),
buffer_size(buffer_size_p), buffer_start(buffer_start_p) {
}
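// JSONFileHandle wraps a DuckDB FileHandle, tracking the current read position as well as the
// bookkeeping needed for parallel reads (requested vs. actual reads) and a cache of previously read
// buffers so that pipe input can be replayed after a Reset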
JSONFileHandle::JSONFileHandle(QueryContext context_p, unique_ptr<FileHandle> file_handle_p, Allocator &allocator_p)
: context(context_p), file_handle(std::move(file_handle_p)), allocator(allocator_p),
can_seek(file_handle->CanSeek()), file_size(file_handle->GetFileSize()), read_position(0), requested_reads(0),
actual_reads(0), last_read_requested(false), cached_size(0) {
}
bool JSONFileHandle::IsOpen() const {
return file_handle != nullptr;
}
void JSONFileHandle::Close() {
if (IsOpen() && !file_handle->IsPipe()) {
file_handle->Close();
file_handle = nullptr;
}
}
void JSONFileHandle::Reset() {
D_ASSERT(RequestedReadsComplete());
read_position = 0;
requested_reads = 0;
actual_reads = 0;
last_read_requested = false;
if (IsOpen() && !IsPipe()) {
file_handle->Reset();
}
}
bool JSONFileHandle::RequestedReadsComplete() {
return requested_reads == actual_reads;
}
bool JSONFileHandle::LastReadRequested() const {
return last_read_requested;
}
idx_t JSONFileHandle::FileSize() const {
return file_size;
}
idx_t JSONFileHandle::Remaining() const {
return file_size - read_position;
}
bool JSONFileHandle::CanSeek() const {
return can_seek;
}
bool JSONFileHandle::IsPipe() const {
return file_handle->IsPipe();
}
FileHandle &JSONFileHandle::GetHandle() {
return *file_handle;
}
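// Parallel reads happen in two steps: GetPositionAndSize reserves the next byte range to read
// (callers must synchronize this step), after which ReadAtPosition performs the actual read of that
// range without further locking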
bool JSONFileHandle::GetPositionAndSize(idx_t &position, idx_t &size, idx_t requested_size) {
D_ASSERT(requested_size != 0);
if (last_read_requested) {
return false;
}
position = read_position;
size = MinValue<idx_t>(requested_size, Remaining());
read_position += size;
requested_reads++;
if (size == 0) {
last_read_requested = true;
}
return true;
}
void JSONFileHandle::ReadAtPosition(char *pointer, idx_t size, idx_t position,
optional_ptr<FileHandle> override_handle) {
if (IsPipe()) {
throw InternalException("ReadAtPosition is not supported for pipes");
}
if (size != 0) {
auto &handle = override_handle ? *override_handle.get() : *file_handle.get();
handle.Read(context, pointer, size, position);
}
const auto incremented_actual_reads = ++actual_reads;
if (incremented_actual_reads > requested_reads) {
throw InternalException("JSONFileHandle performed more actual reads than requested reads");
}
if (last_read_requested && incremented_actual_reads == requested_reads) {
Close();
}
}
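// Sequential read for handles that cannot be read at arbitrary positions; for pipes, everything that
// is read is also cached so the stream can be re-read after a Reset (e.g. following auto-detection)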
bool JSONFileHandle::Read(char *pointer, idx_t &read_size, idx_t requested_size) {
D_ASSERT(requested_size != 0);
if (last_read_requested) {
return false;
}
read_size = 0;
if (!cached_buffers.empty() || read_position < cached_size) {
read_size += ReadFromCache(pointer, requested_size, read_position);
}
auto temp_read_size = ReadInternal(pointer, requested_size);
if (IsPipe() && temp_read_size != 0) { // Cache the buffer
cached_buffers.emplace_back(allocator.Allocate(temp_read_size));
memcpy(cached_buffers.back().get(), pointer, temp_read_size);
cached_size += temp_read_size;
}
read_position += temp_read_size;
read_size += temp_read_size;
if (read_size == 0) {
last_read_requested = true;
}
return true;
}
idx_t JSONFileHandle::ReadInternal(char *pointer, const idx_t requested_size) {
// Read in a loop: pipes may return fewer bytes per call than requested
idx_t total_read_size = 0;
while (total_read_size < requested_size) {
auto read_size = file_handle->Read(pointer + total_read_size, requested_size - total_read_size);
if (read_size == 0) {
break;
}
total_read_size += read_size;
}
return total_read_size;
}
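// Serves a read from the cached pipe buffers: copies whatever part of [position, position + size)
// overlaps the cache, advancing the pointer, remaining size, and position accordingly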
idx_t JSONFileHandle::ReadFromCache(char *&pointer, idx_t &size, atomic<idx_t> &position) {
idx_t read_size = 0;
idx_t total_offset = 0;
idx_t cached_buffer_idx;
for (cached_buffer_idx = 0; cached_buffer_idx < cached_buffers.size(); cached_buffer_idx++) {
auto &cached_buffer = cached_buffers[cached_buffer_idx];
if (size == 0) {
break;
}
if (position < total_offset + cached_buffer.GetSize()) {
idx_t within_buffer_offset = position - total_offset;
idx_t copy_size = MinValue<idx_t>(size, cached_buffer.GetSize() - within_buffer_offset);
memcpy(pointer, cached_buffer.get() + within_buffer_offset, copy_size);
read_size += copy_size;
pointer += copy_size;
size -= copy_size;
position += copy_size;
}
total_offset += cached_buffer.GetSize();
}
return read_size;
}
JSONReader::JSONReader(ClientContext &context, JSONReaderOptions options_p, OpenFileInfo file_p)
: BaseFileReader(std::move(file_p)), context(context), options(std::move(options_p)), initialized(0),
next_buffer_index(0), thrown(false) {
}
void JSONReader::OpenJSONFile() {
lock_guard<mutex> guard(lock);
if (!IsOpen()) {
auto &fs = FileSystem::GetFileSystem(context);
auto regular_file_handle = fs.OpenFile(file, FileFlags::FILE_FLAGS_READ | options.compression);
file_handle = make_uniq<JSONFileHandle>(context, std::move(regular_file_handle), BufferAllocator::Get(context));
}
Reset();
}
void JSONReader::CloseHandle() {
lock_guard<mutex> guard(lock);
if (IsOpen()) {
file_handle->Close();
}
}
void JSONReader::Reset() {
initialized = false;
next_buffer_index = 0;
buffer_map.clear();
buffer_line_or_object_counts.clear();
auto_detect_data.Reset();
auto_detect_data_size = 0;
if (HasFileHandle()) {
file_handle->Reset();
}
}
bool JSONReader::HasFileHandle() const {
return file_handle != nullptr;
}
bool JSONReader::IsOpen() const {
if (HasFileHandle()) {
return file_handle->IsOpen();
}
return false;
}
JSONReaderOptions &JSONReader::GetOptions() {
return options;
}
JSONFormat JSONReader::GetFormat() const {
return options.format;
}
void JSONReader::SetFormat(JSONFormat format) {
D_ASSERT(options.format == JSONFormat::AUTO_DETECT);
options.format = format;
}
JSONRecordType JSONReader::GetRecordType() const {
return options.record_type;
}
void JSONReader::SetRecordType(duckdb::JSONRecordType type) {
D_ASSERT(options.record_type == JSONRecordType::AUTO_DETECT);
options.record_type = type;
}
const string &JSONReader::GetFileName() const {
return file.path;
}
JSONFileHandle &JSONReader::GetFileHandle() const {
D_ASSERT(HasFileHandle());
return *file_handle;
}
void JSONReader::InsertBuffer(idx_t buffer_idx, unique_ptr<JSONBufferHandle> &&buffer) {
lock_guard<mutex> guard(lock);
D_ASSERT(buffer_map.find(buffer_idx) == buffer_map.end());
buffer_map.insert(make_pair(buffer_idx, std::move(buffer)));
}
optional_ptr<JSONBufferHandle> JSONReader::GetBuffer(idx_t buffer_idx) {
lock_guard<mutex> guard(lock);
auto it = buffer_map.find(buffer_idx);
return it == buffer_map.end() ? nullptr : it->second.get();
}
AllocatedData JSONReader::RemoveBuffer(JSONBufferHandle &handle) {
lock_guard<mutex> guard(lock);
auto it = buffer_map.find(handle.buffer_index);
D_ASSERT(it != buffer_map.end());
D_ASSERT(RefersToSameObject(handle, *it->second));
auto result = std::move(it->second->buffer);
buffer_map.erase(it);
return result;
}
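// Hands out monotonically increasing buffer indices; the corresponding line/object count is
// initialized to -1, marking it as not yet known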
idx_t JSONReader::GetBufferIndex() {
buffer_line_or_object_counts.push_back(-1);
return next_buffer_index++;
}
void JSONReader::SetBufferLineOrObjectCount(JSONBufferHandle &handle, idx_t count) {
lock_guard<mutex> guard(lock);
D_ASSERT(buffer_map.find(handle.buffer_index) != buffer_map.end());
D_ASSERT(RefersToSameObject(handle, *buffer_map.find(handle.buffer_index)->second));
D_ASSERT(buffer_line_or_object_counts[handle.buffer_index] == -1);
buffer_line_or_object_counts[handle.buffer_index] = count;
// if we have any errors - try to report them after finishing a buffer
ThrowErrorsIfPossible();
}
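// Parse/transform errors are not thrown immediately: the file-wide line number is only known once
// all earlier buffers have been counted, so errors are recorded here and the "{line}" placeholder in
// the message is filled in later by ThrowErrorsIfPossible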
void JSONReader::AddParseError(JSONReaderScanState &scan_state, idx_t line_or_object_in_buf, yyjson_read_err &err,
const string &extra) {
string unit = options.format == JSONFormat::NEWLINE_DELIMITED ? "line" : "record/value";
auto error_msg = StringUtil::Format("Malformed JSON in file \"%s\", at byte %llu in %s {line}: %s. %s",
GetFileName(), err.pos + 1, unit, err.msg, extra);
lock_guard<mutex> guard(lock);
AddError(scan_state.current_buffer_handle->buffer_index, line_or_object_in_buf + 1, error_msg);
ThrowErrorsIfPossible();
// if we could not throw immediately - finish processing this buffer
scan_state.buffer_offset = 0;
scan_state.scan_count = 0;
}
void JSONReader::AddTransformError(JSONReaderScanState &scan_state, idx_t object_index, const string &error_message) {
D_ASSERT(scan_state.current_buffer_handle);
D_ASSERT(object_index != DConstants::INVALID_INDEX);
auto line_or_object_in_buffer = scan_state.lines_or_objects_in_buffer - scan_state.scan_count + object_index;
string unit = options.format == JSONFormat::NEWLINE_DELIMITED ? "line" : "record/value";
auto error_msg =
StringUtil::Format("JSON transform error in file \"%s\", in %s {line}: %s", GetFileName(), unit, error_message);
lock_guard<mutex> guard(lock);
AddError(scan_state.current_buffer_handle->buffer_index, line_or_object_in_buffer, error_msg);
ThrowErrorsIfPossible();
// if we could not throw immediately - finish processing this buffer
scan_state.buffer_offset = scan_state.buffer_size;
scan_state.scan_count = 0;
}
void JSONReader::AddError(idx_t buf_index, idx_t line_or_object_in_buf, const string &error_msg) {
if (error) {
// we already have an error - check if it happened before this error
if (error->buf_index < buf_index ||
(error->buf_index == buf_index && error->line_or_object_in_buf < line_or_object_in_buf)) {
// it did! don't record this error
return;
}
} else {
error = make_uniq<JSONError>();
}
error->buf_index = buf_index;
error->line_or_object_in_buf = line_or_object_in_buf;
error->error_msg = error_msg;
}
optional_idx JSONReader::TryGetLineNumber(idx_t buf_index, idx_t line_or_object_in_buf) {
idx_t line = line_or_object_in_buf;
for (idx_t b_idx = 0; b_idx < buf_index; b_idx++) {
if (buffer_line_or_object_counts[b_idx] == -1) {
// this buffer has not been parsed yet - we cannot throw
return optional_idx();
}
line += buffer_line_or_object_counts[b_idx];
}
return line;
}
void JSONReader::ThrowErrorsIfPossible() {
if (!error) {
return;
}
// check if we finished all buffers before the error buffer
auto line = TryGetLineNumber(error->buf_index, error->line_or_object_in_buf);
if (!line.IsValid()) {
return;
}
// we can throw!
thrown = true;
auto formatted_error = StringUtil::Replace(error->error_msg, "{line}", to_string(line.GetIndex() + 1));
throw InvalidInputException(formatted_error);
}
bool JSONReader::HasThrown() {
if (context.GetExecutor().HasError()) {
return true;
}
lock_guard<mutex> guard(lock);
return thrown;
}
double JSONReader::GetProgress() const {
lock_guard<mutex> guard(lock);
if (HasFileHandle()) {
return 100.0 - 100.0 * double(file_handle->Remaining()) / double(file_handle->FileSize());
} else {
return 0;
}
}
static inline void TrimWhitespace(JSONString &line) {
while (line.size != 0 && StringUtil::CharacterIsSpace(line[0])) {
line.pointer++;
line.size--;
}
while (line.size != 0 && StringUtil::CharacterIsSpace(line[line.size - 1])) {
line.size--;
}
}
JSONReaderScanState::JSONReaderScanState(ClientContext &context, Allocator &global_allocator, idx_t buffer_capacity)
: fs(FileSystem::GetFileSystem(context)), global_allocator(global_allocator),
allocator(BufferAllocator::Get(context)), buffer_capacity(buffer_capacity) {
}
void JSONReaderScanState::ResetForNextParse() {
allocator.Reset();
scan_count = 0;
}
void JSONReaderScanState::ClearBufferHandle() {
if (!current_buffer_handle) {
return;
}
// Free up the current buffer - if any
auto &handle = *current_buffer_handle;
if (!RefersToSameObject(handle.reader, *current_reader)) {
throw InternalException("Handle reader and current reader are unaligned");
}
handle.reader.DecrementBufferUsage(*current_buffer_handle, lines_or_objects_in_buffer, read_buffer);
current_buffer_handle = nullptr;
}
void JSONReaderScanState::ResetForNextBuffer() {
ClearBufferHandle();
buffer_index = optional_idx();
buffer_size = 0;
scan_count = 0;
is_last = false;
file_read_type = JSONFileReadType::SCAN_PARTIAL;
}
static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset, const idx_t &buffer_size) {
for (; buffer_offset != buffer_size; buffer_offset++) {
if (!StringUtil::CharacterIsSpace(buffer_ptr[buffer_offset])) {
break;
}
}
}
static inline const char *NextNewline(const char *ptr, const idx_t size) {
return const_char_ptr_cast(memchr(ptr, '\n', size));
}
static inline const char *PreviousNewline(const char *ptr, const idx_t size) {
const auto end = ptr - size;
for (ptr--; ptr != end; ptr--) {
if (*ptr == '\n') {
break;
}
}
return ptr;
}
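// Scans to the end of the current JSON value by tracking the nesting depth of '{'/'[' and skipping
// over string contents (including escaped characters)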
static inline const char *NextJSONDefault(const char *ptr, const char *const end) {
idx_t parents = 0;
while (ptr != end) {
switch (*ptr++) {
case '{':
case '[':
parents++;
continue;
case '}':
case ']':
parents--;
break;
case '"':
while (ptr != end) {
auto string_char = *ptr++;
if (string_char == '"') {
break;
} else if (string_char == '\\') {
if (ptr != end) {
ptr++; // Skip the escaped char
}
}
}
break;
default:
continue;
}
if (parents == 0) {
break;
}
}
return ptr;
}
static inline const char *NextJSON(const char *ptr, const idx_t size) {
D_ASSERT(!StringUtil::CharacterIsSpace(*ptr)); // Should be handled before
const char *const end = ptr + size;
switch (*ptr) {
case '{':
case '[':
case '"':
ptr = NextJSONDefault(ptr, end);
break;
default:
// Special case: JSON array containing JSON without clear "parents", i.e., not obj/arr/str
while (ptr != end) {
switch (*ptr++) {
case ',':
case ']':
ptr--;
break;
default:
continue;
}
break;
}
}
return ptr == end ? nullptr : ptr;
}
void JSONReader::SkipOverArrayStart(JSONReaderScanState &scan_state) {
// First read of this buffer, check if it's actually an array and skip over the bytes
auto &buffer_ptr = scan_state.buffer_ptr;
auto &buffer_offset = scan_state.buffer_offset;
auto &buffer_size = scan_state.buffer_size;
SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
if (buffer_offset == buffer_size) {
return; // Empty file
}
if (buffer_ptr[buffer_offset] != '[') {
throw InvalidInputException(
"Expected top-level JSON array with format='array', but first character is '%c' in file \"%s\"."
"\n Try setting format='auto' or format='newline_delimited'.",
buffer_ptr[buffer_offset], GetFileName());
}
SkipWhitespace(buffer_ptr, ++buffer_offset, buffer_size);
if (buffer_offset >= buffer_size) {
throw InvalidInputException("Missing closing brace ']' in JSON array with format='array' in file \"%s\"",
GetFileName());
}
if (buffer_ptr[buffer_offset] == ']') {
// Empty array
SkipWhitespace(buffer_ptr, ++buffer_offset, buffer_size);
if (buffer_offset != buffer_size) {
throw InvalidInputException(
"Empty array with trailing data when parsing JSON array with format='array' in file \"%s\"",
GetFileName());
}
}
}
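// Detects the JSON format (newline-delimited, array, or unstructured) and record type (records or
// values) from the first buffer of a file: first try to parse a single newline-terminated line, then
// fall back to inspecting the first non-whitespace character and attempting to parse a top-level array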
static pair<JSONFormat, JSONRecordType> DetectFormatAndRecordType(char *const buffer_ptr, const idx_t buffer_size,
yyjson_alc *alc) {
// First we do the easy check whether it's NEWLINE_DELIMITED
auto line_end = NextNewline(buffer_ptr, buffer_size);
if (line_end != nullptr) {
idx_t line_size = line_end - buffer_ptr;
SkipWhitespace(buffer_ptr, line_size, buffer_size);
yyjson_read_err error;
auto doc = JSONCommon::ReadDocumentUnsafe(buffer_ptr, line_size, JSONCommon::READ_FLAG, alc, &error);
if (error.code == YYJSON_READ_SUCCESS) { // We successfully read the line
if (yyjson_is_arr(doc->root) && line_size == buffer_size) {
// It's just one array, let's actually assume ARRAY, not NEWLINE_DELIMITED
if (yyjson_arr_size(doc->root) == 0 || yyjson_is_obj(yyjson_arr_get(doc->root, 0))) {
// Either an empty array (assume records), or an array of objects
return make_pair(JSONFormat::ARRAY, JSONRecordType::RECORDS);
} else {
return make_pair(JSONFormat::ARRAY, JSONRecordType::VALUES);
}
} else if (yyjson_is_obj(doc->root)) {
return make_pair(JSONFormat::NEWLINE_DELIMITED, JSONRecordType::RECORDS);
} else {
return make_pair(JSONFormat::NEWLINE_DELIMITED, JSONRecordType::VALUES);
}
}
}
// Skip whitespace
idx_t buffer_offset = 0;
SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
auto remaining = buffer_size - buffer_offset;
// We know it's not NEWLINE_DELIMITED at this point, if there's a '{', we know it's not ARRAY either
// Also if it's fully whitespace we just return something because we don't know
if (remaining == 0 || buffer_ptr[buffer_offset] == '{') {
return make_pair(JSONFormat::UNSTRUCTURED, JSONRecordType::RECORDS);
}
// We know it's not top-level records, if it's not '[', it's not ARRAY either
if (buffer_ptr[buffer_offset] != '[') {
return make_pair(JSONFormat::UNSTRUCTURED, JSONRecordType::VALUES);
}
// It's definitely an ARRAY, but now we have to figure out if there's more than one top-level array
yyjson_read_err error;
auto doc =
JSONCommon::ReadDocumentUnsafe(buffer_ptr + buffer_offset, remaining, JSONCommon::READ_STOP_FLAG, alc, &error);
if (error.code == YYJSON_READ_SUCCESS) {
D_ASSERT(yyjson_is_arr(doc->root));
// We successfully read something!
buffer_offset += yyjson_doc_get_read_size(doc);
SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
remaining = buffer_size - buffer_offset;
if (remaining != 0) { // There's more
return make_pair(JSONFormat::UNSTRUCTURED, JSONRecordType::VALUES);
}
// Just one array, check what's in there
if (yyjson_arr_size(doc->root) == 0 || yyjson_is_obj(yyjson_arr_get(doc->root, 0))) {
// Either an empty array (assume records), or an array of objects
return make_pair(JSONFormat::ARRAY, JSONRecordType::RECORDS);
} else {
return make_pair(JSONFormat::ARRAY, JSONRecordType::VALUES);
}
}
// We weren't able to parse an array - it could be broken, or larger than our buffer size - so skip over the '['
// (bounded by the full buffer size, since buffer_offset is an absolute offset into the buffer)
SkipWhitespace(buffer_ptr, ++buffer_offset, buffer_size);
remaining = buffer_size - buffer_offset;
// If it's '{' we know there's RECORDS in the ARRAY, else it's VALUES
if (remaining == 0 || buffer_ptr[buffer_offset] == '{') {
return make_pair(JSONFormat::ARRAY, JSONRecordType::RECORDS);
}
// It's not RECORDS, so it must be VALUES
return make_pair(JSONFormat::ARRAY, JSONRecordType::VALUES);
}
void JSONReader::ParseJSON(JSONReaderScanState &scan_state, char *const json_start, const idx_t json_size,
const idx_t remaining) {
yyjson_doc *doc;
yyjson_read_err err;
if (options.type == JSONScanType::READ_JSON_OBJECTS) { // If we return strings, we cannot parse INSITU
doc = JSONCommon::ReadDocumentUnsafe(json_start, json_size, JSONCommon::READ_STOP_FLAG,
scan_state.allocator.GetYYAlc(), &err);
} else {
doc = JSONCommon::ReadDocumentUnsafe(json_start, remaining, JSONCommon::READ_INSITU_FLAG,
scan_state.allocator.GetYYAlc(), &err);
}
if (err.code != YYJSON_READ_SUCCESS) {
auto can_ignore_this_error = options.ignore_errors;
string extra;
if (GetFormat() != JSONFormat::NEWLINE_DELIMITED) {
can_ignore_this_error = false;
extra = options.ignore_errors
? "Parse errors cannot be ignored for JSON formats other than 'newline_delimited'"
: "";
}
if (!can_ignore_this_error) {
AddParseError(scan_state, scan_state.lines_or_objects_in_buffer, err, extra);
return;
}
}
// We parse with YYJSON_STOP_WHEN_DONE, so we need to check this by hand
const auto read_size = yyjson_doc_get_read_size(doc);
if (read_size > json_size) {
// Can't go past the boundary, even with ignore_errors
err.code = YYJSON_READ_ERROR_UNEXPECTED_END;
err.msg = "unexpected end of data";
err.pos = json_size;
AddParseError(scan_state, scan_state.lines_or_objects_in_buffer, err, "Try auto-detecting the JSON format");
return;
} else if (!options.ignore_errors && read_size < json_size) {
idx_t off = read_size;
idx_t rem = json_size;
SkipWhitespace(json_start, off, rem);
if (off != rem) { // Between end of document and boundary should be whitespace only
err.code = YYJSON_READ_ERROR_UNEXPECTED_CONTENT;
err.msg = "unexpected content after document";
err.pos = read_size;
AddParseError(scan_state, scan_state.lines_or_objects_in_buffer, err, "Try auto-detecting the JSON format");
return;
}
}
scan_state.lines_or_objects_in_buffer++;
if (!doc) {
scan_state.values[scan_state.scan_count] = nullptr;
return;
}
// Set the JSONLine and trim
scan_state.units[scan_state.scan_count] = JSONString(json_start, json_size);
TrimWhitespace(scan_state.units[scan_state.scan_count]);
scan_state.values[scan_state.scan_count] = doc->root;
}
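// Reads the first buffer of the file and auto-detects the format and record type; for regular files
// the buffer is kept so the first scan can re-use it, for pipes the handle is reset instead (pipe
// reads are cached, so the data can be replayed)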
void JSONReader::AutoDetect(Allocator &allocator, idx_t buffer_capacity) {
// read the first buffer from the file
auto read_buffer = allocator.Allocate(buffer_capacity);
idx_t read_size = 0;
auto buffer_ptr = char_ptr_cast(read_buffer.get());
if (!file_handle->Read(buffer_ptr, read_size, buffer_capacity - YYJSON_PADDING_SIZE)) {
// could not read anything
return;
}
if (read_size == 0) {
// file is empty - skip
return;
}
// perform auto-detection over the data we just read
JSONAllocator json_allocator(allocator);
auto format_and_record_type = DetectFormatAndRecordType(buffer_ptr, read_size, json_allocator.GetYYAlc());
if (GetFormat() == JSONFormat::AUTO_DETECT) {
SetFormat(format_and_record_type.first);
}
if (GetRecordType() == JSONRecordType::AUTO_DETECT) {
SetRecordType(format_and_record_type.second);
}
if (!options.ignore_errors && options.record_type == JSONRecordType::RECORDS &&
GetRecordType() != JSONRecordType::RECORDS) {
string unit = options.format == JSONFormat::NEWLINE_DELIMITED ? "line" : "record/value";
throw InvalidInputException(
"JSON auto-detection error in file \"%s\": Expected records, detected non-record JSON instead",
GetFileName());
}
// store the buffer in the file so it can be re-used by the first reader of the file
if (!file_handle->IsPipe()) {
auto_detect_data = std::move(read_buffer);
auto_detect_data_size = read_size;
} else {
file_handle->Reset();
}
}
void JSONReader::ThrowObjectSizeError(const idx_t object_size) {
throw InvalidInputException(
"\"maximum_object_size\" of %llu bytes exceeded while reading file \"%s\" (>%llu bytes)."
"\n Try increasing \"maximum_object_size\".",
options.maximum_object_size, GetFileName(), object_size);
}
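// During partial scans every buffer reserves maximum_object_size bytes at its start; the part of the
// object that straddles the boundary with the previous buffer is copied into that space here, which
// requires waiting until the previous buffer has been read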
bool JSONReader::CopyRemainderFromPreviousBuffer(JSONReaderScanState &scan_state) {
D_ASSERT(scan_state.buffer_index.GetIndex() != 0);
D_ASSERT(GetFormat() == JSONFormat::NEWLINE_DELIMITED);
// Spinlock until the previous batch index has also read its buffer
optional_ptr<JSONBufferHandle> previous_buffer_handle;
while (!previous_buffer_handle) {
if (HasThrown()) {
return false;
}
previous_buffer_handle = GetBuffer(scan_state.buffer_index.GetIndex() - 1);
}
// First we find the newline in the previous block
idx_t prev_buffer_size = previous_buffer_handle->buffer_size - previous_buffer_handle->buffer_start;
auto prev_buffer_ptr = char_ptr_cast(previous_buffer_handle->buffer.get()) + previous_buffer_handle->buffer_size;
auto prev_object_start = PreviousNewline(prev_buffer_ptr, prev_buffer_size);
auto prev_object_size = NumericCast<idx_t>(prev_buffer_ptr - prev_object_start);
D_ASSERT(scan_state.buffer_offset == options.maximum_object_size);
if (prev_object_size > scan_state.buffer_offset) {
ThrowObjectSizeError(prev_object_size);
}
// Now copy the data to our reconstruct buffer
memcpy(scan_state.buffer_ptr + scan_state.buffer_offset - prev_object_size, prev_object_start, prev_object_size);
// We copied the object, so we are no longer reading the previous buffer
if (--previous_buffer_handle->readers == 0) {
RemoveBuffer(*previous_buffer_handle);
}
if (prev_object_size == 1) {
// Just a newline
return false;
}
scan_state.buffer_offset -= prev_object_size;
return true;
}
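// Parses up to STANDARD_VECTOR_SIZE values from the current buffer, splitting the input on newlines
// for newline-delimited JSON and on value boundaries otherwise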
void JSONReader::ParseNextChunk(JSONReaderScanState &scan_state) {
const auto format = GetFormat();
auto &buffer_ptr = scan_state.buffer_ptr;
auto &buffer_offset = scan_state.buffer_offset;
auto &buffer_size = scan_state.buffer_size;
auto &scan_count = scan_state.scan_count;
for (; scan_count < STANDARD_VECTOR_SIZE; scan_count++) {
SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
auto json_start = buffer_ptr + buffer_offset;
idx_t remaining = buffer_size - buffer_offset;
if (remaining == 0) {
break;
}
D_ASSERT(format != JSONFormat::AUTO_DETECT);
const char *json_end = format == JSONFormat::NEWLINE_DELIMITED ? NextNewline(json_start, remaining)
: NextJSON(json_start, remaining);
if (json_end == nullptr) {
if (remaining > options.maximum_object_size) {
ThrowObjectSizeError(remaining);
}
// We reached the end of the buffer
if (!scan_state.is_last) {
// Last bit of data belongs to the next batch
if (scan_state.file_read_type == JSONFileReadType::SCAN_ENTIRE_FILE) {
scan_state.prev_buffer_remainder = remaining;
scan_state.prev_buffer_offset = json_start - buffer_ptr;
}
buffer_offset = buffer_size;
break;
}
json_end = json_start + remaining;
}
idx_t json_size = json_end - json_start;
ParseJSON(scan_state, json_start, json_size, remaining);
buffer_offset += json_size;
if (format == JSONFormat::ARRAY) {
SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
if (buffer_ptr[buffer_offset] == ',' || buffer_ptr[buffer_offset] == ']') {
buffer_offset++;
} else { // We can't ignore this error, even with 'ignore_errors'
yyjson_read_err err;
err.code = YYJSON_READ_ERROR_UNEXPECTED_CHARACTER;
err.msg = "unexpected character";
err.pos = json_size;
AddParseError(scan_state, scan_state.lines_or_objects_in_buffer, err);
return;
}
}
SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
}
}
void JSONReader::Initialize(Allocator &allocator, idx_t buffer_size) {
if (initialized) {
throw InternalException("JSON InitializeScan called twice on the same reader without resetting");
}
// Open the file if it is not yet open
if (!IsOpen()) {
OpenJSONFile();
}
initialized = true;
// Auto-detect if we haven't yet done this during the bind
if (options.record_type == JSONRecordType::AUTO_DETECT || GetFormat() == JSONFormat::AUTO_DETECT) {
// We have to detect the JSON format
AutoDetect(allocator, buffer_size);
}
}
bool JSONReader::InitializeScan(JSONReaderScanState &scan_state, JSONFileReadType file_read_type) {
if (file_read_type == JSONFileReadType::SCAN_PARTIAL && GetFormat() != JSONFormat::NEWLINE_DELIMITED) {
throw InternalException("JSON Partial scans are only possible on ND json");
}
scan_state.current_reader = this;
scan_state.is_first_scan = true;
scan_state.file_read_type = file_read_type;
if (file_read_type == JSONFileReadType::SCAN_ENTIRE_FILE) {
// when initializing a single-file scan we don't need to read anything yet
return true;
}
// partial read
// we need to check if there is data available within our current reader
if (PrepareBufferForRead(scan_state)) {
// there is data available! return
return true;
}
return false;
}
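// Main scan loop: parse values from the current buffer, reading the next buffer whenever the current
// one is exhausted; partial scans return 0 instead so the caller can fetch the next buffer from the
// global state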
idx_t JSONReader::Scan(JSONReaderScanState &scan_state) {
PrepareForScan(scan_state);
while (scan_state.scan_count == 0) {
while (scan_state.buffer_offset >= scan_state.buffer_size) {
// we have exhausted the current buffer
if (scan_state.file_read_type == JSONFileReadType::SCAN_PARTIAL) {
// we are not scanning the entire file
// return and fetch the next buffer from the global state
return 0;
}
// read the next buffer
if (!ReadNextBuffer(scan_state)) {
// we have exhausted the file
return 0;
}
}
ParseNextChunk(scan_state);
}
return scan_state.scan_count;
}
void JSONReader::PrepareForScan(JSONReaderScanState &scan_state) {
if (!scan_state.is_first_scan) {
// we have already scanned from this buffer before - just reset and scan the next batch
scan_state.ResetForNextParse();
return;
}
scan_state.is_first_scan = false;
if (scan_state.file_read_type == JSONFileReadType::SCAN_ENTIRE_FILE) {
// first time scanning from this reader while scanning the entire file
// we need to initialize the reader
if (!scan_state.current_reader->IsInitialized()) {
scan_state.current_reader->Initialize(scan_state.global_allocator, scan_state.buffer_capacity);
}
return;
}
if (!scan_state.needs_to_read && !scan_state.read_buffer.IsSet()) {
// we have already read (because we auto-detected) - skip
return;
}
// we are scanning only a buffer - finalize it so we can start reading
FinalizeBuffer(scan_state);
}
void JSONReader::FinalizeBuffer(JSONReaderScanState &scan_state) {
if (scan_state.needs_to_read) {
// we haven't read into the buffer yet - this can only happen if we are reading a file we can seek into
// read into the buffer
ReadNextBufferSeek(scan_state);
scan_state.needs_to_read = false;
}
// we read something
// skip over the array start if required
if (!scan_state.is_last) {
if (scan_state.buffer_index.GetIndex() == 0) {
StringUtil::SkipBOM(scan_state.buffer_ptr, scan_state.buffer_size, scan_state.buffer_offset);
if (GetFormat() == JSONFormat::ARRAY) {
SkipOverArrayStart(scan_state);
}
}
}
// then finalize the buffer
FinalizeBufferInternal(scan_state, scan_state.read_buffer, scan_state.buffer_index.GetIndex());
}
bool JSONReader::ReadNextBuffer(JSONReaderScanState &scan_state) {
if (!PrepareBufferForRead(scan_state)) {
return false;
}
// finalize the buffer
FinalizeBuffer(scan_state);
return true;
}
void JSONReader::FinalizeBufferInternal(JSONReaderScanState &scan_state, AllocatedData &buffer, idx_t buffer_index) {
idx_t readers = 1;
if (scan_state.file_read_type == JSONFileReadType::SCAN_PARTIAL) {
readers = scan_state.is_last ? 1 : 2;
}
// Create an entry and insert it into the map
auto json_buffer_handle = make_uniq<JSONBufferHandle>(*this, buffer_index, readers, std::move(buffer),
scan_state.buffer_size, scan_state.buffer_offset);
scan_state.current_buffer_handle = json_buffer_handle.get();
InsertBuffer(buffer_index, std::move(json_buffer_handle));
if (scan_state.file_read_type == JSONFileReadType::SCAN_PARTIAL) {
// if we are not scanning the entire file - copy the remainder of the previous buffer into this buffer
// we don't need to do this for the first buffer
// we do this after inserting the buffer in the map to ensure we can still read in parallel
if (scan_state.buffer_index.GetIndex() != 0) {
CopyRemainderFromPreviousBuffer(scan_state);
}
}
scan_state.prev_buffer_remainder = 0;
scan_state.lines_or_objects_in_buffer = 0;
// yyjson requires YYJSON_PADDING_SIZE zeroed bytes past the end of the buffer
memset(scan_state.buffer_ptr + scan_state.buffer_size, 0, YYJSON_PADDING_SIZE);
}
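// Called when a scan state is done with a buffer: records its line/object count and, once the last
// reader releases the buffer, removes it from the map so its allocation can be recycled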
void JSONReader::DecrementBufferUsage(JSONBufferHandle &handle, idx_t lines_or_object_in_buffer,
AllocatedData &buffer) {
SetBufferLineOrObjectCount(handle, lines_or_object_in_buffer);
if (--handle.readers == 0) {
buffer = RemoveBuffer(handle);
}
}
void JSONReader::PrepareForReadInternal(JSONReaderScanState &scan_state) {
// clear the previous buffer handle
scan_state.ClearBufferHandle();
// if we don't have a buffer - allocate it
if (!scan_state.read_buffer.IsSet()) {
scan_state.read_buffer = scan_state.global_allocator.Allocate(scan_state.buffer_capacity);
scan_state.buffer_ptr = char_ptr_cast(scan_state.read_buffer.get());
}
if (scan_state.file_read_type == JSONFileReadType::SCAN_ENTIRE_FILE) {
// Copy last bit of previous buffer to the beginning if we are doing a single-threaded read
memmove(scan_state.buffer_ptr, scan_state.buffer_ptr + scan_state.prev_buffer_offset,
scan_state.prev_buffer_remainder);
}
}
bool JSONReader::PrepareBufferForRead(JSONReaderScanState &scan_state) {
if (auto_detect_data.IsSet()) {
// we have auto-detected data - re-use the buffer
if (next_buffer_index != 0 || auto_detect_data_size == 0 || scan_state.prev_buffer_remainder != 0) {
throw InternalException("Invalid re-use of auto-detect data in JSON");
}
scan_state.buffer_index = GetBufferIndex();
scan_state.buffer_size = auto_detect_data_size;
scan_state.read_buffer = std::move(auto_detect_data);
scan_state.buffer_ptr = char_ptr_cast(scan_state.read_buffer.get());
scan_state.prev_buffer_remainder = 0;
scan_state.needs_to_read = false;
scan_state.is_last = false;
scan_state.buffer_offset = 0;
auto_detect_data.Reset();
auto_detect_data_size = 0;
return true;
}
if (scan_state.file_read_type == JSONFileReadType::SCAN_PARTIAL && GetFileHandle().CanSeek()) {
// we can seek and are doing a parallel read - we don't need to read immediately yet
// we only need to prepare the read now
if (!PrepareBufferSeek(scan_state)) {
return false;
}
} else {
// we cannot seek - we need to read immediately here
if (!ReadNextBufferNoSeek(scan_state)) {
return false;
}
}
return true;
}
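// Reserves the next byte range of a seekable file; the actual read is deferred to ReadNextBufferSeek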
bool JSONReader::PrepareBufferSeek(JSONReaderScanState &scan_state) {
scan_state.request_size = scan_state.buffer_capacity / 2 - scan_state.prev_buffer_remainder - YYJSON_PADDING_SIZE;
if (!IsOpen()) {
return false;
}
auto &file_handle = GetFileHandle();
if (file_handle.LastReadRequested()) {
return false;
}
if (!file_handle.GetPositionAndSize(scan_state.read_position, scan_state.read_size, scan_state.request_size)) {
return false; // We weren't able to read
}
scan_state.buffer_index = GetBufferIndex();
scan_state.is_last = scan_state.read_size == 0;
scan_state.needs_to_read = true;
scan_state.buffer_size = 0;
return true;
}
void JSONReader::ReadNextBufferSeek(JSONReaderScanState &scan_state) {
PrepareForReadInternal(scan_state);
// we start reading at "options.maximum_object_size" to leave space for data from the previous buffer
idx_t read_offset = options.maximum_object_size;
if (scan_state.read_size > 0) {
auto &file_handle = GetFileHandle();
{
lock_guard<mutex> reader_guard(lock);
auto &raw_handle = file_handle.GetHandle();
// For non-on-disk files, we create a handle per thread: this is faster for e.g. S3Filesystem, where
// throttling can occur per TCP connection, meaning multiple connections improve throughput.
if (!raw_handle.OnDiskFile() && raw_handle.CanSeek()) {
if (!scan_state.thread_local_filehandle ||
scan_state.thread_local_filehandle->GetPath() != raw_handle.GetPath()) {
scan_state.thread_local_filehandle = scan_state.fs.OpenFile(
raw_handle.GetPath(), FileFlags::FILE_FLAGS_READ | FileFlags::FILE_FLAGS_DIRECT_IO);
}
} else if (scan_state.thread_local_filehandle) {
scan_state.thread_local_filehandle = nullptr;
}
}
// Now read the file lock-free!
file_handle.ReadAtPosition(scan_state.buffer_ptr + read_offset, scan_state.read_size, scan_state.read_position,
scan_state.thread_local_filehandle);
}
scan_state.buffer_size = read_offset + scan_state.read_size;
scan_state.buffer_offset = read_offset;
scan_state.prev_buffer_remainder = 0;
}
bool JSONReader::ReadNextBufferNoSeek(JSONReaderScanState &scan_state) {
idx_t read_offset;
if (scan_state.file_read_type == JSONFileReadType::SCAN_ENTIRE_FILE) {
read_offset = scan_state.prev_buffer_remainder;
} else {
// we start reading at "options.maximum_object_size" to leave space for data from the previous buffer
read_offset = options.maximum_object_size;
}
idx_t request_size = scan_state.buffer_capacity - read_offset - YYJSON_PADDING_SIZE;
idx_t read_size;
if (!IsOpen()) {
return false; // Couldn't read anything
}
auto &file_handle = GetFileHandle();
if (file_handle.LastReadRequested()) {
return false;
}
scan_state.buffer_index = GetBufferIndex();
PrepareForReadInternal(scan_state);
if (!file_handle.Read(scan_state.buffer_ptr + read_offset, read_size, request_size)) {
return false; // Couldn't read anything
}
scan_state.is_last = read_size == 0;
if (scan_state.is_last) {
file_handle.Close();
}
scan_state.buffer_size = read_offset + read_size;
if (scan_state.file_read_type == JSONFileReadType::SCAN_PARTIAL) {
// if we are doing a partial read we don't have the reconstruction data yet
scan_state.buffer_offset = read_offset;
} else {
scan_state.buffer_offset = 0;
}
scan_state.needs_to_read = false;
scan_state.prev_buffer_remainder = 0;
return true;
}
} // namespace duckdb