should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

View File

@@ -0,0 +1,302 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// json_reader.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb/common/atomic.hpp"
#include "duckdb/common/enum_util.hpp"
#include "duckdb/common/enums/file_compression_type.hpp"
#include "duckdb/common/file_system.hpp"
#include "duckdb/common/multi_file/base_file_reader.hpp"
#include "duckdb/common/multi_file/multi_file_reader.hpp"
#include "json_reader_options.hpp"
#include "duckdb/common/mutex.hpp"
#include "json_common.hpp"
#include "json_enums.hpp"
namespace duckdb {
struct JSONScanGlobalState;
class JSONReader;
//! Bookkeeping for a single buffer of file data that belongs to a JSONReader
struct JSONBufferHandle {
	JSONBufferHandle(JSONReader &reader, idx_t buffer_index, idx_t readers, AllocatedData &&buffer, idx_t buffer_size,
	                 idx_t buffer_start);

	//! Reader that this buffer originates from
	JSONReader &reader;
	//! Index of this buffer within its file
	const idx_t buffer_index;
	//! Count of readers for this buffer (atomic)
	atomic<idx_t> readers;
	//! The buffer allocation itself
	AllocatedData buffer;
	//! Size of the data held in the buffer - may be smaller than buffer.GetSize()
	const idx_t buffer_size;
	//! Start position inside the buffer
	idx_t buffer_start;
};
//! Wrapper around the FileHandle of a JSON file, plus the bookkeeping for reads issued against it
struct JSONFileHandle {
	JSONFileHandle(QueryContext context, unique_ptr<FileHandle> file_handle, Allocator &allocator);

	bool IsOpen() const;
	void Close();
	void Reset();
	bool RequestedReadsComplete();
	bool LastReadRequested() const;

	idx_t FileSize() const;
	idx_t Remaining() const;

	bool CanSeek() const;
	bool IsPipe() const;

	FileHandle &GetHandle();

	//! Both of the following report whether the read was successful
	bool GetPositionAndSize(idx_t &position, idx_t &size, idx_t requested_size);
	bool Read(char *pointer, idx_t &read_size, idx_t requested_size);
	//! Reads through override_handle when one is passed in; without it the default handle is used
	void ReadAtPosition(char *pointer, idx_t size, idx_t position, optional_ptr<FileHandle> override_handle = nullptr);

private:
	idx_t ReadInternal(char *pointer, const idx_t requested_size);
	idx_t ReadFromCache(char *&pointer, idx_t &size, atomic<idx_t> &position);

	QueryContext context;
	//! Handle of the JSON file
	unique_ptr<FileHandle> file_handle;
	Allocator &allocator;

	//! Properties of the file
	const bool can_seek;
	const idx_t file_size;

	//! Properties of the reads
	atomic<idx_t> read_position;
	atomic<idx_t> requested_reads;
	atomic<idx_t> actual_reads;
	atomic<bool> last_read_requested;

	//! Buffers that are cached so a stream can be reset while reading
	vector<AllocatedData> cached_buffers;
	idx_t cached_size;
};
//! Non-owning (pointer, size) view of a run of characters
struct JSONString {
public:
	//! Creates an empty view: null pointer, zero size.
	//! Both members are set explicitly so a default-constructed JSONString never holds indeterminate values.
	JSONString() : pointer(nullptr), size(0) {
	}
	//! Creates a view over size_p characters starting at pointer_p (the characters are not copied)
	JSONString(const char *pointer_p, idx_t size_p) : pointer(pointer_p), size(size_p) {
	}

	//! First character of the view
	const char *pointer;
	//! Number of characters in the view
	idx_t size;

public:
	//! Copies the viewed characters into a new string.
	//! Declared const because it only reads the members, so it can be called on const instances too.
	string ToString() const {
		return string(pointer, size);
	}

	//! Unchecked access to the i-th character of the view
	const char &operator[](size_t i) const {
		return pointer[i];
	}
};
//! Selects whether a scan covers the entire file or only part of it
enum class JSONFileReadType {
	//! Scan the entire file
	SCAN_ENTIRE_FILE,
	//! Scan part of the file
	SCAN_PARTIAL
};
//! State used while reading and parsing buffers of a JSON file.
//! Holds a thread-local allocator and, optionally, a thread-local file handle.
struct JSONReaderScanState {
	explicit JSONReaderScanState(ClientContext &context, Allocator &global_allocator,
	                             idx_t reconstruct_buffer_capacity);

	FileSystem &fs;
	Allocator &global_allocator;
	//! Thread-local allocator
	JSONAllocator allocator;
	idx_t buffer_capacity;
	bool initialized = false;
	//! If we already have a buffer, this is its buffer index
	optional_idx buffer_index;
	//! Whether or not we are scanning the entire file
	//! If we are scanning the entire file we don't share reads between threads and just read the file until we are done
	JSONFileReadType file_read_type = JSONFileReadType::SCAN_PARTIAL;

	//! Data for reading (if we have postponed reading)
	//! Buffer (if we have one)
	AllocatedData read_buffer;
	bool needs_to_read = false;
	//! Parameters of the postponed read.
	//! Zero-initialised like the other scalar members so they are never indeterminate before a read is set up.
	idx_t request_size = 0;
	idx_t read_position = 0;
	idx_t read_size = 0;

	//! Current scan data
	idx_t scan_count = 0;
	//! NOTE(review): presumably only the first scan_count entries of units/values are meaningful - confirm
	JSONString units[STANDARD_VECTOR_SIZE];
	yyjson_val *values[STANDARD_VECTOR_SIZE];
	optional_ptr<JSONBufferHandle> current_buffer_handle;

	//! Current buffer read info
	optional_ptr<JSONReader> current_reader;
	char *buffer_ptr = nullptr;
	idx_t buffer_size = 0;
	idx_t buffer_offset = 0;
	idx_t prev_buffer_remainder = 0;
	idx_t prev_buffer_offset = 0;
	idx_t lines_or_objects_in_buffer = 0;
	//! Whether this is the first time scanning this buffer
	bool is_first_scan = false;
	//! Whether this is the last batch of the file
	bool is_last = false;
	//! Batch index (unset by default).
	//! NOTE(review): the previous comment here described a buffer for reconstructing split values, which does not
	//! match this member
	optional_idx batch_index;

	//! For some filesystems (e.g. S3), using a filehandle per thread increases performance
	unique_ptr<FileHandle> thread_local_filehandle;

public:
	//! Reset for parsing the next batch of JSON from the current buffer
	void ResetForNextParse();
	//! Reset state for reading the next buffer
	void ResetForNextBuffer();
	//! Clear the buffer handle (if any)
	void ClearBufferHandle();
};
//! Description of one error: the buffer and line/object it refers to, plus the message
struct JSONError {
	//! Index of the buffer the error refers to
	idx_t buf_index;
	//! Line or object within that buffer
	idx_t line_or_object_in_buf;
	//! The error message
	string error_msg;
};
//! BaseFileReader implementation for JSON files.
//! Owns the JSONFileHandle, the buffers currently held for the file, and the first error found (if any).
//! Several members document that they need `lock` to be held or that they grab it themselves; see the notes below.
class JSONReader : public BaseFileReader {
public:
	JSONReader(ClientContext &context, JSONReaderOptions options, OpenFileInfo file);

	void OpenJSONFile();
	void CloseHandle();
	void Reset();
	bool HasFileHandle() const;
	bool IsOpen() const;
	//! Returns the `initialized` flag
	bool IsInitialized() const {
		return initialized;
	}

	JSONReaderOptions &GetOptions();

	JSONFormat GetFormat() const;
	void SetFormat(JSONFormat format);

	JSONRecordType GetRecordType() const;
	void SetRecordType(JSONRecordType type);

	const string &GetFileName() const;
	JSONFileHandle &GetFileHandle() const;

public:
	//! BaseFileReader interface
	string GetReaderType() const override {
		return "JSON";
	}

	void PrepareReader(ClientContext &context, GlobalTableFunctionState &) override;
	bool TryInitializeScan(ClientContext &context, GlobalTableFunctionState &gstate,
	                       LocalTableFunctionState &lstate) override;
	void Scan(ClientContext &context, GlobalTableFunctionState &global_state, LocalTableFunctionState &local_state,
	          DataChunk &chunk) override;
	void FinishFile(ClientContext &context, GlobalTableFunctionState &gstate_p) override;
	double GetProgressInFile(ClientContext &context) override;

public:
	//! Get a new buffer index (must hold the lock)
	idx_t GetBufferIndex();
	//! Set line count for a buffer that is done (grabs the lock)
	void SetBufferLineOrObjectCount(JSONBufferHandle &handle, idx_t count);
	//! Records a parse error in the specified buffer
	void AddParseError(JSONReaderScanState &scan_state, idx_t line_or_object_in_buf, yyjson_read_err &err,
	                   const string &extra = "");
	//! Records a transform error in the specified buffer
	void AddTransformError(JSONReaderScanState &scan_state, idx_t object_index, const string &error_message);
	//! Whether this reader has thrown if an error has occurred
	bool HasThrown();

	void Initialize(Allocator &allocator, idx_t buffer_size);
	bool InitializeScan(JSONReaderScanState &state, JSONFileReadType file_read_type);
	void ParseJSON(JSONReaderScanState &scan_state, char *const json_start, const idx_t json_size,
	               const idx_t remaining);
	void ParseNextChunk(JSONReaderScanState &scan_state);
	idx_t Scan(JSONReaderScanState &scan_state);
	bool ReadNextBuffer(JSONReaderScanState &scan_state);
	bool PrepareBufferForRead(JSONReaderScanState &scan_state);

	//! Scan progress
	double GetProgress() const;

	void DecrementBufferUsage(JSONBufferHandle &handle, idx_t lines_or_object_in_buffer, AllocatedData &buffer);

private:
	void SkipOverArrayStart(JSONReaderScanState &scan_state);
	void AutoDetect(Allocator &allocator, idx_t buffer_size);
	bool CopyRemainderFromPreviousBuffer(JSONReaderScanState &scan_state);
	void FinalizeBufferInternal(JSONReaderScanState &scan_state, AllocatedData &buffer, idx_t buffer_index);
	void PrepareForReadInternal(JSONReaderScanState &scan_state);
	void PrepareForScan(JSONReaderScanState &scan_state);
	bool PrepareBufferSeek(JSONReaderScanState &scan_state);
	void ReadNextBufferSeek(JSONReaderScanState &scan_state);
	bool ReadNextBufferNoSeek(JSONReaderScanState &scan_state);
	void FinalizeBuffer(JSONReaderScanState &scan_state);

	//! Insert/get/remove buffer (grabs the lock)
	void InsertBuffer(idx_t buffer_idx, unique_ptr<JSONBufferHandle> &&buffer);
	optional_ptr<JSONBufferHandle> GetBuffer(idx_t buffer_idx);
	AllocatedData RemoveBuffer(JSONBufferHandle &handle);

	void ThrowObjectSizeError(const idx_t object_size);

private:
	//! Add an error to the buffer - requires the lock to be held
	void AddError(idx_t buf_index, idx_t line_or_object_in_buf, const string &error_msg);
	//! Throw errors if possible - requires the lock to be held
	void ThrowErrorsIfPossible();
	//! Try to get the line number - requires the lock to be held
	optional_idx TryGetLineNumber(idx_t buf_index, idx_t line_or_object_in_buf);

private:
	ClientContext &context;
	JSONReaderOptions options;

	//! File handle
	unique_ptr<JSONFileHandle> file_handle;

	//! Whether or not the reader has been initialized.
	//! Defaulted in-class (like auto_detect_data_size) so IsInitialized() never reads an indeterminate value.
	bool initialized = false;
	//! Next buffer index within the file
	idx_t next_buffer_index = 0;
	//! Mapping from batch index to currently held buffers
	unordered_map<idx_t, unique_ptr<JSONBufferHandle>> buffer_map;
	//! Line count per buffer
	vector<int64_t> buffer_line_or_object_counts;
	//! Whether any of the reading threads has thrown an error
	bool thrown = false;

	//! If we have auto-detected, this is the buffer read by the auto-detection
	AllocatedData auto_detect_data;
	idx_t auto_detect_data_size = 0;

	//! The first error we found in the file (if any)
	unique_ptr<JSONError> error;

public:
	//! The lock referred to by the "must hold the lock" / "grabs the lock" notes above.
	//! Mutable, so it can also be locked through a const JSONReader.
	mutable mutex lock;
};
} // namespace duckdb