// email-tracker/external/duckdb/extension/json/json_scan.cpp

#include "json_scan.hpp"
#include "duckdb/common/enum_util.hpp"
#include "duckdb/common/multi_file/multi_file_reader.hpp"
#include "duckdb/common/serializer/deserializer.hpp"
#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/main/extension_helper.hpp"
#include "duckdb/parallel/task_scheduler.hpp"
#include "duckdb/storage/buffer_manager.hpp"
#include "json_multi_file_info.hpp"
namespace duckdb {
JSONScanData::JSONScanData() {
}
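
// Build the candidate date/timestamp formats used for detection. Explicitly
// specified formats are registered first; with auto-detection enabled, a fixed
// set of format templates fills in any type that has no explicit format yet.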
void JSONScanData::InitializeFormats() {
    InitializeFormats(options.auto_detect);
}

void JSONScanData::InitializeFormats(bool auto_detect_p) {
    type_id_map_t<vector<StrpTimeFormat>> candidate_formats;

    // Initialize date_format_map if anything was specified
    if (!options.date_format.empty()) {
        DateFormatMap::AddFormat(candidate_formats, LogicalTypeId::DATE, options.date_format);
    }
    if (!options.timestamp_format.empty()) {
        DateFormatMap::AddFormat(candidate_formats, LogicalTypeId::TIMESTAMP, options.timestamp_format);
    }

    if (auto_detect_p) {
        static const type_id_map_t<vector<const char *>> FORMAT_TEMPLATES = {
            {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
            {LogicalTypeId::TIMESTAMP,
             {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
              "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"}},
        };

        // Populate possible date/timestamp formats, assuming this is consistent across columns
        for (auto &kv : FORMAT_TEMPLATES) {
            const auto &logical_type = kv.first;
            if (DateFormatMap::HasFormats(candidate_formats, logical_type)) {
                continue; // Already populated
            }
            const auto &format_strings = kv.second;
            for (auto &format_string : format_strings) {
                DateFormatMap::AddFormat(candidate_formats, logical_type, format_string);
            }
        }
    }

    date_format_map = make_uniq<DateFormatMap>(std::move(candidate_formats));
}
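
// Global state shared by all scanning threads. The buffer capacity is twice
// the configured maximum object size, presumably so a complete object still
// fits even when it straddles a buffer boundary. Parallel scans within a file
// are only enabled when there are fewer files than threads; with at least as
// many files as threads, per-file parallelism alone can keep all threads busy.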
JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, const MultiFileBindData &bind_data_p)
    : bind_data(bind_data_p), json_data(bind_data.bind_data->Cast<JSONScanData>()),
      transform_options(json_data.transform_options), allocator(BufferAllocator::Get(context)),
      buffer_capacity(json_data.options.maximum_object_size * 2),
      system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()),
      enable_parallel_scans(bind_data.file_list->GetTotalFileCount() < system_threads) {
}
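
// Per-thread scan state, with a buffer sized by the global buffer capacity.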
JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
    : scan_state(context, gstate.allocator, gstate.buffer_capacity) {
}

JSONGlobalTableFunctionState::JSONGlobalTableFunctionState(ClientContext &context, const MultiFileBindData &bind_data)
    : state(context, bind_data) {
}

JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate)
    : state(context, gstate) {
}
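
// Reading and parsing are forwarded to the reader currently assigned to this
// local state.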
idx_t JSONScanLocalState::Read() {
    return scan_state.current_reader->Scan(scan_state);
}

void JSONScanLocalState::ParseJSON(char *const json_start, const idx_t json_size, const idx_t remaining) {
    scan_state.current_reader->ParseJSON(scan_state, json_start, json_size, remaining);
}

bool JSONScanLocalState::TryInitializeScan(JSONScanGlobalState &gstate, JSONReader &reader) {
    // Try to initialize a scan in the given reader. There are three scenarios:
    // 1. Unseekable file - read from the file and set up the buffers
    // 2. Seekable file - get the next position to read from the file and return
    // 3. Entire-file read - if we are reading an entire file at once, do nothing here
    //    except set up the basics
    auto read_type = JSONFileReadType::SCAN_PARTIAL;
    if (!gstate.enable_parallel_scans || reader.GetFormat() != JSONFormat::NEWLINE_DELIMITED) {
        // Only newline-delimited JSON can be scanned in parallel within a file
        read_type = JSONFileReadType::SCAN_ENTIRE_FILE;
    }
    if (read_type == JSONFileReadType::SCAN_ENTIRE_FILE) {
        // An entire-file read is assigned to exactly one thread
        if (gstate.file_is_assigned) {
            return false;
        }
        gstate.file_is_assigned = true;
    }
    return reader.InitializeScan(scan_state, read_type);
}
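
// Record a transform error for the given object; the assigned reader decides
// how to report it (and, presumably, whether ignore_errors suppresses it).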
void JSONScanLocalState::AddTransformError(idx_t object_index, const string &error_message) {
    scan_state.current_reader->AddTransformError(scan_state, object_index, error_message);
}
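
// (De)serialization of JSON scans is not supported: both hooks throw.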
void JSONScan::Serialize(Serializer &serializer, const optional_ptr<FunctionData> bind_data_p, const TableFunction &) {
    throw NotImplementedException("JSONScan Serialize not implemented");
}

unique_ptr<FunctionData> JSONScan::Deserialize(Deserializer &deserializer, TableFunction &) {
    throw NotImplementedException("JSONScan Deserialize not implemented");
}
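
// Defaults shared by all JSON table functions: the common named parameters,
// the (unimplemented) serialization hooks, and the pushdown flags. Projection
// pushdown is enabled; filter pushdown and filter pruning are not.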
void JSONScan::TableFunctionDefaults(TableFunction &table_function) {
    table_function.named_parameters["maximum_object_size"] = LogicalType::UINTEGER;
    table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
    table_function.named_parameters["format"] = LogicalType::VARCHAR;
    table_function.named_parameters["compression"] = LogicalType::VARCHAR;

    table_function.serialize = Serialize;
    table_function.deserialize = Deserialize;

    table_function.projection_pushdown = true;
    table_function.filter_pushdown = false;
    table_function.filter_prune = false;
}
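
// Illustrative usage (not part of this file): the named parameters registered
// above surface as arguments on DuckDB's JSON table functions, e.g.
//
//   SELECT *
//   FROM read_json('data.json', format = 'newline_delimited',
//                  ignore_errors = true, maximum_object_size = 16777216);
//
// where 'data.json' is a placeholder file name.
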
} // namespace duckdb