#include "json_multi_file_info.hpp" #include "json_scan.hpp" #include "duckdb/common/types/value.hpp" namespace duckdb { unique_ptr JSONMultiFileInfo::CreateInterface(ClientContext &context) { return make_uniq(); } unique_ptr JSONMultiFileInfo::InitializeOptions(ClientContext &context, optional_ptr info) { auto reader_options = make_uniq(); auto &options = reader_options->options; if (info) { auto &scan_info = info->Cast(); options.type = scan_info.type; options.format = scan_info.format; options.record_type = scan_info.record_type; options.auto_detect = scan_info.auto_detect; if (scan_info.type == JSONScanType::READ_JSON_OBJECTS) { // read_json_objects always emits a single JSON column called "json" options.sql_type_list.push_back(LogicalType::JSON()); options.name_list.emplace_back("json"); } } else { // COPY options.type = JSONScanType::READ_JSON; options.record_type = JSONRecordType::RECORDS; options.format = JSONFormat::AUTO_DETECT; options.auto_detect = false; } return std::move(reader_options); } bool JSONMultiFileInfo::ParseOption(ClientContext &context, const string &key, const Value &value, MultiFileOptions &, BaseFileReaderOptions &options_p) { auto &reader_options = options_p.Cast(); auto &options = reader_options.options; if (value.IsNull()) { throw BinderException("Cannot use NULL as argument to key %s", key); } auto loption = StringUtil::Lower(key); if (loption == "ignore_errors") { options.ignore_errors = BooleanValue::Get(value); return true; } if (loption == "maximum_object_size") { options.maximum_object_size = MaxValue(UIntegerValue::Get(value), options.maximum_object_size); return true; } if (loption == "format") { auto arg = StringUtil::Lower(StringValue::Get(value)); static const auto FORMAT_OPTIONS = case_insensitive_map_t {{"auto", JSONFormat::AUTO_DETECT}, {"unstructured", JSONFormat::UNSTRUCTURED}, {"newline_delimited", JSONFormat::NEWLINE_DELIMITED}, {"nd", JSONFormat::NEWLINE_DELIMITED}, {"array", JSONFormat::ARRAY}}; auto lookup = FORMAT_OPTIONS.find(arg); if (lookup == FORMAT_OPTIONS.end()) { vector valid_options; for (auto &pair : FORMAT_OPTIONS) { valid_options.push_back(StringUtil::Format("'%s'", pair.first)); } throw BinderException("format must be one of [%s], not '%s'", StringUtil::Join(valid_options, ", "), arg); } options.format = lookup->second; return true; } if (loption == "compression") { options.compression = EnumUtil::FromString(StringUtil::Upper(StringValue::Get(value))); return true; } if (loption == "columns") { auto &child_type = value.type(); if (child_type.id() != LogicalTypeId::STRUCT) { throw BinderException("read_json \"columns\" parameter requires a struct as input."); } auto &struct_children = StructValue::GetChildren(value); D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size()); for (idx_t i = 0; i < struct_children.size(); i++) { auto &name = StructType::GetChildName(child_type, i); auto &val = struct_children[i]; if (val.IsNull()) { throw BinderException("read_json \"columns\" parameter type specification cannot be NULL."); } options.name_list.push_back(name); if (val.type().id() != LogicalTypeId::VARCHAR) { throw BinderException("read_json \"columns\" parameter type specification must be VARCHAR."); } options.sql_type_list.emplace_back(TransformStringToLogicalType(StringValue::Get(val), context)); } D_ASSERT(options.name_list.size() == options.sql_type_list.size()); if (options.name_list.empty()) { throw BinderException("read_json \"columns\" parameter needs at least one column."); } return true; } if (loption 
== "auto_detect") { options.auto_detect = BooleanValue::Get(value); return true; } if (loption == "sample_size") { auto arg = BigIntValue::Get(value); if (arg == -1) { options.sample_size = NumericLimits::Maximum(); } else if (arg > 0) { options.sample_size = arg; } else { throw BinderException("read_json \"sample_size\" parameter must be positive, or -1 to sample all input " "files entirely, up to \"maximum_sample_files\" files."); } return true; } if (loption == "maximum_depth") { auto arg = BigIntValue::Get(value); if (arg == -1) { options.max_depth = NumericLimits::Maximum(); } else { options.max_depth = arg; } return true; } if (loption == "field_appearance_threshold") { auto arg = DoubleValue::Get(value); if (arg < 0 || arg > 1) { throw BinderException("read_json_auto \"field_appearance_threshold\" parameter must be between 0 and 1"); } options.field_appearance_threshold = arg; return true; } if (loption == "map_inference_threshold") { auto arg = BigIntValue::Get(value); if (arg == -1) { options.map_inference_threshold = NumericLimits::Maximum(); } else if (arg >= 0) { options.map_inference_threshold = arg; } else { throw BinderException("read_json_auto \"map_inference_threshold\" parameter must be 0 or positive, " "or -1 to disable map inference for consistent objects."); } return true; } if (loption == "dateformat" || loption == "date_format") { auto format_string = StringValue::Get(value); if (StringUtil::Lower(format_string) == "iso") { format_string = "%Y-%m-%d"; } options.date_format = format_string; StrpTimeFormat format; auto error = StrTimeFormat::ParseFormatSpecifier(format_string, format); if (!error.empty()) { throw BinderException("read_json could not parse \"dateformat\": '%s'.", error.c_str()); } return true; } if (loption == "timestampformat" || loption == "timestamp_format") { auto format_string = StringValue::Get(value); if (StringUtil::Lower(format_string) == "iso") { format_string = "%Y-%m-%dT%H:%M:%S.%fZ"; } options.timestamp_format = format_string; StrpTimeFormat format; auto error = StrTimeFormat::ParseFormatSpecifier(format_string, format); if (!error.empty()) { throw BinderException("read_json could not parse \"timestampformat\": '%s'.", error.c_str()); } return true; } if (loption == "records") { auto arg = StringValue::Get(value); if (arg == "auto") { options.record_type = JSONRecordType::AUTO_DETECT; } else if (arg == "true") { options.record_type = JSONRecordType::RECORDS; } else if (arg == "false") { options.record_type = JSONRecordType::VALUES; } else { throw BinderException("read_json requires \"records\" to be one of ['auto', 'true', 'false']."); } return true; } if (loption == "maximum_sample_files") { auto arg = BigIntValue::Get(value); if (arg == -1) { options.maximum_sample_files = NumericLimits::Maximum(); } else if (arg > 0) { options.maximum_sample_files = arg; } else { throw BinderException("read_json \"maximum_sample_files\" parameter must be positive, or -1 to remove " "the limit on the number of files used to sample \"sample_size\" rows."); } return true; } if (loption == "convert_strings_to_integers") { options.convert_strings_to_integers = BooleanValue::Get(value); return true; } return false; } static void JSONCheckSingleParameter(const string &key, const vector &values) { if (values.size() == 1) { return; } throw BinderException("COPY (FORMAT JSON) parameter %s expects a single argument.", key); } bool JSONMultiFileInfo::ParseCopyOption(ClientContext &context, const string &key, const vector &values, BaseFileReaderOptions &options_p, 
bool JSONMultiFileInfo::ParseCopyOption(ClientContext &context, const string &key, const vector<Value> &values,
                                        BaseFileReaderOptions &options_p, vector<string> &expected_names,
                                        vector<LogicalType> &expected_types) {
    auto &reader_options = options_p.Cast<JSONFileReaderOptions>();
    auto &options = reader_options.options;
    const auto &loption = StringUtil::Lower(key);
    if (loption == "dateformat" || loption == "date_format") {
        JSONCheckSingleParameter(key, values);
        options.date_format = StringValue::Get(values.back());
        return true;
    }
    if (loption == "timestampformat" || loption == "timestamp_format") {
        JSONCheckSingleParameter(key, values);
        options.timestamp_format = StringValue::Get(values.back());
        return true;
    }
    if (loption == "auto_detect") {
        if (values.empty()) {
            options.auto_detect = true;
        } else {
            JSONCheckSingleParameter(key, values);
            options.auto_detect = BooleanValue::Get(values.back().DefaultCastAs(LogicalTypeId::BOOLEAN));
            options.format = JSONFormat::NEWLINE_DELIMITED;
        }
        return true;
    }
    if (loption == "compression") {
        JSONCheckSingleParameter(key, values);
        options.compression =
            EnumUtil::FromString<FileCompressionType>(StringUtil::Upper(StringValue::Get(values.back())));
        return true;
    }
    if (loption == "array") {
        if (values.empty()) {
            options.format = JSONFormat::ARRAY;
        } else {
            JSONCheckSingleParameter(key, values);
            if (BooleanValue::Get(values.back().DefaultCastAs(LogicalTypeId::BOOLEAN))) {
                options.format = JSONFormat::ARRAY;
            } else {
                // Default to newline-delimited otherwise
                options.format = JSONFormat::NEWLINE_DELIMITED;
            }
        }
        return true;
    }
    return false;
}

unique_ptr<TableFunctionData> JSONMultiFileInfo::InitializeBindData(MultiFileBindData &multi_file_data,
                                                                    unique_ptr<BaseFileReaderOptions> options) {
    auto &reader_options = options->Cast<JSONFileReaderOptions>();
    auto json_data = make_uniq<JSONScanData>();
    json_data->options = std::move(reader_options.options);
    return std::move(json_data);
}

void JSONMultiFileInfo::BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
                                   MultiFileBindData &bind_data) {
    auto &json_data = bind_data.bind_data->Cast<JSONScanData>();
    auto &options = json_data.options;
    names = options.name_list;
    return_types = options.sql_type_list;
    if (options.record_type == JSONRecordType::AUTO_DETECT && return_types.size() > 1) {
        // More than one specified column implies records
        options.record_type = JSONRecordType::RECORDS;
    }
    // Specifying column names overrides auto-detect
    if (!return_types.empty()) {
        options.auto_detect = false;
    }
    if (!options.auto_detect) {
        // Need to specify columns if RECORDS and not auto-detecting
        if (return_types.empty()) {
            throw BinderException("When auto_detect=false, read_json requires columns to be specified through the "
                                  "\"columns\" parameter.");
        }
        // If we are reading VALUES, we can only have one column
        if (json_data.options.record_type == JSONRecordType::VALUES && return_types.size() != 1) {
            throw BinderException("read_json requires a single column to be specified through the \"columns\" "
                                  "parameter when \"records\" is set to 'false'.");
        }
    }
    json_data.InitializeFormats();
    if (options.auto_detect || options.record_type == JSONRecordType::AUTO_DETECT) {
        JSONScan::AutoDetect(context, bind_data, return_types, names);
        D_ASSERT(return_types.size() == names.size());
    }
    json_data.key_names = names;
    bind_data.multi_file_reader->BindOptions(bind_data.file_options, *bind_data.file_list, return_types, names,
                                             bind_data.reader_bind);
    auto &transform_options = json_data.transform_options;
    transform_options.strict_cast = !options.ignore_errors;
    transform_options.error_duplicate_key = !options.ignore_errors;
    transform_options.error_missing_key = false;
    transform_options.error_unknown_key = options.auto_detect && !options.ignore_errors;
    transform_options.date_format_map = json_data.date_format_map.get();
    transform_options.delay_error = true;
    if (options.auto_detect) {
        // JSON may contain columns such as "id" and "Id", which are duplicates for us due to case-insensitivity
        // We rename them so we can parse the file anyway. Note that we can't change json_data.key_names,
        // because the JSON reader gets columns by exact name, not position
        case_insensitive_map_t<idx_t> name_collision_count;
        for (auto &col_name : names) {
            // Taken from CSV header_detection.cpp
            while (name_collision_count.find(col_name) != name_collision_count.end()) {
                name_collision_count[col_name] += 1;
                col_name = col_name + "_" + to_string(name_collision_count[col_name]);
            }
            name_collision_count[col_name] = 0;
        }
    }

    bool reuse_readers = true;
    for (auto &union_reader : bind_data.union_readers) {
        if (!union_reader || !union_reader->reader) {
            // not all readers have been initialized - don't re-use any
            reuse_readers = false;
            break;
        }
        auto &json_reader = union_reader->reader->Cast<JSONReader>();
        if (!json_reader.IsOpen()) {
            // no open file-handle - don't re-use
            reuse_readers = false;
        }
    }
    if (!reuse_readers) {
        bind_data.union_readers.clear();
    } else {
        // re-use readers
        for (auto &union_reader : bind_data.union_readers) {
            auto &json_reader = union_reader->reader->Cast<JSONReader>();
            union_reader->names = names;
            union_reader->types = return_types;
            union_reader->reader->columns = MultiFileColumnDefinition::ColumnsFromNamesAndTypes(names, return_types);
            json_reader.Reset();
        }
    }
}

void JSONMultiFileInfo::FinalizeCopyBind(ClientContext &context, BaseFileReaderOptions &options_p,
                                         const vector<string> &expected_names,
                                         const vector<LogicalType> &expected_types) {
    auto &reader_options = options_p.Cast<JSONFileReaderOptions>();
    auto &options = reader_options.options;
    options.name_list = expected_names;
    options.sql_type_list = expected_types;
    if (options.auto_detect && options.format != JSONFormat::ARRAY) {
        options.format = JSONFormat::AUTO_DETECT;
    }
}

unique_ptr<GlobalTableFunctionState> JSONMultiFileInfo::InitializeGlobalState(ClientContext &context,
                                                                              MultiFileBindData &bind_data,
                                                                              MultiFileGlobalState &global_state) {
    auto json_state = make_uniq<JSONGlobalTableFunctionState>(context, bind_data);
    auto &json_data = bind_data.bind_data->Cast<JSONScanData>();
    auto &gstate = json_state->state;

    // Perform projection pushdown
    for (idx_t col_idx = 0; col_idx < global_state.column_indexes.size(); col_idx++) {
        auto &column_index = global_state.column_indexes[col_idx];
        const auto &col_id = column_index.GetPrimaryIndex();
        // Skip any multi-file reader / row id stuff
        if (bind_data.reader_bind.filename_idx.IsValid() &&
            col_id == bind_data.reader_bind.filename_idx.GetIndex()) {
            continue;
        }
        if (IsVirtualColumn(col_id)) {
            continue;
        }
        bool skip = false;
        for (const auto &hive_partitioning_index : bind_data.reader_bind.hive_partitioning_indexes) {
            if (col_id == hive_partitioning_index.index) {
                skip = true;
                break;
            }
        }
        if (skip) {
            continue;
        }
        gstate.names.push_back(json_data.key_names[col_id]);
        gstate.column_ids.push_back(col_idx);
        gstate.column_indices.push_back(column_index);
    }
    if (gstate.names.size() < json_data.key_names.size() || bind_data.file_options.union_by_name) {
        // If we are auto-detecting, but don't need all columns present in the file,
        // then we don't need to throw an error if we encounter an unseen column
        gstate.transform_options.error_unknown_key = false;
    }
    return std::move(json_state);
}

unique_ptr<LocalTableFunctionState> JSONMultiFileInfo::InitializeLocalState(ExecutionContext &context,
                                                                            GlobalTableFunctionState &global_state) {
    auto &gstate = global_state.Cast<JSONGlobalTableFunctionState>();
    auto result = make_uniq<JSONLocalTableFunctionState>(context.client, gstate.state);

    // Copy the transform options / date format map because we need to do thread-local stuff
    result->state.transform_options = gstate.state.transform_options;
    return std::move(result);
}
double JSONReader::GetProgressInFile(ClientContext &context) {
    return GetProgress();
}

shared_ptr<BaseFileReader> JSONMultiFileInfo::CreateReader(ClientContext &context, GlobalTableFunctionState &gstate_p,
                                                           BaseUnionData &union_data,
                                                           const MultiFileBindData &bind_data_p) {
    auto &json_data = bind_data_p.bind_data->Cast<JSONScanData>();
    auto reader = make_shared_ptr<JSONReader>(context, json_data.options, union_data.GetFileName());
    reader->columns = MultiFileColumnDefinition::ColumnsFromNamesAndTypes(union_data.names, union_data.types);
    return std::move(reader);
}

shared_ptr<BaseFileReader> JSONMultiFileInfo::CreateReader(ClientContext &context, GlobalTableFunctionState &gstate_p,
                                                           const OpenFileInfo &file, idx_t file_idx,
                                                           const MultiFileBindData &bind_data) {
    auto &json_data = bind_data.bind_data->Cast<JSONScanData>();
    auto reader = make_shared_ptr<JSONReader>(context, json_data.options, file.path);
    reader->columns = MultiFileColumnDefinition::ColumnsFromNamesAndTypes(bind_data.names, bind_data.types);
    return std::move(reader);
}

void JSONReader::PrepareReader(ClientContext &context, GlobalTableFunctionState &gstate_p) {
    auto &gstate = gstate_p.Cast<JSONGlobalTableFunctionState>().state;
    if (gstate.enable_parallel_scans) {
        // if we are doing parallel scans we need to open the file here
        Initialize(gstate.allocator, gstate.buffer_capacity);
    }
}

bool JSONReader::TryInitializeScan(ClientContext &context, GlobalTableFunctionState &gstate_p,
                                   LocalTableFunctionState &lstate_p) {
    auto &gstate = gstate_p.Cast<JSONGlobalTableFunctionState>().state;
    auto &lstate = lstate_p.Cast<JSONLocalTableFunctionState>().state;
    lstate.GetScanState().ResetForNextBuffer();
    return lstate.TryInitializeScan(gstate, *this);
}

void ReadJSONFunction(ClientContext &context, JSONReader &json_reader, JSONScanGlobalState &gstate,
                      JSONScanLocalState &lstate, DataChunk &output) {
    auto &scan_state = lstate.GetScanState();
    D_ASSERT(RefersToSameObject(json_reader, *scan_state.current_reader));
    const auto count = lstate.Read();
    yyjson_val **values = scan_state.values;
    auto &column_ids = json_reader.column_ids;
    if (!gstate.names.empty()) {
        vector<Vector *> result_vectors;
        result_vectors.reserve(column_ids.size());
        for (idx_t i = 0; i < column_ids.size(); i++) {
            result_vectors.emplace_back(&output.data[i]);
        }
        D_ASSERT(gstate.json_data.options.record_type != JSONRecordType::AUTO_DETECT);
        bool success;
        if (gstate.json_data.options.record_type == JSONRecordType::RECORDS) {
            success = JSONTransform::TransformObject(values, scan_state.allocator.GetYYAlc(), count, gstate.names,
                                                     result_vectors, lstate.transform_options, gstate.column_indices,
                                                     lstate.transform_options.error_unknown_key);
        } else {
            D_ASSERT(gstate.json_data.options.record_type == JSONRecordType::VALUES);
            success = JSONTransform::Transform(values, scan_state.allocator.GetYYAlc(), *result_vectors[0], count,
                                               lstate.transform_options, gstate.column_indices[0]);
        }
        if (!success) {
            string hint =
                gstate.json_data.options.auto_detect
                    ? "\nTry increasing 'sample_size', reducing 'maximum_depth', specifying 'columns', 'format' or "
                      "'records' manually, setting 'ignore_errors' to true, or setting 'union_by_name' to true when "
                      "reading multiple files with a different structure."
                    : "\nTry setting 'auto_detect' to true, specifying 'format' or 'records' manually, or setting "
                      "'ignore_errors' to true.";
: "\nTry setting 'auto_detect' to true, specifying 'format' or 'records' manually, or setting " "'ignore_errors' to true."; lstate.AddTransformError(lstate.transform_options.object_index, lstate.transform_options.error_message + hint); return; } } output.SetCardinality(count); } void ReadJSONObjectsFunction(ClientContext &context, JSONReader &json_reader, JSONScanGlobalState &gstate, JSONScanLocalState &lstate, DataChunk &output) { // Fetch next lines auto &scan_state = lstate.GetScanState(); D_ASSERT(RefersToSameObject(json_reader, *scan_state.current_reader)); const auto count = lstate.Read(); const auto units = scan_state.units; const auto objects = scan_state.values; if (!gstate.names.empty()) { // Create the strings without copying them auto strings = FlatVector::GetData(output.data[0]); auto &validity = FlatVector::Validity(output.data[0]); for (idx_t i = 0; i < count; i++) { if (objects[i]) { strings[i] = string_t(units[i].pointer, units[i].size); } else { validity.SetInvalid(i); } } } output.SetCardinality(count); } void JSONReader::Scan(ClientContext &context, GlobalTableFunctionState &global_state, LocalTableFunctionState &local_state, DataChunk &output) { auto &gstate = global_state.Cast().state; auto &lstate = local_state.Cast().state; auto &json_data = gstate.bind_data.bind_data->Cast(); switch (json_data.options.type) { case JSONScanType::READ_JSON: ReadJSONFunction(context, *this, gstate, lstate, output); break; case JSONScanType::READ_JSON_OBJECTS: ReadJSONObjectsFunction(context, *this, gstate, lstate, output); break; default: throw InternalException("Unsupported scan type for JSONMultiFileInfo::Scan"); } } void JSONReader::FinishFile(ClientContext &context, GlobalTableFunctionState &global_state) { auto &gstate = global_state.Cast().state; gstate.file_is_assigned = false; } void JSONMultiFileInfo::FinishReading(ClientContext &context, GlobalTableFunctionState &global_state, LocalTableFunctionState &local_state) { auto &lstate = local_state.Cast().state; lstate.GetScanState().ResetForNextBuffer(); } unique_ptr JSONMultiFileInfo::GetCardinality(const MultiFileBindData &bind_data, idx_t file_count) { auto &json_data = bind_data.bind_data->Cast(); idx_t per_file_cardinality = 42; // get the average per-file cardinality from the bind data (if it is set) if (json_data.estimated_cardinality_per_file.IsValid()) { per_file_cardinality = json_data.estimated_cardinality_per_file.GetIndex(); } return make_uniq(per_file_cardinality * file_count); } optional_idx JSONMultiFileInfo::MaxThreads(const MultiFileBindData &bind_data, const MultiFileGlobalState &global_state, FileExpandResult expand_result) { if (expand_result == FileExpandResult::MULTIPLE_FILES) { return optional_idx(); } // get the max threads from the bind data (if it is set) auto &json_data = bind_data.bind_data->Cast(); return json_data.max_threads; } FileGlobInput JSONMultiFileInfo::GetGlobInput() { return FileGlobInput(FileGlobOptions::FALLBACK_GLOB, "json"); } } // namespace duckdb