Files
email-tracker/external/duckdb/extension/json/include/json_reader_options.hpp
2025-10-24 19:21:19 -05:00

130 lines
4.2 KiB
C++

//===----------------------------------------------------------------------===//
// DuckDB
//
// json_reader_options.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "json_common.hpp"
#include "json_enums.hpp"
#include "duckdb/common/types/type_map.hpp"
#include "duckdb/function/scalar/strftime_format.hpp"
namespace duckdb {
//! Holds, per logical type, the list of candidate strptime formats.
//! MutableDateFormatMap is declared a friend so that it can access the stored formats directly.
struct DateFormatMap {
	friend class MutableDateFormatMap;

public:
	//! Takes over the supplied per-type candidate formats
	explicit DateFormatMap(type_id_map_t<vector<StrpTimeFormat>> candidate_formats_p)
	    : candidate_formats(std::move(candidate_formats_p)) {
	}

	//! Returns true if this map holds an entry for "type"
	bool HasFormats(LogicalTypeId type) const {
		return candidate_formats.find(type) != candidate_formats.end();
	}

	//! Returns the last format stored for "type"; an entry for "type" must exist (asserted)
	const StrpTimeFormat &GetFormat(LogicalTypeId type) const {
		const auto entry = candidate_formats.find(type);
		D_ASSERT(entry != candidate_formats.end());
		return entry->second.back();
	}

public:
	//! Appends a new format for "type" to "candidate_formats" (creating the entry if needed)
	//! and parses "format_string" into it.
	//! NOTE(review): whatever ParseFormatSpecifier returns is not inspected here - confirm that
	//! callers only pass format strings that are known to be valid.
	static void AddFormat(type_id_map_t<vector<StrpTimeFormat>> &candidate_formats, LogicalTypeId type,
	                      const string &format_string) {
		auto &type_formats = candidate_formats[type];
		type_formats.emplace_back();
		auto &added = type_formats.back();
		added.format_specifier = format_string;
		StrpTimeFormat::ParseFormatSpecifier(added.format_specifier, added);
	}

	//! Returns true if "candidate_formats" holds an entry for "type"
	static bool HasFormats(const type_id_map_t<vector<StrpTimeFormat>> &candidate_formats, LogicalTypeId type) {
		const auto entry = candidate_formats.find(type);
		return entry != candidate_formats.end();
	}

private:
	//! Candidate formats per logical type; GetFormat returns the last element of a type's list
	type_id_map_t<vector<StrpTimeFormat>> candidate_formats;
};
//! Wrapper around a DateFormatMap (held by reference) that allows its candidate formats to be
//! inspected and pruned. Every method acquires format_lock before touching the underlying map.
class MutableDateFormatMap {
public:
	//! Wraps "date_format_map"; only a reference is stored, so the wrapped map must outlive this object
	explicit MutableDateFormatMap(DateFormatMap &date_format_map) : date_format_map(date_format_map) {
	}

	//! Returns true if the wrapped map holds an entry for "type"
	bool HasFormats(LogicalTypeId type) {
		lock_guard<mutex> lock(format_lock);
		return date_format_map.HasFormats(type);
	}

	//! Returns the number of candidate formats stored for "type".
	//! Looks the type up with at(), so an entry for "type" is expected to exist.
	idx_t NumberOfFormats(LogicalTypeId type) {
		lock_guard<mutex> lock(format_lock);
		return date_format_map.candidate_formats.at(type).size();
	}

	//! Copies the candidate format at position "index" for "type" into "format".
	//! Returns false (leaving "format" untouched) if "index" is out of range.
	//! Looks the type up with at(), so an entry for "type" is expected to exist.
	bool GetFormatAtIndex(LogicalTypeId type, idx_t index, StrpTimeFormat &format) {
		lock_guard<mutex> lock(format_lock);
		auto &formats = date_format_map.candidate_formats.at(type);
		if (index >= formats.size()) {
			return false;
		}
		format = formats[index];
		return true;
	}

	//! Drops candidate formats from the back of the list for "type" until at most "size" remain.
	//! A type without an entry is left alone: looking it up with operator[] would insert an empty
	//! list, after which HasFormats(type) would report true even though no format is available.
	void ShrinkFormatsToSize(LogicalTypeId type, idx_t size) {
		lock_guard<mutex> lock(format_lock);
		auto &candidates = date_format_map.candidate_formats;
		auto entry = candidates.find(type);
		if (entry == candidates.end()) {
			return;
		}
		auto &formats = entry->second;
		while (formats.size() > size) {
			formats.pop_back();
		}
	}

private:
	//! Held for the duration of every member function above
	mutex format_lock;
	//! The wrapped map (not owned)
	DateFormatMap &date_format_map;
};
//! Options controlling how JSON input is read and how its schema is detected
struct JSONReaderOptions {
	//! Which kind of JSON scan is performed
	JSONScanType type = JSONScanType::READ_JSON;
	//! Format of the JSON input (auto-detected by default)
	JSONFormat format = JSONFormat::AUTO_DETECT;
	//! Record type of the JSON input (auto-detected by default)
	JSONRecordType record_type = JSONRecordType::AUTO_DETECT;
	//! Compression of the input file, if any (auto-detected by default)
	FileCompressionType compression = FileCompressionType::AUTO_DETECT;
	//! If set, malformed JSON is ignored (yielding NULL) instead of being treated as an error
	bool ignore_errors = false;
	//! Maximum JSON object size (default is 16 * 1024 * 1024 = 16777216, i.e. 16MB)
	idx_t maximum_object_size = idx_t(16) * 1024 * 1024;
	//! Whether the schema is auto-detected
	bool auto_detect = false;
	//! Number of rows sampled for schema detection (default is ten standard vectors)
	idx_t sample_size = static_cast<idx_t>(STANDARD_VECTOR_SIZE) * 10;
	//! Maximum nesting depth explored during schema detection (default is effectively unlimited)
	idx_t max_depth = NumericLimits<idx_t>::Maximum();
	//! The number of appearances of each JSON field is divided by the auto-detection sample size.
	//! If the average over the fields of an object falls below this threshold,
	//! the object defaults to a MAP type whose value type is the merged field types
	double field_appearance_threshold = 0.1;
	//! Upper bound on the number of files sampled to collect sample_size rows
	idx_t maximum_sample_files = 32;
	//! Whether JSON strings are auto-detected and converted to integers
	bool convert_strings_to_integers = false;
	//! A struct with more fields than this threshold, at least 80% of which have similar types,
	//! is inferred as a MAP type
	idx_t map_inference_threshold = 200;
	//! Column names supplied by the user (in order)
	vector<string> name_list;
	//! Column types supplied by the user (in order)
	vector<LogicalType> sql_type_list;
	//! Forced date format
	string date_format;
	//! Forced timestamp format
	string timestamp_format;
};
} // namespace duckdb