Files
email-tracker/external/duckdb/extension/json/json_functions/copy_json.cpp
2025-10-24 19:21:19 -05:00

134 lines
5.8 KiB
C++

#include "duckdb/function/copy_function.hpp"
#include "duckdb/parser/expression/constant_expression.hpp"
#include "duckdb/parser/expression/function_expression.hpp"
#include "duckdb/parser/expression/positional_reference_expression.hpp"
#include "duckdb/parser/query_node/select_node.hpp"
#include "duckdb/parser/tableref/subqueryref.hpp"
#include "duckdb/planner/binder.hpp"
#include "duckdb/common/helper.hpp"
#include "json_functions.hpp"
#include "json_scan.hpp"
#include "json_transform.hpp"
#include "json_multi_file_info.hpp"
namespace duckdb {
static void ThrowJSONCopyParameterException(const string &loption) {
throw BinderException("COPY (FORMAT JSON) parameter %s expects a single argument.", loption);
}
// Plans a COPY ... TO ... (FORMAT JSON) statement by rewriting it into a CSV copy.
//
// The original query is wrapped in a subquery and replaced by a single-column select of
// to_json(struct_pack(<columns>)), which is then written by the CSV writer with quoting,
// escaping and the header disabled and a newline as delimiter.
//
// JSON-specific options handled here:
//   - dateformat / date_format:           top-level DATE columns are passed through strftime
//   - timestampformat / timestamp_format: top-level TIMESTAMP columns are passed through strftime
//   - array:                              sets the CSV prefix/suffix/new_line options to "[\n\t", "\n]\n"
//                                         and ",\n\t" when given without a value or with a true value
// Options listed in SUPPORTED_BASE_OPTIONS are forwarded to the CSV copy unchanged.
//
// Throws BinderException for unknown options and for JSON options with the wrong number of arguments.
// The passed-in statement is not modified: all rewriting happens on a copy of it.
static BoundStatement CopyToJSONPlan(Binder &binder, CopyStatement &stmt) {
	static const unordered_set<string> SUPPORTED_BASE_OPTIONS {
	    "compression", "encoding", "use_tmp_file", "overwrite_or_ignore", "overwrite", "append", "filename_pattern",
	    "file_extension", "per_thread_output", "file_size_bytes",
	    // "partition_by", unsupported
	    "return_files", "preserve_order", "return_stats", "write_partition_columns", "write_empty_file",
	    "hive_file_pattern"};

	auto stmt_copy = stmt.Copy();
	auto &copy = stmt_copy->Cast<CopyStatement>();
	auto &copied_info = *copy.info;

	// Parse the options, creating options for the CSV writer while doing so
	string date_format;
	string timestamp_format;
	// We insert the JSON file extension here so it works properly with PER_THREAD_OUTPUT/FILE_SIZE_BYTES etc.
	case_insensitive_map_t<vector<Value>> csv_copy_options {{"file_extension", {"json"}}};
	for (const auto &kv : copied_info.options) {
		const auto loption = StringUtil::Lower(kv.first);
		if (loption == "dateformat" || loption == "date_format") {
			if (kv.second.size() != 1) {
				ThrowJSONCopyParameterException(loption);
			}
			date_format = StringValue::Get(kv.second.back());
		} else if (loption == "timestampformat" || loption == "timestamp_format") {
			if (kv.second.size() != 1) {
				ThrowJSONCopyParameterException(loption);
			}
			timestamp_format = StringValue::Get(kv.second.back());
		} else if (loption == "array") {
			if (kv.second.size() > 1) {
				ThrowJSONCopyParameterException(loption);
			}
			// ARRAY without a value counts as enabled
			if (kv.second.empty() || BooleanValue::Get(kv.second.back().DefaultCastAs(LogicalTypeId::BOOLEAN))) {
				csv_copy_options["prefix"] = {"[\n\t"};
				csv_copy_options["suffix"] = {"\n]\n"};
				csv_copy_options["new_line"] = {",\n\t"};
			}
		} else if (SUPPORTED_BASE_OPTIONS.find(loption) != SUPPORTED_BASE_OPTIONS.end()) {
			// We support these base options.
			// Assign rather than insert(): insert() keeps an already-present entry, which would
			// silently discard a user-supplied FILE_EXTENSION in favor of the "json" default above.
			csv_copy_options[kv.first] = kv.second;
		} else {
			throw BinderException("Unknown option for COPY ... TO ... (FORMAT JSON): \"%s\".", loption);
		}
	}

	// Bind the select statement of the original to resolve the types
	auto dummy_binder = Binder::CreateBinder(binder.context, &binder);
	auto bound_original = dummy_binder->Bind(*stmt.info->select_statement);

	// Create new SelectNode with the original SelectNode as a subquery in the FROM clause
	auto select_stmt = make_uniq<SelectStatement>();
	select_stmt->node = std::move(copied_info.select_statement);
	auto subquery_ref = make_uniq<SubqueryRef>(std::move(select_stmt));

	copied_info.select_statement = make_uniq_base<QueryNode, SelectNode>();
	auto &select_node = copied_info.select_statement->Cast<SelectNode>();
	select_node.from_table = std::move(subquery_ref);

	// Create new select list
	vector<unique_ptr<ParsedExpression>> select_list;
	select_list.reserve(bound_original.types.size());

	// strftime if the user specified a format (loop also gives columns a name, needed for struct_pack)
	// TODO: deal with date/timestamp within nested types
	for (idx_t col_idx = 0; col_idx < bound_original.types.size(); col_idx++) {
		// Positional references are 1-based
		auto column = make_uniq_base<ParsedExpression, PositionalReferenceExpression>(col_idx + 1);
		const auto &type = bound_original.types[col_idx];
		const auto &name = bound_original.names[col_idx];
		if (!date_format.empty() && type == LogicalTypeId::DATE) {
			vector<unique_ptr<ParsedExpression>> strftime_children;
			strftime_children.emplace_back(std::move(column));
			strftime_children.emplace_back(make_uniq<ConstantExpression>(date_format));
			column = make_uniq<FunctionExpression>("strftime", std::move(strftime_children));
		} else if (!timestamp_format.empty() && type == LogicalTypeId::TIMESTAMP) {
			vector<unique_ptr<ParsedExpression>> strftime_children;
			strftime_children.emplace_back(std::move(column));
			strftime_children.emplace_back(make_uniq<ConstantExpression>(timestamp_format));
			column = make_uniq<FunctionExpression>("strftime", std::move(strftime_children));
		}
		column->SetAlias(name);
		select_list.emplace_back(std::move(column));
	}

	// Now create the struct_pack/to_json to create a JSON object per row
	vector<unique_ptr<ParsedExpression>> struct_pack_child;
	struct_pack_child.emplace_back(make_uniq<FunctionExpression>("struct_pack", std::move(select_list)));
	select_node.select_list.emplace_back(make_uniq<FunctionExpression>("to_json", std::move(struct_pack_child)));

	// Now we can just use the CSV writer
	copied_info.format = "csv";
	copied_info.options = std::move(csv_copy_options);
	// These are set after the user options so they always apply
	copied_info.options["quote"] = {""};
	copied_info.options["escape"] = {""};
	copied_info.options["delimiter"] = {"\n"};
	copied_info.options["header"] = {{0}};

	return binder.Bind(*stmt_copy);
}
// Builds the CopyFunction registered for the "json" format.
// Writing is planned through CopyToJSONPlan; reading uses the multi-file JSON bind together with
// the read_json table function configured for auto-detected format and record-style input.
CopyFunction JSONFunctions::GetJSONCopyFunction() {
	// Scan configuration used by the COPY ... FROM side
	auto scan_info = make_shared_ptr<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT,
	                                               JSONRecordType::RECORDS, false);

	CopyFunction json_copy("json");
	json_copy.extension = "json";

	// COPY ... TO
	json_copy.plan = CopyToJSONPlan;

	// COPY ... FROM
	json_copy.copy_from_bind = MultiFileFunction<JSONMultiFileInfo>::MultiFileBindCopy;
	json_copy.copy_from_function = JSONFunctions::GetReadJSONTableFunction(std::move(scan_info));

	return json_copy;
}
} // namespace duckdb