#include "dbgen/dbgen.hpp" #include "dbgen/dbgen_gunk.hpp" #include "tpch_constants.hpp" #include "duckdb/common/exception.hpp" #include "duckdb/common/types/date.hpp" #include "duckdb/parser/column_definition.hpp" #include "duckdb/parser/parsed_data/create_table_info.hpp" #include "duckdb/parser/constraints/not_null_constraint.hpp" #include "duckdb/catalog/catalog.hpp" #include "duckdb/main/appender.hpp" #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" #ifndef DUCKDB_NO_THREADS #include "duckdb/common/thread.hpp" #endif #define DECLARER /* EXTERN references get defined here */ #include "dbgen/dss.h" #include "dbgen/dsstypes.h" #include #include #include using namespace duckdb; namespace tpch { struct tpch_append_information { duckdb::unique_ptr appender; }; void append_int32(tpch_append_information &info, int32_t value) { info.appender->Append(value); } void append_int64(tpch_append_information &info, int64_t value) { info.appender->Append(value); } void append_string(tpch_append_information &info, const char *value) { info.appender->Append(value); } void append_decimal(tpch_append_information &info, int64_t value) { info.appender->Append(value); } void append_date(tpch_append_information &info, string value) { info.appender->Append(Date::FromString(value)); } void append_char(tpch_append_information &info, char value) { char val[2]; val[0] = value; val[1] = '\0'; append_string(info, val); } static void append_order(order_t *o, tpch_append_information *info) { auto &append_info = info[ORDER]; // fill the current row with the order information append_info.appender->BeginRow(); // o_orderkey append_int64(append_info, o->okey); // o_custkey append_int64(append_info, o->custkey); // o_orderstatus append_char(append_info, o->orderstatus); // o_totalprice append_decimal(append_info, o->totalprice); // o_orderdate append_date(append_info, o->odate); // o_orderpriority append_string(append_info, o->opriority); // o_clerk append_string(append_info, o->clerk); // o_shippriority append_int32(append_info, o->spriority); // o_comment append_string(append_info, o->comment); append_info.appender->EndRow(); } static void append_line(order_t *o, tpch_append_information *info) { auto &append_info = info[LINE]; // fill the current row with the order information for (DSS_HUGE i = 0; i < o->lines; i++) { append_info.appender->BeginRow(); // l_orderkey append_int64(append_info, o->l[i].okey); // l_partkey append_int64(append_info, o->l[i].partkey); // l_suppkey append_int64(append_info, o->l[i].suppkey); // l_linenumber append_int64(append_info, o->l[i].lcnt); // l_quantity append_decimal(append_info, o->l[i].quantity); // l_extendedprice append_decimal(append_info, o->l[i].eprice); // l_discount append_decimal(append_info, o->l[i].discount); // l_tax append_decimal(append_info, o->l[i].tax); // l_returnflag append_char(append_info, o->l[i].rflag[0]); // l_linestatus append_char(append_info, o->l[i].lstatus[0]); // l_shipdate append_date(append_info, o->l[i].sdate); // l_commitdate append_date(append_info, o->l[i].cdate); // l_receiptdate append_date(append_info, o->l[i].rdate); // l_shipinstruct append_string(append_info, o->l[i].shipinstruct); // l_shipmode append_string(append_info, o->l[i].shipmode); // l_comment append_string(append_info, o->l[i].comment); append_info.appender->EndRow(); } } static void append_order_line(order_t *o, tpch_append_information *info) { append_order(o, info); append_line(o, info); } static void append_supp(supplier_t *supp, tpch_append_information *info) { auto &append_info = info[SUPP]; append_info.appender->BeginRow(); // s_suppkey append_int64(append_info, supp->suppkey); // s_name append_string(append_info, supp->name); // s_address append_string(append_info, supp->address); // s_nationkey append_int32(append_info, supp->nation_code); // s_phone append_string(append_info, supp->phone); // s_acctbal append_decimal(append_info, supp->acctbal); // s_comment append_string(append_info, supp->comment); append_info.appender->EndRow(); } static void append_cust(customer_t *c, tpch_append_information *info) { auto &append_info = info[CUST]; append_info.appender->BeginRow(); // c_custkey append_int64(append_info, c->custkey); // c_name append_string(append_info, c->name); // c_address append_string(append_info, c->address); // c_nationkey append_int32(append_info, c->nation_code); // c_phone append_string(append_info, c->phone); // c_acctbal append_decimal(append_info, c->acctbal); // c_mktsegment append_string(append_info, c->mktsegment); // c_comment append_string(append_info, c->comment); append_info.appender->EndRow(); } static void append_part(part_t *part, tpch_append_information *info) { auto &append_info = info[PART]; append_info.appender->BeginRow(); // p_partkey append_int64(append_info, part->partkey); // p_name append_string(append_info, part->name); // p_mfgr append_string(append_info, part->mfgr); // p_brand append_string(append_info, part->brand); // p_type append_string(append_info, part->type); // p_size append_int32(append_info, part->size); // p_container append_string(append_info, part->container); // p_retailprice append_decimal(append_info, part->retailprice); // p_comment append_string(append_info, part->comment); append_info.appender->EndRow(); } static void append_psupp(part_t *part, tpch_append_information *info) { auto &append_info = info[PSUPP]; for (size_t i = 0; i < SUPP_PER_PART; i++) { append_info.appender->BeginRow(); // ps_partkey append_int64(append_info, part->s[i].partkey); // ps_suppkey append_int64(append_info, part->s[i].suppkey); // ps_availqty append_int64(append_info, part->s[i].qty); // ps_supplycost append_decimal(append_info, part->s[i].scost); // ps_comment append_string(append_info, part->s[i].comment); append_info.appender->EndRow(); } } static void append_part_psupp(part_t *part, tpch_append_information *info) { append_part(part, info); append_psupp(part, info); } static void append_nation(code_t *c, tpch_append_information *info) { auto &append_info = info[NATION]; append_info.appender->BeginRow(); // n_nationkey append_int32(append_info, c->code); // n_name append_string(append_info, c->text); // n_regionkey append_int32(append_info, c->join); // n_comment append_string(append_info, c->comment); append_info.appender->EndRow(); } static void append_region(code_t *c, tpch_append_information *info) { auto &append_info = info[REGION]; append_info.appender->BeginRow(); // r_regionkey append_int32(append_info, c->code); // r_name append_string(append_info, c->text); // r_comment append_string(append_info, c->comment); append_info.appender->EndRow(); } static void gen_tbl(ClientContext &context, int tnum, DSS_HUGE count, tpch_append_information *info, DBGenContext *dbgen_ctx, idx_t offset = 0) { order_t o; supplier_t supp; customer_t cust; part_t part; code_t code; for (DSS_HUGE i = offset + 1; count; count--, i++) { if (count % 1000 == 0 && context.interrupted) { return; } row_start(tnum, dbgen_ctx); switch (tnum) { case LINE: case ORDER: case ORDER_LINE: mk_order(i, &o, dbgen_ctx, 0); append_order_line(&o, info); break; case SUPP: mk_supp(i, &supp, dbgen_ctx); append_supp(&supp, info); break; case CUST: mk_cust(i, &cust, dbgen_ctx); append_cust(&cust, info); break; case PSUPP: case PART: case PART_PSUPP: mk_part(i, &part, dbgen_ctx); append_part_psupp(&part, info); break; case NATION: mk_nation(i, &code, dbgen_ctx); append_nation(&code, info); break; case REGION: mk_region(i, &code, dbgen_ctx); append_region(&code, info); break; } row_stop_h(tnum, dbgen_ctx); } } string get_table_name(int num) { switch (num) { case PART: return "part"; case PSUPP: return "partsupp"; case SUPP: return "supplier"; case CUST: return "customer"; case ORDER: return "orders"; case LINE: return "lineitem"; case NATION: return "nation"; case REGION: return "region"; default: return ""; } } struct RegionInfo { static constexpr char *Name = "region"; static constexpr idx_t ColumnCount = 3; static const char *Columns[]; static const LogicalType Types[]; }; const char *RegionInfo::Columns[] = {"r_regionkey", "r_name", "r_comment"}; const LogicalType RegionInfo::Types[] = {LogicalType(LogicalTypeId::INTEGER), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::VARCHAR)}; struct NationInfo { static constexpr char *Name = "nation"; static const char *Columns[]; static constexpr idx_t ColumnCount = 4; static const LogicalType Types[]; }; const char *NationInfo::Columns[] = {"n_nationkey", "n_name", "n_regionkey", "n_comment"}; const LogicalType NationInfo::Types[] = {LogicalType(LogicalTypeId::INTEGER), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::INTEGER), LogicalType(LogicalTypeId::VARCHAR)}; struct SupplierInfo { static constexpr char *Name = "supplier"; static const char *Columns[]; static constexpr idx_t ColumnCount = 7; static const LogicalType Types[]; }; const char *SupplierInfo::Columns[] = {"s_suppkey", "s_name", "s_address", "s_nationkey", "s_phone", "s_acctbal", "s_comment"}; const LogicalType SupplierInfo::Types[] = {LogicalType(LogicalTypeId::BIGINT), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::INTEGER), LogicalType(LogicalTypeId::VARCHAR), LogicalType::DECIMAL(15, 2), LogicalType(LogicalTypeId::VARCHAR)}; struct CustomerInfo { static constexpr char *Name = "customer"; static const char *Columns[]; static constexpr idx_t ColumnCount = 8; static const LogicalType Types[]; }; const char *CustomerInfo::Columns[] = {"c_custkey", "c_name", "c_address", "c_nationkey", "c_phone", "c_acctbal", "c_mktsegment", "c_comment"}; const LogicalType CustomerInfo::Types[] = {LogicalType(LogicalTypeId::BIGINT), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::INTEGER), LogicalType(LogicalTypeId::VARCHAR), LogicalType::DECIMAL(15, 2), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::VARCHAR)}; struct PartInfo { static constexpr char *Name = "part"; static const char *Columns[]; static constexpr idx_t ColumnCount = 9; static const LogicalType Types[]; }; const char *PartInfo::Columns[] = {"p_partkey", "p_name", "p_mfgr", "p_brand", "p_type", "p_size", "p_container", "p_retailprice", "p_comment"}; const LogicalType PartInfo::Types[] = { LogicalType(LogicalTypeId::BIGINT), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::INTEGER), LogicalType(LogicalTypeId::VARCHAR), LogicalType::DECIMAL(15, 2), LogicalType(LogicalTypeId::VARCHAR)}; struct PartsuppInfo { static constexpr char *Name = "partsupp"; static const char *Columns[]; static constexpr idx_t ColumnCount = 5; static const LogicalType Types[]; }; const char *PartsuppInfo::Columns[] = {"ps_partkey", "ps_suppkey", "ps_availqty", "ps_supplycost", "ps_comment"}; const LogicalType PartsuppInfo::Types[] = {LogicalType(LogicalTypeId::BIGINT), LogicalType(LogicalTypeId::BIGINT), LogicalType(LogicalTypeId::BIGINT), LogicalType::DECIMAL(15, 2), LogicalType(LogicalTypeId::VARCHAR)}; struct OrdersInfo { static constexpr char *Name = "orders"; static const char *Columns[]; static constexpr idx_t ColumnCount = 9; static const LogicalType Types[]; }; const char *OrdersInfo::Columns[] = {"o_orderkey", "o_custkey", "o_orderstatus", "o_totalprice", "o_orderdate", "o_orderpriority", "o_clerk", "o_shippriority", "o_comment"}; const LogicalType OrdersInfo::Types[] = { LogicalType(LogicalTypeId::BIGINT), LogicalType(LogicalTypeId::BIGINT), LogicalType(LogicalTypeId::VARCHAR), LogicalType::DECIMAL(15, 2), LogicalType(LogicalTypeId::DATE), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::INTEGER), LogicalType(LogicalTypeId::VARCHAR)}; struct LineitemInfo { static constexpr char *Name = "lineitem"; static const char *Columns[]; static constexpr idx_t ColumnCount = 16; static const LogicalType Types[]; }; const char *LineitemInfo::Columns[] = {"l_orderkey", "l_partkey", "l_suppkey", "l_linenumber", "l_quantity", "l_extendedprice", "l_discount", "l_tax", "l_returnflag", "l_linestatus", "l_shipdate", "l_commitdate", "l_receiptdate", "l_shipinstruct", "l_shipmode", "l_comment"}; const LogicalType LineitemInfo::Types[] = { LogicalType(LogicalTypeId::BIGINT), LogicalType(LogicalTypeId::BIGINT), LogicalType(LogicalTypeId::BIGINT), LogicalType(LogicalTypeId::BIGINT), LogicalType::DECIMAL(15, 2), LogicalType::DECIMAL(15, 2), LogicalType::DECIMAL(15, 2), LogicalType::DECIMAL(15, 2), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::DATE), LogicalType(LogicalTypeId::DATE), LogicalType(LogicalTypeId::DATE), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::VARCHAR), LogicalType(LogicalTypeId::VARCHAR)}; template static void CreateTPCHTable(ClientContext &context, string catalog_name, string schema, string suffix) { auto info = make_uniq(); info->catalog = catalog_name; info->schema = schema; info->table = T::Name + suffix; info->on_conflict = OnCreateConflict::IGNORE_ON_CONFLICT; info->temporary = false; for (idx_t i = 0; i < T::ColumnCount; i++) { info->columns.AddColumn(ColumnDefinition(T::Columns[i], T::Types[i])); info->constraints.push_back(make_uniq(LogicalIndex(i))); } auto &catalog = Catalog::GetCatalog(context, catalog_name); catalog.CreateTable(context, std::move(info)); } void DBGenWrapper::CreateTPCHSchema(ClientContext &context, string catalog, string schema, string suffix) { CreateTPCHTable(context, catalog, schema, suffix); CreateTPCHTable(context, catalog, schema, suffix); CreateTPCHTable(context, catalog, schema, suffix); CreateTPCHTable(context, catalog, schema, suffix); CreateTPCHTable(context, catalog, schema, suffix); CreateTPCHTable(context, catalog, schema, suffix); CreateTPCHTable(context, catalog, schema, suffix); CreateTPCHTable(context, catalog, schema, suffix); } void skip(int table, int children, DSS_HUGE step, DBGenContext &dbgen_ctx) { switch (table) { case CUST: sd_cust(children, step, &dbgen_ctx); break; case SUPP: sd_supp(children, step, &dbgen_ctx); break; case NATION: sd_nation(children, step, &dbgen_ctx); break; case REGION: sd_region(children, step, &dbgen_ctx); break; case ORDER_LINE: sd_line(children, step, &dbgen_ctx); sd_order(children, step, &dbgen_ctx); break; case PART_PSUPP: sd_part(children, step, &dbgen_ctx); sd_psupp(children, step, &dbgen_ctx); break; } } struct TPCHDBgenParameters { TPCHDBgenParameters(ClientContext &context, Catalog &catalog, const string &schema, const string &suffix) { tables.resize(REGION + 1); for (size_t i = PART; i <= REGION; i++) { auto tname = get_table_name(i); if (!tname.empty()) { string full_tname = string(tname) + string(suffix); auto &tbl_catalog = catalog.GetEntry(context, schema, full_tname); tables[i] = &tbl_catalog; } } } vector> tables; }; class TPCHDataAppender { public: TPCHDataAppender(ClientContext &context, TPCHDBgenParameters ¶meters, DBGenContext base_context, idx_t flush_count) : context(context), parameters(parameters) { dbgen_ctx = base_context; append_info = duckdb::unique_ptr(new tpch_append_information[REGION + 1]); memset(append_info.get(), 0, sizeof(tpch_append_information) * REGION + 1); for (size_t i = PART; i <= REGION; i++) { if (parameters.tables[i]) { auto &tbl_catalog = *parameters.tables[i]; if (!tbl_catalog.IsDuckTable()) { throw InvalidInputException("dbgen is only supported for DuckDB database files"); } append_info[i].appender = make_uniq(context, tbl_catalog, flush_count); } } } void GenerateTableData(int table_index, idx_t row_count, idx_t offset) { gen_tbl(context, table_index, static_cast(row_count), append_info.get(), &dbgen_ctx, offset); } void AppendData(int children, int current_step) { DSS_HUGE i; DSS_HUGE rowcnt = 0; for (i = PART; i <= REGION; i++) { if (table & (1 << i)) { if (i < NATION) { rowcnt = dbgen_ctx.tdefs[i].base * dbgen_ctx.scale_factor; } else { rowcnt = dbgen_ctx.tdefs[i].base; } if (context.interrupted) { return; } if (children > 1 && current_step != -1) { size_t part_size = std::ceil((double)rowcnt / (double)children); auto part_offset = part_size * current_step; auto part_end = part_offset + part_size; rowcnt = part_end > rowcnt ? rowcnt - part_offset : part_size; skip(i, children, part_offset, dbgen_ctx); if (rowcnt > 0) { // generate part of the table GenerateTableData((int) i, rowcnt, part_offset); } } else { // generate full table GenerateTableData((int) i, rowcnt, 0); } } } } void Flush() { // flush any incomplete chunks for (idx_t i = PART; i <= REGION; i++) { if (append_info[i].appender) { append_info[i].appender->Flush(); append_info[i].appender.reset(); } } } private: ClientContext &context; TPCHDBgenParameters ¶meters; unique_ptr append_info; DBGenContext dbgen_ctx; }; static void ParallelTPCHAppend(TPCHDataAppender *appender, int children, int current_step) { appender->AppendData(children, current_step); } void DBGenWrapper::LoadTPCHData(ClientContext &context, double flt_scale, string catalog_name, string schema, string suffix, int children, int current_step) { if (flt_scale == 0) { return; } // all tables table = (1 << CUST) | (1 << SUPP) | (1 << NATION) | (1 << REGION) | (1 << PART_PSUPP) | (1 << ORDER_LINE); force = 0; insert_segments = 0; delete_segments = 0; insert_orders_segment = 0; insert_lineitem_segment = 0; delete_segment = 0; verbose = 0; set_seeds = 0; updates = 0; d_path = NULL; DBGenContext base_context; tdef *tdefs = base_context.tdefs; tdefs[PART].base = 200000; tdefs[PSUPP].base = 200000; tdefs[SUPP].base = 10000; tdefs[CUST].base = 150000; tdefs[ORDER].base = 150000 * ORDERS_PER_CUST; tdefs[LINE].base = 150000 * ORDERS_PER_CUST; tdefs[ORDER_LINE].base = 150000 * ORDERS_PER_CUST; tdefs[PART_PSUPP].base = 200000; tdefs[NATION].base = NATIONS_MAX; tdefs[REGION].base = NATIONS_MAX; if (flt_scale < MIN_SCALE) { int i; int int_scale; base_context.scale_factor = 1; int_scale = (int)(1000 * flt_scale); for (i = PART; i < REGION; i++) { tdefs[i].base = (DSS_HUGE)(int_scale * tdefs[i].base) / 1000; if (tdefs[i].base < 1) { tdefs[i].base = 1; } } } else { base_context.scale_factor = (long)flt_scale; } if (current_step >= children) { return; } load_dists(10 * 1024 * 1024, &base_context); // 10MiB /* have to do this after init */ tdefs[NATION].base = nations.count; tdefs[REGION].base = regions.count; auto &catalog = Catalog::GetCatalog(context, catalog_name); TPCHDBgenParameters parameters(context, catalog, schema, suffix); #ifndef DUCKDB_NO_THREADS bool explicit_partial_generation = children > 1 && current_step != -1; auto thread_count = TaskScheduler::GetScheduler(context).NumberOfThreads(); if (explicit_partial_generation || thread_count <= 1) { #endif // if we are doing explicit partial generation the parallelism is managed outside of dbgen // only generate the chunk we are interested in TPCHDataAppender appender(context, parameters, base_context, BaseAppender::DEFAULT_FLUSH_COUNT); appender.AppendData(children, current_step); appender.Flush(); #ifndef DUCKDB_NO_THREADS } else { // we split into 20 children per scale factor by default static constexpr idx_t CHILDREN_PER_SCALE_FACTOR = 20; idx_t child_count; if (flt_scale < 1) { child_count = 1; } else { child_count = MinValue(static_cast(CHILDREN_PER_SCALE_FACTOR * flt_scale), MAX_CHILDREN); } idx_t step = 0; vector finished_appenders; while(step < child_count) { // launch N threads vector new_appenders; vector threads; idx_t launched_step = step; // initialize the appenders for each thread // note we prevent the threads themselves from flushing the appenders by specifying a very high flush count here for(idx_t thr_idx = 0; thr_idx < thread_count && launched_step < child_count; thr_idx++, launched_step++) { new_appenders.emplace_back(context, parameters, base_context, NumericLimits::Maximum()); } // launch the threads for(idx_t thr_idx = 0; thr_idx < new_appenders.size(); thr_idx++) { threads.emplace_back(ParallelTPCHAppend, &new_appenders[thr_idx], child_count, step); step++; } ErrorData error; try { // flush the previous batch of appenders while waiting (if any are there) // now flush the appenders in-order for(auto &appender : finished_appenders) { appender.Flush(); } } catch(std::exception &ex) { error = ErrorData(ex); } finished_appenders.clear(); // wait for all threads to finish for(auto &thread : threads) { thread.join(); } if (error.HasError()) { error.Throw(); } finished_appenders = std::move(new_appenders); } // flush the final batch of appenders for(auto &appender : finished_appenders) { appender.Flush(); } } #endif cleanup_dists(); } string DBGenWrapper::GetQuery(int query) { if (query <= 0 || query > TPCH_QUERIES_COUNT) { throw SyntaxException("Out of range TPC-H query number %d", query); } return TPCH_QUERIES[query - 1]; } string DBGenWrapper::GetAnswer(double sf, int query) { if (query <= 0 || query > TPCH_QUERIES_COUNT) { throw SyntaxException("Out of range TPC-H query number %d", query); } const char *answer; if (sf == 0.01) { answer = TPCH_ANSWERS_SF0_01[query - 1]; } else if (sf == 0.1) { answer = TPCH_ANSWERS_SF0_1[query - 1]; } else if (sf == 1) { answer = TPCH_ANSWERS_SF1[query - 1]; } else { throw NotImplementedException("Don't have TPC-H answers for SF %llf!", sf); } return answer; } } // namespace tpch