Files
email-tracker/external/duckdb/extension/tpcds/dsdgen/dsdgen-c/join.cpp
2025-10-24 19:21:19 -05:00

421 lines
12 KiB
C++

/*
* Legal Notice
*
* This document and associated source code (the "Work") is a part of a
* benchmark specification maintained by the TPC.
*
* The TPC reserves all right, title, and interest to the Work as provided
* under U.S. and international laws, including without limitation all patent
* and trademark rights therein.
*
* No Warranty
*
* 1.1 TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THE INFORMATION
* CONTAINED HEREIN IS PROVIDED "AS IS" AND WITH ALL FAULTS, AND THE
* AUTHORS AND DEVELOPERS OF THE WORK HEREBY DISCLAIM ALL OTHER
* WARRANTIES AND CONDITIONS, EITHER EXPRESS, IMPLIED OR STATUTORY,
* INCLUDING, BUT NOT LIMITED TO, ANY (IF ANY) IMPLIED WARRANTIES,
* DUTIES OR CONDITIONS OF MERCHANTABILITY, OF FITNESS FOR A PARTICULAR
* PURPOSE, OF ACCURACY OR COMPLETENESS OF RESPONSES, OF RESULTS, OF
* WORKMANLIKE EFFORT, OF LACK OF VIRUSES, AND OF LACK OF NEGLIGENCE.
* ALSO, THERE IS NO WARRANTY OR CONDITION OF TITLE, QUIET ENJOYMENT,
* QUIET POSSESSION, CORRESPONDENCE TO DESCRIPTION OR NON-INFRINGEMENT
* WITH REGARD TO THE WORK.
* 1.2 IN NO EVENT WILL ANY AUTHOR OR DEVELOPER OF THE WORK BE LIABLE TO
* ANY OTHER PARTY FOR ANY DAMAGES, INCLUDING BUT NOT LIMITED TO THE
* COST OF PROCURING SUBSTITUTE GOODS OR SERVICES, LOST PROFITS, LOSS
* OF USE, LOSS OF DATA, OR ANY INCIDENTAL, CONSEQUENTIAL, DIRECT,
* INDIRECT, OR SPECIAL DAMAGES WHETHER UNDER CONTRACT, TORT, WARRANTY,
* OR OTHERWISE, ARISING IN ANY WAY OUT OF THIS OR ANY OTHER AGREEMENT
* RELATING TO THE WORK, WHETHER OR NOT SUCH AUTHOR OR DEVELOPER HAD
* ADVANCE NOTICE OF THE POSSIBILITY OF SUCH DAMAGES.
*
* Contributors:
* Gradient Systems
*/
#include "config.h"
#include "porting.h"
#include "init.h"
#include <stdio.h>
#include "date.h"
#include "decimal.h"
#include "dist.h"
#include "constants.h"
#include "columns.h"
#include "genrand.h"
#include "tdefs.h"
#include "tables.h"
#include "build_support.h"
#include "tpcds.idx.h"
#include "scaling.h"
#include "w_web_sales.h"
#include "error_msg.h"
#include "tdefs.h"
#include "scd.h"
#include "r_params.h"
#include "sparse.h"
/* forward declaration: date_join() calls web_join() ahead of its definition further down */
static ds_key_t web_join(int col, ds_key_t join_key);
/*
 * Routine: date_join(int from_tbl, int from_col, ds_key_t join_count, int nYear)
 * Purpose: account for the different date-adjusted patterns in the data set
 * Algorithm:
 *  - sales tables pick a day from the "calendar" distribution, using
 *    calendar_sales + is_leap(nYear) as the fourth pick_distribution argument
 *    (presumably the weight set -- confirm against dist.h)
 *  - returns tables add a random lag to join_count and return immediately
 *  - WEB_SITE/WEB_PAGE are delegated to web_join()
 *  - every other table picks a day from "calendar" with 1 + is_leap(nYear)
 * Data Structures:
 *
 * Params:
 *  from_tbl    table that owns the joining column; selects the pattern
 *  from_col    joining column; forwarded as the last argument of
 *              pick_distribution/genrand_integer and as col to web_join
 *  join_count  returns tables: value the lag is added to (the sale date,
 *              per the comment on the returns cases)
 *              WEB_SITE/WEB_PAGE: join key handed to web_join
 *              all other tables: not read
 *  nYear       year used to build the January 1 base date
 * Returns:
 *  - returns tables: join_count + lag
 *  - WEB_SITE/WEB_PAGE: whatever web_join returns
 *  - otherwise: dttoj(January 1 of nYear) + picked day, or -1 when that
 *    value is later than dttoj(TODAYS_DATE)
 * Called By: mk_join
 * Calls: strtodt, dttoj, pick_distribution, genrand_integer, web_join
 * Assumptions:
 * Side Effects: the first call caches dttoj(TODAYS_DATE) in a function static
 *  and sets InitConstants::date_join_init
 * TODO: Relies on existing RNG code, which isn't really 64bit; will probably
 *  require a rework of the genrand_xx routines
 */
static ds_key_t date_join(int from_tbl, int from_col, ds_key_t join_count, int nYear) {
    static int jToday; /* dttoj() of TODAYS_DATE, filled in on the first call */
    date_t dtScratch;
    int nDayPicked, nLag, nJulian;
    int nMinDelay = -1, nMaxDelay = -1; /* -1 == "not chosen yet" for the returns cascade */

    if (InitConstants::date_join_init == 0) {
        strtodt(&dtScratch, TODAYS_DATE);
        jToday = dttoj(&dtScratch);
        InitConstants::date_join_init = 1;
    }

    switch (from_tbl) {
    case STORE_SALES:
    case CATALOG_SALES:
    case WEB_SALES:
        pick_distribution(&nDayPicked, "calendar", 1, calendar_sales + is_leap(nYear), from_col);
        break;

    /*
     * returns are keyed to the sale date, with the lag between sale and return
     * selected within a known range, based on sales channel. The three cases
     * deliberately cascade: the first one entered sets the delay bounds and
     * the later ones see that the bounds are no longer -1 and leave them alone.
     */
    case STORE_RETURNS:
        nMinDelay = SS_MIN_SHIP_DELAY;
        nMaxDelay = SS_MAX_SHIP_DELAY;
        /* fall through */
    case CATALOG_RETURNS:
        if (nMinDelay == -1) {
            nMinDelay = CS_MIN_SHIP_DELAY;
            nMaxDelay = CS_MAX_SHIP_DELAY;
        }
        /* fall through */
    case WEB_RETURNS:
        if (nMinDelay == -1) {
            nMinDelay = WS_MIN_SHIP_DELAY;
            nMaxDelay = WS_MAX_SHIP_DELAY;
        }
        /* the lag is drawn from twice the shipping-delay bounds */
        genrand_integer(&nLag, DIST_UNIFORM, nMinDelay * 2, nMaxDelay * 2, 0, from_col);
        return join_count + nLag;

    case WEB_SITE:
    case WEB_PAGE:
        return web_join(from_col, join_count);

    default:
        pick_distribution(&nDayPicked, "calendar", 1, 1 + is_leap(nYear), from_col);
        break;
    }

    /* only the calendar-based cases reach this point: offset the pick from January 1 */
    dtScratch.year = nYear;
    dtScratch.month = 1;
    dtScratch.day = 1;
    nJulian = dttoj(&dtScratch) + nDayPicked;

    /* dates beyond TODAYS_DATE are reported as -1 */
    if (nJulian > jToday) {
        return (ds_key_t)-1;
    }
    return (ds_key_t)nJulian;
}
/*
 * Routine: time_join(int to_tbl, int to_col, ds_key_t join_count)
 * Purpose: create joins that are time-skewed
 * Algorithm: pick an hour from the "hours" distribution, where the fourth
 *  pick_distribution argument (presumably the weight set -- confirm against
 *  dist.h) depends on the table, then add a uniformly requested number of
 *  seconds in 0..3599
 * Data Structures:
 *
 * Params:
 *  to_tbl      table used to select the hour pattern; mk_join passes the
 *              table that owns the joining column here
 *  to_col      forwarded as the last argument of pick_distribution and
 *              genrand_integer
 *  join_count  not read by this routine
 * Returns: hour * 3600 + seconds
 * Called By: mk_join
 * Calls: pick_distribution, genrand_integer
 * Assumptions:
 * Side Effects:
 * TODO: Relies on existing RNG code, which isn't really 64bit; will probably
 *  require a rework of the genrand_xx routines
 */
static ds_key_t time_join(int to_tbl, int to_col, ds_key_t join_count) {
    int nHour, nSeconds, nSecondOfDay;
    int nHourPattern;

    if (to_tbl == STORE_SALES || to_tbl == STORE_RETURNS) {
        nHourPattern = 2; /* store channel */
    } else if (to_tbl == CATALOG_SALES || to_tbl == WEB_SALES || to_tbl == CATALOG_RETURNS ||
               to_tbl == WEB_RETURNS) {
        nHourPattern = 3; /* catalog and web channels */
    } else {
        nHourPattern = 1; /* everything else */
    }

    /* hour first, then seconds: the order of the two draws is preserved */
    pick_distribution(&nHour, "hours", 1, nHourPattern, to_col);
    genrand_integer(&nSeconds, DIST_UNIFORM, 0, 3599, 0, to_col);

    nSecondOfDay = nHour * 3600 + nSeconds;
    return (ds_key_t)nSecondOfDay;
}
/*
 * Routine: cp_join(int tbl, int col, ds_key_t jDate)
 * Purpose: create joins to catalog_page
 * Algorithm:
 *  1. pick a catalog type from the "catalog_page_type" distribution and a
 *     page number requested uniformly from 1..pages-per-catalog
 *  2. turn jDate into a day offset from DATA_START_DATE and count
 *     CP_CATALOGS_PER_YEAR catalogs for every complete 365-day block
 *  3. advance the catalog count within the remaining days according to
 *     the catalog type
 *  4. combine catalog count, pages per catalog and page with CP_SK
 * Data Structures:
 *
 * Params:
 *  tbl    not read by this routine
 *  col    forwarded as the last argument of pick_distribution and
 *         genrand_integer
 *  jDate  date of the joining row, compared against the julian value of
 *         DATA_START_DATE
 * Returns: CP_SK(catalog count, pages per catalog, page)
 * Called By: mk_join
 * Calls: get_rowcount, strtodt, pick_distribution, genrand_integer
 * Assumptions:
 * Side Effects: the first call caches the pages-per-catalog figure and the
 *  parsed DATA_START_DATE in function statics and sets
 *  InitConstants::cp_join_init
 * TODO: None
 */
static ds_key_t cp_join(int tbl, int col, ds_key_t jDate) {
    static int nPagesInCatalog;
    static date_t dDataStart;
    char *szTypeName;
    int nCatalogType, nPagePicked, nDayOffset, nCatalog;
    ds_key_t kResult;

    if (InitConstants::cp_join_init == 0) {
        nPagesInCatalog = ((int)get_rowcount(CATALOG_PAGE) / CP_CATALOGS_PER_YEAR) / (YEAR_MAXIMUM - YEAR_MINIMUM + 2);
        strtodt(&dDataStart, DATA_START_DATE);
        InitConstants::cp_join_init = 1;
    }

    /* type first, then page: the order of the two draws is preserved */
    nCatalogType = pick_distribution(&szTypeName, "catalog_page_type", 1, 2, col);
    genrand_integer(&nPagePicked, DIST_UNIFORM, 1, nPagesInCatalog, 0, col);

    /* whole 365-day blocks since the data start each contribute a full set of catalogs */
    nDayOffset = (int)jDate - dDataStart.julian - 1;
    nCatalog = (nDayOffset / 365) * CP_CATALOGS_PER_YEAR;
    nDayOffset %= 365;

    /* position within the current 365-day block; other type values add nothing */
    if (nCatalogType == 1) {
        /* bi-annual */
        if (nDayOffset > 183) {
            nCatalog += 1;
        }
    } else if (nCatalogType == 2) {
        /* quarterly */
        nCatalog += (nDayOffset / 91);
    } else if (nCatalogType == 3) {
        /* monthly */
        nCatalog += (nDayOffset / 31);
    }

    /* CP_SK receives plain identifiers only, exactly as before */
    kResult = CP_SK(nCatalog, nPagesInCatalog, nPagePicked);
    return kResult;
}
/*
 * Routine: getCatalogNumberFromPage(ds_key_t kPageNumber)
 * Purpose: map a catalog page number to the number of the catalog holding it
 * Algorithm: integer-divide the page number by the pages-per-catalog figure,
 *  which is derived from the CATALOG_PAGE row count, CP_CATALOGS_PER_YEAR and
 *  (YEAR_MAXIMUM - YEAR_MINIMUM + 2)
 * Data Structures:
 *
 * Params:
 *  kPageNumber  page number to convert
 * Returns: kPageNumber / pages-per-catalog
 * Called By:
 * Calls: get_rowcount
 * Assumptions:
 * Side Effects: the first call caches the pages-per-catalog figure in a
 *  function static and sets InitConstants::getCatalogNumberFromPage_init
 * TODO: None
 */
ds_key_t getCatalogNumberFromPage(ds_key_t kPageNumber) {
    static int nPagesInCatalog;
    ds_key_t kCatalog;

    if (InitConstants::getCatalogNumberFromPage_init == 0) {
        nPagesInCatalog = ((int)get_rowcount(CATALOG_PAGE) / CP_CATALOGS_PER_YEAR) / (YEAR_MAXIMUM - YEAR_MINIMUM + 2);
        InitConstants::getCatalogNumberFromPage_init = 1;
    }

    kCatalog = kPageNumber / nPagesInCatalog;
    return kCatalog;
}
/*
 * Routine: web_join(int col, ds_key_t join_key)
 * Purpose: create joins to web_site/web_page. These need to be handled
 *  together, since the date of transaction must fit within the lifetime of a
 *  particular page, which must fit within the lifetime of a particular site
 * Data Structures:
 *
 * Params:
 *  col       column being generated; selects which formula below is applied
 *            and is forwarded to genrand_integer where a random draw is made
 *  join_key  is one of two things:
 *    1. the xxx_sk for a particular row in the dimension for which we need
 *       appropriate dates
 *    2. a julian date for which we need to pick a valid xxx_sk value
 *       (the WR_WEB_PAGE_SK/WS_WEB_PAGE_SK cases below do not read it)
 * Returns: the computed date or key; -1 when col matches none of the cases
 * Called By: date_join (for WEB_SITE and WEB_PAGE)
 * Calls: getSimpleTdefsByNumber, strtodt, get_rowcount, genrand_integer
 * Assumptions:
 * Side Effects: the first call fills the function statics and sets
 *  InitConstants::web_join_init; most cases overwrite the static dSiteOpen
 *  with DATE_MINIMUM
 * TODO: None
 */
static ds_key_t web_join(int col, ds_key_t join_key) {
    /* res stays -1 when col is not handled by the switch (there is no default case) */
    ds_key_t res = -1, kSite;
    static int nConcurrentSites, nSiteDuration, nOffset;
    static date_t dSiteOpen, /* open/close dates for current web site */
        dSiteClose;
    int nTemp;
    tdef *pWS = getSimpleTdefsByNumber(WEB_SITE);
    tdef *pWP = getSimpleTdefsByNumber(WEB_PAGE);
    /* one-time setup, guarded by the shared init flag */
    if (!InitConstants::web_join_init) {
        /* nSiteDuration = (WEB_END_DATE - WEB_START_DATE, in julian days) * nConcurrentSites */
        strtodt(&dSiteClose, WEB_END_DATE);
        nSiteDuration = dSiteClose.julian;
        nConcurrentSites = (int)get_rowcount(CONCURRENT_WEB_SITES);
        strtodt(&dSiteOpen, WEB_START_DATE);
        nSiteDuration -= dSiteOpen.julian;
        nSiteDuration *= nConcurrentSites;
        /* NOTE(review): nSiteDuration already contains the (close - open) span,
         * so with integer operands this quotient is 0 whenever
         * nConcurrentSites >= 1, and the division is by zero if nSiteDuration
         * is 0. Left untouched because changing it would alter generated
         * data -- confirm against the reference generator. */
        nOffset = (dSiteClose.julian - dSiteOpen.julian) / (2 * nSiteDuration);
        InitConstants::web_join_init = 1;
    }
    switch (col) {
    /**************
     * join_key is the xxx_sk value for a dimension
     *
     * Each of these cases re-parses DATE_MINIMUM into dSiteOpen and starts
     * from that julian value minus a stagger derived from the key:
     * (key * WEB_DATE_STAGGER) % nSiteDuration / 2
     */
    case WEB_OPEN_DATE:
        strtodt(&dSiteOpen, DATE_MINIMUM);
        res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2);
        if (WEB_IS_REPLACED(join_key)) /* this site is completely replaced */
        {
            if (WEB_IS_REPLACEMENT(join_key)) /* this is the second site */
            {
                /* the open date of the second site needs to align on a revision
                 * boundary */
                res += nOffset * nSiteDuration;
            }
        }
        break;
    case WEB_CLOSE_DATE:
        strtodt(&dSiteOpen, DATE_MINIMUM);
        res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2);
        res += pWS->nParam * nSiteDuration;
        if (WEB_IS_REPLACED(join_key)) /* this site is completely replaced */
        {
            if (!WEB_IS_REPLACEMENT(join_key)) /* this is the first site */
            {
                /* the close date of the first site needs to align on a revision
                 * boundary */
                res -= pWS->nParam * nSiteDuration / 2;
            }
        }
        break;
    /* record start dates stagger on (join_key - 1); record end dates stagger
     * on join_key and step on (join_key + 1), then subtract one day */
    case WEB_REC_START_DATE_ID:
        strtodt(&dSiteOpen, DATE_MINIMUM);
        res = dSiteOpen.julian - (((join_key - 1) * WEB_DATE_STAGGER) % nSiteDuration / 2);
        res += (join_key % pWS->nParam) * nSiteDuration;
        break;
    case WEB_REC_END_DATE_ID:
        strtodt(&dSiteOpen, DATE_MINIMUM);
        res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2);
        /* NOTE(review): the "* 5" factor appears here and in
         * WP_REC_START_DATE_ID but not in their start/end counterparts --
         * confirm this asymmetry is intended */
        res += ((join_key + 1) % pWS->nParam) * nSiteDuration * 5 - 1;
        break;
    case WP_REC_START_DATE_ID:
        strtodt(&dSiteOpen, DATE_MINIMUM);
        res = dSiteOpen.julian - (((join_key - 1) * WEB_DATE_STAGGER) % nSiteDuration / 2);
        res += (join_key % pWP->nParam) * nSiteDuration * 5;
        break;
    case WP_REC_END_DATE_ID:
        strtodt(&dSiteOpen, DATE_MINIMUM);
        res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2);
        res += ((join_key + 1) % pWP->nParam) * nSiteDuration - 1;
        break;
    case WP_CREATION_DATE_SK:
        /* page creation has to happen outside of the page window, to assure a
         * constant number of pages, so it occurs in the gap between site
         * creation and the site's actual activity. For sites that are replaced
         * in the time span of the data set, this will depend on whether they
         * are the first version or the second
         */
        strtodt(&dSiteOpen, DATE_MINIMUM);
        /* derive the owning site from the page key, then stagger on the site;
         * kSite is narrowed to int before the multiply, unlike the cases above */
        kSite = join_key / WEB_PAGES_PER_SITE + 1;
        res = dSiteOpen.julian - (((int)kSite * WEB_DATE_STAGGER) % nSiteDuration / 2);
        if (((int)kSite % pWP->nParam) == 0) /* this is a site that gets replaced */
        {
            /* replace the staggered date with a draw between it and DATE_MINIMUM */
            genrand_integer(&nTemp, DIST_UNIFORM, (int)res, dSiteOpen.julian, 0, col);
            res = nTemp;
        }
        break;
    /*****************
     * join key from here on is a date for which a valid site/page must be
     * found the sk for a web page is a compound value: <site id><page id>
     * and each component is a combination of the unique site or page and
     * the active revision to it
     */
    case WR_WEB_PAGE_SK:
    case WS_WEB_PAGE_SK:
        /* join_key is not consulted here: the result is genrand_integer's
         * return value for the range 1..WEB_PAGES_PER_SITE (NULL destination) */
        res = genrand_integer(NULL, DIST_UNIFORM, 1, WEB_PAGES_PER_SITE, 0, col);
        break;
    }
    return (res);
}
/*
 * Routine: mk_join(int from_col, int to_tbl, ds_key_t join_count)
 * Purpose: return a primary key for to_tbl, creating a join between the
 *  table that owns from_col and to_tbl
 * Algorithm:
 *  - CATALOG_PAGE, DATET and TIME are routed to cp_join, date_join and
 *    time_join respectively
 *  - tables flagged FL_TYPE_2 (history keeping dimensions) use scd_join
 *  - tables flagged FL_SPARSE use randomSparseKey
 *  - every other table gets a key requested uniformly from
 *    1..get_rowcount(to_tbl)
 *  The calling convention allows for each join in the schema to be
 *  distributed differently
 * Data Structures:
 *
 * Params:
 *  from_col    column doing the join; its table is looked up with
 *              getTableFromColumn and it is forwarded to every helper
 *  to_tbl      table being joined to
 *  join_count  forwarded to cp_join, date_join, time_join and scd_join
 * Returns: the key produced by whichever path handled to_tbl
 * Called By:
 * Calls: getTableFromColumn, getSimpleTdefsByNumber, initSparseKeys, cp_join,
 *  genrand_integer, date_join, time_join, scd_join, randomSparseKey,
 *  genrand_key, get_rowcount
 * Assumptions:
 * Side Effects: initializes the sparse keys of to_tbl when it is flagged
 *  FL_SPARSE and has none yet
 * TODO: Relies on existing RNG code, which isn't really 64bit; will probably
 *  require a rework of the genrand_xx routines
 */
ds_key_t mk_join(int from_col, int to_tbl, ds_key_t join_count) {
    int nOwnerTable = getTableFromColumn(from_col);
    tdef *pTarget = getSimpleTdefsByNumber(to_tbl);

    /*
     * if the table being joined to employs sparse keys, the join gets handled
     * in sparse.c; make sure its key set exists before any path is taken
     */
    if ((pTarget->flags & FL_SPARSE) && pTarget->arSparseKeys == NULL) {
        initSparseKeys(to_tbl);
    }

    /* some tables require special handling */
    if (to_tbl == CATALOG_PAGE) {
        return cp_join(nOwnerTable, from_col, join_count);
    }
    if (to_tbl == DATET) {
        int nYearPicked;

        /* the year is drawn here, before date_join picks the day */
        genrand_integer(&nYearPicked, DIST_UNIFORM, YEAR_MINIMUM, YEAR_MAXIMUM, 0, from_col);
        return date_join(nOwnerTable, from_col, join_count, nYearPicked);
    }
    if (to_tbl == TIME) {
        return time_join(nOwnerTable, from_col, join_count);
    }

    /*
     * all TYPE2 tables (i.e., history keeping dimensions) need a special
     * join algorithm; this check takes precedence over the sparse one
     */
    if (pTarget->flags & FL_TYPE_2) {
        return scd_join(to_tbl, from_col, join_count);
    }
    if (pTarget->flags & FL_SPARSE) {
        return randomSparseKey(to_tbl, from_col);
    }

    /* the rest of the tables use standard, uniform joins */
    ds_key_t kPicked;
    genrand_key(&kPicked, DIST_UNIFORM, (ds_key_t)1, get_rowcount(to_tbl), (ds_key_t)0, from_col);
    return kPicked;
}