should be it
This commit is contained in:
420
external/duckdb/extension/tpcds/dsdgen/dsdgen-c/join.cpp
vendored
Normal file
420
external/duckdb/extension/tpcds/dsdgen/dsdgen-c/join.cpp
vendored
Normal file
@@ -0,0 +1,420 @@
|
||||
/*
|
||||
* Legal Notice
|
||||
*
|
||||
* This document and associated source code (the "Work") is a part of a
|
||||
* benchmark specification maintained by the TPC.
|
||||
*
|
||||
* The TPC reserves all right, title, and interest to the Work as provided
|
||||
* under U.S. and international laws, including without limitation all patent
|
||||
* and trademark rights therein.
|
||||
*
|
||||
* No Warranty
|
||||
*
|
||||
* 1.1 TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THE INFORMATION
|
||||
* CONTAINED HEREIN IS PROVIDED "AS IS" AND WITH ALL FAULTS, AND THE
|
||||
* AUTHORS AND DEVELOPERS OF THE WORK HEREBY DISCLAIM ALL OTHER
|
||||
* WARRANTIES AND CONDITIONS, EITHER EXPRESS, IMPLIED OR STATUTORY,
|
||||
* INCLUDING, BUT NOT LIMITED TO, ANY (IF ANY) IMPLIED WARRANTIES,
|
||||
* DUTIES OR CONDITIONS OF MERCHANTABILITY, OF FITNESS FOR A PARTICULAR
|
||||
* PURPOSE, OF ACCURACY OR COMPLETENESS OF RESPONSES, OF RESULTS, OF
|
||||
* WORKMANLIKE EFFORT, OF LACK OF VIRUSES, AND OF LACK OF NEGLIGENCE.
|
||||
* ALSO, THERE IS NO WARRANTY OR CONDITION OF TITLE, QUIET ENJOYMENT,
|
||||
* QUIET POSSESSION, CORRESPONDENCE TO DESCRIPTION OR NON-INFRINGEMENT
|
||||
* WITH REGARD TO THE WORK.
|
||||
* 1.2 IN NO EVENT WILL ANY AUTHOR OR DEVELOPER OF THE WORK BE LIABLE TO
|
||||
* ANY OTHER PARTY FOR ANY DAMAGES, INCLUDING BUT NOT LIMITED TO THE
|
||||
* COST OF PROCURING SUBSTITUTE GOODS OR SERVICES, LOST PROFITS, LOSS
|
||||
* OF USE, LOSS OF DATA, OR ANY INCIDENTAL, CONSEQUENTIAL, DIRECT,
|
||||
* INDIRECT, OR SPECIAL DAMAGES WHETHER UNDER CONTRACT, TORT, WARRANTY,
|
||||
* OR OTHERWISE, ARISING IN ANY WAY OUT OF THIS OR ANY OTHER AGREEMENT
|
||||
* RELATING TO THE WORK, WHETHER OR NOT SUCH AUTHOR OR DEVELOPER HAD
|
||||
* ADVANCE NOTICE OF THE POSSIBILITY OF SUCH DAMAGES.
|
||||
*
|
||||
* Contributors:
|
||||
* Gradient Systems
|
||||
*/
|
||||
#include "config.h"
|
||||
#include "porting.h"
|
||||
#include "init.h"
|
||||
#include <stdio.h>
|
||||
#include "date.h"
|
||||
#include "decimal.h"
|
||||
#include "dist.h"
|
||||
#include "constants.h"
|
||||
#include "columns.h"
|
||||
#include "genrand.h"
|
||||
#include "tdefs.h"
|
||||
#include "tables.h"
|
||||
#include "build_support.h"
|
||||
#include "tpcds.idx.h"
|
||||
#include "scaling.h"
|
||||
#include "w_web_sales.h"
|
||||
#include "error_msg.h"
|
||||
#include "tdefs.h"
|
||||
#include "scd.h"
|
||||
#include "r_params.h"
|
||||
#include "sparse.h"
|
||||
|
||||
static ds_key_t web_join(int col, ds_key_t join_key);
|
||||
|
||||
/*
|
||||
* Routine: date_join(int from_tbl, int join_count)
|
||||
* Purpose: account for the different date-adjusted patterns in the data set
|
||||
* Data Structures:
|
||||
*
|
||||
* Params:
|
||||
* Returns:
|
||||
* Called By:
|
||||
* Calls:
|
||||
* Assumptions:
|
||||
* Side Effects:
|
||||
* TODO: Relies on existing RNG code, which isn't really 64bit; will probably
|
||||
* requre a rework of the genrand_xx routines
|
||||
*/
|
||||
static ds_key_t date_join(int from_tbl, int from_col, ds_key_t join_count, int nYear) {
|
||||
int nDay, nTemp, nMin = -1, nMax = -1, nResult;
|
||||
static int jToday;
|
||||
date_t TempDate;
|
||||
|
||||
if (InitConstants::date_join_init == 0) {
|
||||
strtodt(&TempDate, TODAYS_DATE);
|
||||
jToday = dttoj(&TempDate);
|
||||
InitConstants::date_join_init = 1;
|
||||
}
|
||||
|
||||
switch (from_tbl) {
|
||||
case STORE_SALES:
|
||||
case CATALOG_SALES:
|
||||
case WEB_SALES:
|
||||
pick_distribution(&nDay, "calendar", 1, calendar_sales + is_leap(nYear), from_col);
|
||||
break;
|
||||
|
||||
/*
|
||||
* returns are keyed to the sale date, with the lag between sale and return
|
||||
* selected within a known range, based on sales channel
|
||||
*/
|
||||
case STORE_RETURNS:
|
||||
nMin = SS_MIN_SHIP_DELAY;
|
||||
nMax = SS_MAX_SHIP_DELAY;
|
||||
case CATALOG_RETURNS:
|
||||
if (nMin == -1) {
|
||||
nMin = CS_MIN_SHIP_DELAY;
|
||||
nMax = CS_MAX_SHIP_DELAY;
|
||||
}
|
||||
case WEB_RETURNS:
|
||||
if (nMin == -1) {
|
||||
nMin = WS_MIN_SHIP_DELAY;
|
||||
nMax = WS_MAX_SHIP_DELAY;
|
||||
}
|
||||
genrand_integer(&nTemp, DIST_UNIFORM, nMin * 2, nMax * 2, 0, from_col);
|
||||
return (join_count + nTemp);
|
||||
break;
|
||||
case WEB_SITE:
|
||||
case WEB_PAGE:
|
||||
return (web_join(from_col, join_count));
|
||||
default:
|
||||
pick_distribution(&nDay, "calendar", 1, 1 + is_leap(nYear), from_col);
|
||||
break;
|
||||
}
|
||||
|
||||
TempDate.year = nYear;
|
||||
TempDate.month = 1;
|
||||
TempDate.day = 1;
|
||||
|
||||
nResult = dttoj(&TempDate) + nDay;
|
||||
|
||||
return ((ds_key_t)(nResult > jToday) ? -1 : nResult);
|
||||
}
|
||||
|
||||
/*
|
||||
* Routine: time_join(int from_tbl, int join_count)
|
||||
* Purpose: create joins that are time-skewed
|
||||
* Data Structures:
|
||||
*
|
||||
* Params:
|
||||
* Returns:
|
||||
* Called By:
|
||||
* Calls:
|
||||
* Assumptions:
|
||||
* Side Effects:
|
||||
* TODO: Relies on existing RNG code, which isn't really 64bit; will probably
|
||||
* requre a rework of the genrand_xx routines
|
||||
*/
|
||||
static ds_key_t time_join(int to_tbl, int to_col, ds_key_t join_count) {
|
||||
int hour, secs;
|
||||
|
||||
switch (to_tbl) {
|
||||
case STORE_SALES:
|
||||
case STORE_RETURNS:
|
||||
pick_distribution(&hour, "hours", 1, 2, to_col);
|
||||
break;
|
||||
case CATALOG_SALES:
|
||||
case WEB_SALES:
|
||||
case CATALOG_RETURNS:
|
||||
case WEB_RETURNS:
|
||||
pick_distribution(&hour, "hours", 1, 3, to_col);
|
||||
break;
|
||||
default:
|
||||
pick_distribution(&hour, "hours", 1, 1, to_col);
|
||||
break;
|
||||
}
|
||||
genrand_integer(&secs, DIST_UNIFORM, 0, 3599, 0, to_col);
|
||||
|
||||
return ((ds_key_t)(hour * 3600 + secs));
|
||||
}
|
||||
|
||||
/*
|
||||
* Routine: cp_join(int from_tbl, int join_count)
|
||||
* Purpose: create joins to catalog_page
|
||||
* Data Structures:
|
||||
*
|
||||
* Params:
|
||||
* Returns:
|
||||
* Called By:
|
||||
* Calls:
|
||||
* Assumptions:
|
||||
* Side Effects:
|
||||
* TODO: None
|
||||
*/
|
||||
static ds_key_t cp_join(int tbl, int col, ds_key_t jDate) {
|
||||
ds_key_t res;
|
||||
static int nPagePerCatalog;
|
||||
int nType, nCount, nOffset, nPage;
|
||||
static date_t dTemp;
|
||||
char *szTemp;
|
||||
|
||||
if (!InitConstants::cp_join_init) {
|
||||
nPagePerCatalog = ((int)get_rowcount(CATALOG_PAGE) / CP_CATALOGS_PER_YEAR) / (YEAR_MAXIMUM - YEAR_MINIMUM + 2);
|
||||
strtodt(&dTemp, DATA_START_DATE);
|
||||
InitConstants::cp_join_init = 1;
|
||||
}
|
||||
|
||||
nType = pick_distribution(&szTemp, "catalog_page_type", 1, 2, col);
|
||||
genrand_integer(&nPage, DIST_UNIFORM, 1, nPagePerCatalog, 0, col);
|
||||
nOffset = (int)jDate - dTemp.julian - 1;
|
||||
nCount = (nOffset / 365) * CP_CATALOGS_PER_YEAR;
|
||||
nOffset %= 365;
|
||||
|
||||
switch (nType) {
|
||||
case 1: /* bi-annual */
|
||||
if (nOffset > 183)
|
||||
nCount += 1;
|
||||
break;
|
||||
case 2: /* quarterly */
|
||||
nCount += (nOffset / 91);
|
||||
break;
|
||||
case 3: /* monthly */
|
||||
nCount += (nOffset / 31);
|
||||
break;
|
||||
}
|
||||
|
||||
res = CP_SK(nCount, nPagePerCatalog, nPage);
|
||||
|
||||
return (res);
|
||||
}
|
||||
/*
|
||||
* Routine:
|
||||
* Purpose:
|
||||
* Algorithm:
|
||||
* Data Structures:
|
||||
*
|
||||
* Params:
|
||||
* Returns:
|
||||
* Called By:
|
||||
* Calls:
|
||||
* Assumptions:
|
||||
* Side Effects:
|
||||
* TODO: None
|
||||
*/
|
||||
ds_key_t getCatalogNumberFromPage(ds_key_t kPageNumber) {
|
||||
static int nPagePerCatalog;
|
||||
|
||||
if (!InitConstants::getCatalogNumberFromPage_init) {
|
||||
nPagePerCatalog = ((int)get_rowcount(CATALOG_PAGE) / CP_CATALOGS_PER_YEAR) / (YEAR_MAXIMUM - YEAR_MINIMUM + 2);
|
||||
InitConstants::getCatalogNumberFromPage_init = 1;
|
||||
}
|
||||
|
||||
return (kPageNumber / nPagePerCatalog);
|
||||
}
|
||||
|
||||
/*
|
||||
* Routine: web_join(int col, ds_key_t join_key)
|
||||
* Purpose: create joins to web_site/web_page. These need to be handled
|
||||
*together, since the date of transaction must fit within the lifetime of a
|
||||
*particular page, which must fit within the lifetime of a particular site Data
|
||||
*Structures:
|
||||
*
|
||||
* Params:
|
||||
* join_key is one of two things:
|
||||
* 1. the xxx_sk for a particular row in the dimension for which we need
|
||||
*appropriate dates
|
||||
* 2. a julian date for which we need to pick a valid xxx_sk value
|
||||
* Returns:
|
||||
* Called By:
|
||||
* Calls:
|
||||
* Assumptions:
|
||||
* Side Effects:
|
||||
* TODO: None
|
||||
*/
|
||||
static ds_key_t web_join(int col, ds_key_t join_key) {
|
||||
ds_key_t res = -1, kSite;
|
||||
static int nConcurrentSites, nSiteDuration, nOffset;
|
||||
static date_t dSiteOpen, /* open/close dates for current web site */
|
||||
dSiteClose;
|
||||
int nTemp;
|
||||
tdef *pWS = getSimpleTdefsByNumber(WEB_SITE);
|
||||
tdef *pWP = getSimpleTdefsByNumber(WEB_PAGE);
|
||||
|
||||
if (!InitConstants::web_join_init) {
|
||||
strtodt(&dSiteClose, WEB_END_DATE);
|
||||
nSiteDuration = dSiteClose.julian;
|
||||
nConcurrentSites = (int)get_rowcount(CONCURRENT_WEB_SITES);
|
||||
strtodt(&dSiteOpen, WEB_START_DATE);
|
||||
nSiteDuration -= dSiteOpen.julian;
|
||||
nSiteDuration *= nConcurrentSites;
|
||||
nOffset = (dSiteClose.julian - dSiteOpen.julian) / (2 * nSiteDuration);
|
||||
InitConstants::web_join_init = 1;
|
||||
}
|
||||
|
||||
switch (col) {
|
||||
/**************
|
||||
* join_key is the xxx_sk value for a dimension
|
||||
*/
|
||||
case WEB_OPEN_DATE:
|
||||
strtodt(&dSiteOpen, DATE_MINIMUM);
|
||||
res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2);
|
||||
if (WEB_IS_REPLACED(join_key)) /* this site is completely replaced */
|
||||
{
|
||||
if (WEB_IS_REPLACEMENT(join_key)) /* this is the second site */
|
||||
{
|
||||
/* the open date of the second site needs to align on a revision
|
||||
* boundary */
|
||||
res += nOffset * nSiteDuration;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case WEB_CLOSE_DATE:
|
||||
strtodt(&dSiteOpen, DATE_MINIMUM);
|
||||
res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2);
|
||||
res += pWS->nParam * nSiteDuration;
|
||||
if (WEB_IS_REPLACED(join_key)) /* this site is completely replaced */
|
||||
{
|
||||
if (!WEB_IS_REPLACEMENT(join_key)) /* this is the first site */
|
||||
{
|
||||
/* the close date of the first site needs to align on a revision
|
||||
* boundary */
|
||||
res -= pWS->nParam * nSiteDuration / 2;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case WEB_REC_START_DATE_ID:
|
||||
strtodt(&dSiteOpen, DATE_MINIMUM);
|
||||
res = dSiteOpen.julian - (((join_key - 1) * WEB_DATE_STAGGER) % nSiteDuration / 2);
|
||||
res += (join_key % pWS->nParam) * nSiteDuration;
|
||||
break;
|
||||
case WEB_REC_END_DATE_ID:
|
||||
strtodt(&dSiteOpen, DATE_MINIMUM);
|
||||
res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2);
|
||||
res += ((join_key + 1) % pWS->nParam) * nSiteDuration * 5 - 1;
|
||||
break;
|
||||
case WP_REC_START_DATE_ID:
|
||||
strtodt(&dSiteOpen, DATE_MINIMUM);
|
||||
res = dSiteOpen.julian - (((join_key - 1) * WEB_DATE_STAGGER) % nSiteDuration / 2);
|
||||
res += (join_key % pWP->nParam) * nSiteDuration * 5;
|
||||
break;
|
||||
case WP_REC_END_DATE_ID:
|
||||
strtodt(&dSiteOpen, DATE_MINIMUM);
|
||||
res = dSiteOpen.julian - ((join_key * WEB_DATE_STAGGER) % nSiteDuration / 2);
|
||||
res += ((join_key + 1) % pWP->nParam) * nSiteDuration - 1;
|
||||
break;
|
||||
case WP_CREATION_DATE_SK:
|
||||
/* page creation has to happen outside of the page window, to assure a
|
||||
* constant number of pages, so it occurs in the gap between site
|
||||
* creation and the site's actual activity. For sites that are replaced
|
||||
* in the time span of the data set, this will depend on whether they
|
||||
* are the first version or the second
|
||||
*/
|
||||
strtodt(&dSiteOpen, DATE_MINIMUM);
|
||||
kSite = join_key / WEB_PAGES_PER_SITE + 1;
|
||||
res = dSiteOpen.julian - (((int)kSite * WEB_DATE_STAGGER) % nSiteDuration / 2);
|
||||
if (((int)kSite % pWP->nParam) == 0) /* this is a site that gets replaced */
|
||||
{
|
||||
genrand_integer(&nTemp, DIST_UNIFORM, (int)res, dSiteOpen.julian, 0, col);
|
||||
res = nTemp;
|
||||
}
|
||||
break;
|
||||
/*****************
|
||||
* join key from here on is a date for which a valid site/page must be
|
||||
* found the sk for a web page is a compound value: <site id><page id>
|
||||
* and each component is a combination of the unique site or page and
|
||||
* the active revision to it
|
||||
*/
|
||||
case WR_WEB_PAGE_SK:
|
||||
case WS_WEB_PAGE_SK:
|
||||
res = genrand_integer(NULL, DIST_UNIFORM, 1, WEB_PAGES_PER_SITE, 0, col);
|
||||
break;
|
||||
}
|
||||
|
||||
return (res);
|
||||
}
|
||||
|
||||
/*
|
||||
* Routine: mk_join(int from_tbl, int to_tbl, int join_count)
|
||||
* Purpose: return a primary key for to_tbl, creating a join between from_tbl
|
||||
*and to_tbl Algorithm: all joins are currently uniformly distributed. The
|
||||
*calling convention allows for each join in the schema to be distributed
|
||||
*differently Data Structures:
|
||||
*
|
||||
* Params:
|
||||
* Returns:
|
||||
* Called By:
|
||||
* Calls:
|
||||
* Assumptions:
|
||||
* Side Effects:
|
||||
* TODO: Relies on existing RNG code, which isn't really 64bit; will probably
|
||||
*requre a rework of the genrand_xx routines
|
||||
*/
|
||||
ds_key_t mk_join(int from_col, int to_tbl, ds_key_t join_count) {
|
||||
ds_key_t res;
|
||||
int nYear, nFromTable = 0, nTableIndex = to_tbl;
|
||||
tdef *pTdef;
|
||||
|
||||
nFromTable = getTableFromColumn(from_col);
|
||||
|
||||
/*
|
||||
* if the table being joined to employs sparse keys, the join gets handled
|
||||
* in sparse.c
|
||||
*/
|
||||
pTdef = getSimpleTdefsByNumber(to_tbl);
|
||||
if (pTdef->flags & FL_SPARSE) {
|
||||
if (pTdef->arSparseKeys == NULL)
|
||||
initSparseKeys(to_tbl);
|
||||
}
|
||||
|
||||
switch (to_tbl) {
|
||||
/* some tables require special handling */
|
||||
case CATALOG_PAGE:
|
||||
return (cp_join(nFromTable, from_col, join_count));
|
||||
case DATET:
|
||||
genrand_integer(&nYear, DIST_UNIFORM, YEAR_MINIMUM, YEAR_MAXIMUM, 0, from_col);
|
||||
return (date_join(nFromTable, from_col, join_count, nYear));
|
||||
case TIME:
|
||||
return (time_join(nFromTable, from_col, join_count));
|
||||
/* the rest of the tables use standard, uniform joins */
|
||||
default:
|
||||
/*
|
||||
* all TYPE2 tables (i.e., history keeping dimensions) need a special
|
||||
* join algorithm
|
||||
*/
|
||||
if (pTdef->flags & FL_TYPE_2)
|
||||
return (scd_join(nTableIndex, from_col, join_count));
|
||||
|
||||
if (pTdef->flags & FL_SPARSE)
|
||||
return (randomSparseKey(nTableIndex, from_col));
|
||||
|
||||
genrand_key(&res, DIST_UNIFORM, (ds_key_t)1, get_rowcount(nTableIndex), (ds_key_t)0, from_col);
|
||||
break;
|
||||
}
|
||||
|
||||
return ((ds_key_t)res);
|
||||
}
|
||||
Reference in New Issue
Block a user