should be it
external/duckdb/extension/tpch/dbgen/text.cpp (vendored, new file, 442 lines added)
@@ -0,0 +1,442 @@
/*
 * Copyright owned by the Transaction Processing Performance Council.
 *
 * A copy of the license is included under extension/tpch/dbgen/LICENSE
 * in this repository.
 *
 * You may not use this file except in compliance with the License.
 *
 * THE TPC SOFTWARE IS AVAILABLE WITHOUT CHARGE FROM TPC.
 */
/*
 * text.c --- pseudo text generator for use in DBGEN 2.0
 *
 * Defined Routines:
 *	dbg_text() -- select and translate a sentence form
 */

#ifdef TEXT_TEST
#define DECLARER
#endif /* TEXT_TEST */

#include "dbgen/config.h"

#include <stdlib.h>
#ifndef _WIN32
/* Change for Windows NT */
#include <unistd.h>
#endif /* _WIN32 */
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#ifdef HP
#include <strings.h>
#endif
#if (defined(WIN32) && !defined(_POSIX_))
#include <process.h>
#pragma warning(disable : 4201)
#pragma warning(disable : 4214)
#pragma warning(disable : 4514)
#define WIN32_LEAN_AND_MEAN
#define NOATOM
#define NOGDICAPMASKS
#define NOMETAFILE
#define NOMINMAX
#define NOMSG
#define NOOPENFILE
#define NORASTEROPS
#define NOSCROLL
#define NOSOUND
#define NOSYSMETRICS
#define NOTEXTMETRIC
#define NOWH
#define NOCOMM
#define NOKANJI
#define NOMCX
#include <windows.h>
#pragma warning(default : 4201)
#pragma warning(default : 4214)
#endif

#include "dbgen/dss.h"
#include "dbgen/dsstypes.h"

/*
 * txt_vp() --
 *	generate a verb phrase by
 *	1) selecting a verb phrase form
 *	2) parsing it to select parts of speech
 *	3) selecting appropriate words
 *	4) adding punctuation as required
 *
 *	Returns: length of generated phrase
 *	Called By: txt_sentence()
 *	Calls: pick_str()
 */
static int txt_vp(char *dest, seed_t *seed) {
	char syntax[MAX_GRAMMAR_LEN + 1], *cptr, *parse_target;
	distribution *src;
	int i, res = 0;

	pick_str(&vp, seed, &syntax[0]);
	parse_target = syntax;
	while ((cptr = strtok(parse_target, " ")) != NULL) {
		src = NULL;
		switch (*cptr) {
		case 'D':
			src = &adverbs;
			break;
		case 'V':
			src = &verbs;
			break;
		case 'X':
			src = &auxillaries;
			break;
		} /* end of POS switch statement */
		i = pick_str(src, seed, dest);
		i = (int)strlen(DIST_MEMBER(src, i));
		dest += i;
		res += i;
		if (*(++cptr)) /* miscellaneous filigree, like punctuation */
		{
			dest += 1;
			res += 1;
			*dest = *cptr;
		}
		*dest = ' ';
		dest++;
		res++;
		parse_target = NULL;
	} /* end of while loop */

	return (res);
}

/*
 * txt_np() --
 *	generate a noun phrase by
 *	1) selecting a noun phrase form
 *	2) parsing it to select parts of speech
 *	3) selecting appropriate words
 *	4) adding punctuation as required
 *
 *	Returns: length of generated phrase
 *	Called By: txt_sentence()
 *	Calls: pick_str()
 */
static int txt_np(char *dest, seed_t *seed) {
	char syntax[MAX_GRAMMAR_LEN + 1], *cptr, *parse_target;
	distribution *src;
	int i, res = 0;

	pick_str(&np, seed, &syntax[0]);
	parse_target = syntax;
	while ((cptr = strtok(parse_target, " ")) != NULL) {
		src = NULL;
		switch (*cptr) {
		case 'A':
			src = &articles;
			break;
		case 'J':
			src = &adjectives;
			break;
		case 'D':
			src = &adverbs;
			break;
		case 'N':
			src = &nouns;
			break;
		} /* end of POS switch statement */
		i = pick_str(src, seed, dest);
		i = (int)strlen(DIST_MEMBER(src, i));
		dest += i;
		res += i;
		if (*(++cptr)) /* miscellaneous filigree, like punctuation */
		{
			*dest = *cptr;
			dest += 1;
			res += 1;
		}
		*dest = ' ';
		dest++;
		res++;
		parse_target = NULL;
	} /* end of while loop */

	return (res);
}

/*
 * txt_sentence() --
 *	generate a sentence by
 *	1) selecting a sentence form
 *	2) parsing it to select parts of speech or phrase types
 *	3) selecting appropriate words
 *	4) adding punctuation as required
 *
 *	Returns: length of generated sentence
 *	Called By: dbg_text()
 *	Calls: pick_str(), txt_np(), txt_vp()
 */
static int txt_sentence(char *dest, seed_t *seed) {
	char syntax[MAX_GRAMMAR_LEN + 1], *cptr;
	int i, res = 0, len = 0;

	pick_str(&grammar, seed, syntax);
	cptr = syntax;

next_token: /* I hate goto's, but can't seem to have parent and child use strtok() */
	while (*cptr && *cptr == ' ')
		cptr++;
	if (*cptr == '\0')
		goto done;
	switch (*cptr) {
	case 'V':
		len = txt_vp(dest, seed);
		break;
	case 'N':
		len = txt_np(dest, seed);
		break;
	case 'P':
		i = pick_str(&prepositions, seed, dest);
		len = (int)strlen(DIST_MEMBER(&prepositions, i));
		strcpy((dest + len), " the ");
		len += 5;
		len += txt_np(dest + len, seed);
		break;
	case 'T':
		i = pick_str(&terminators, seed, --dest); /* terminators should abut the previous word */
		len = (int)strlen(DIST_MEMBER(&terminators, i));
		break;
	} /* end of POS switch statement */
	dest += len;
	res += len;
	cptr++;
	if (*cptr && *cptr != ' ') /* miscellaneous filigree, like punctuation */
	{
		dest += 1;
		res += 1;
		*dest = *cptr;
	}
	goto next_token;
done:
	*dest = '\0';
	return (--res);
}
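
/*
 * gen_text() --
 *	copy a weighted random member of distribution *s into dest, followed by
 *	a single space; returns a pointer just past that space.
 */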
static char *gen_text(char *dest, seed_t *seed, distribution *s) {
	long i = 0;
	DSS_HUGE j;

	RANDOM(j, 1, s->list[s->count - 1].weight, seed);
	while (s->list[i].weight < j)
		i++;
	char *src = s->list[i].text;
	int ind = 0;
	while (src[ind]) {
		dest[ind] = src[ind];
		ind++;
	}
	dest[ind] = ' ';
	return dest + ind + 1;
}

#define NOUN_MAX_WEIGHT 340
#define ADJECTIVES_MAX_WEIGHT 289
#define ADVERBS_MAX_WEIGHT 262
#define AUXILLARIES_MAX_WEIGHT 18
#define VERBS_MAX_WEIGHT 174
#define PREPOSITIONS_MAX_WEIGHT 456

static char *noun_index[NOUN_MAX_WEIGHT + 1];
static char *adjectives_index[ADJECTIVES_MAX_WEIGHT + 1];
static char *adverbs_index[ADVERBS_MAX_WEIGHT + 1];
static char *auxillaries_index[AUXILLARIES_MAX_WEIGHT + 1];
static char *verbs_index[VERBS_MAX_WEIGHT + 1];
static char *prepositions_index[PREPOSITIONS_MAX_WEIGHT + 1];

static char *szTextPool = NULL;
static long txtBufferSize = 0;

// generate a lookup table for weight -> str
static void gen_index(char **index, distribution *s) {
	for (size_t w = 0; w <= s->list[s->count - 1].weight; w++) {
		long i = 0;
		while (s->list[i].weight < w)
			i++;
		index[w] = s->list[i].text;
	}
}
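
/*
 * gen_text_index() --
 *	same as gen_text(), but uses the precomputed weight -> string table
 *	built by gen_index() instead of scanning the distribution linearly.
 */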
static char *gen_text_index(char *dest, seed_t *seed, char **index, distribution *s) {
	long i = 0;
	DSS_HUGE j;

	RANDOM(j, 1, s->list[s->count - 1].weight, seed);
	char *src = index[j];
	int ind = 0;
	while (src[ind]) {
		dest[ind] = src[ind];
		ind++;
	}
	dest[ind] = ' ';
	return dest + ind + 1;
}
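
/*
 * gen_vp() --
 *	emit one verb phrase: a weighted choice among the four forms of the vp
 *	distribution (verb; auxiliary verb; verb adverb; auxiliary verb adverb).
 */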
static char *gen_vp(char *dest, seed_t *seed) {
	DSS_HUGE j;
	RANDOM(j, 1, vp.list[vp.count - 1].weight, seed);
	int index = 0;
	index += vp.list[0].weight < j;
	index += vp.list[1].weight < j;
	index += vp.list[2].weight < j;

	if (index == 0) {
		dest = gen_text_index(dest, seed, verbs_index, &verbs);
	} else if (index == 1) {
		dest = gen_text_index(dest, seed, auxillaries_index, &auxillaries);
		dest = gen_text_index(dest, seed, verbs_index, &verbs);
	} else if (index == 2) {
		dest = gen_text_index(dest, seed, verbs_index, &verbs);
		dest = gen_text_index(dest, seed, adverbs_index, &adverbs);
	} else {
		dest = gen_text_index(dest, seed, auxillaries_index, &auxillaries);
		dest = gen_text_index(dest, seed, verbs_index, &verbs);
		dest = gen_text_index(dest, seed, adverbs_index, &adverbs);
	}
	return dest;
}
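
/*
 * gen_np() --
 *	emit one noun phrase: a weighted choice among the four forms of the np
 *	distribution (noun; adjective noun; adjective, adjective noun;
 *	adverb adjective noun).
 */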
static char *gen_np(char *dest, seed_t *seed) {
	DSS_HUGE j;
	RANDOM(j, 1, np.list[np.count - 1].weight, seed);
	int index = 0;
	index += np.list[0].weight < j;
	index += np.list[1].weight < j;
	index += np.list[2].weight < j;

	if (index == 0) {
		dest = gen_text_index(dest, seed, noun_index, &nouns);
	} else if (index == 1) {
		dest = gen_text_index(dest, seed, adjectives_index, &adjectives);
		dest = gen_text_index(dest, seed, noun_index, &nouns);
	} else if (index == 2) {
		dest = gen_text_index(dest, seed, adjectives_index, &adjectives);
		dest[-1] = ',';
		*(dest++) = ' ';
		dest = gen_text_index(dest, seed, adjectives_index, &adjectives);
		dest = gen_text_index(dest, seed, noun_index, &nouns);
	} else {
		dest = gen_text_index(dest, seed, adverbs_index, &adverbs);
		dest = gen_text_index(dest, seed, adjectives_index, &adjectives);
		dest = gen_text_index(dest, seed, noun_index, &nouns);
	}
	return dest;
}
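
/*
 * gen_preposition() --
 *	emit a prepositional phrase: a preposition, the literal "the ", then a
 *	noun phrase.
 */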
static char *gen_preposition(char *dest, seed_t *seed) {
	dest = gen_text_index(dest, seed, prepositions_index, &prepositions);
	*(dest++) = 't';
	*(dest++) = 'h';
	*(dest++) = 'e';
	*(dest++) = ' ';
	return gen_np(dest, seed);
}
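
/*
 * gen_terminator() --
 *	back up over the trailing space left by the previous word, write a
 *	sentence terminator there, and return a pointer just past it.
 */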
static char *gen_terminator(char *dest, seed_t *seed) {
	dest = gen_text(--dest, seed, &terminators);
	return dest - 1;
}
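
/*
 * gen_sentence() --
 *	emit one sentence: a weighted choice among the five grammar forms, each
 *	built from gen_np()/gen_vp()/gen_preposition() and closed by
 *	gen_terminator(); appends a single space and returns the next write
 *	position.
 */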
static char *gen_sentence(char *dest, seed_t *seed) {
	const char *cptr;
	int i;

	DSS_HUGE j;
	RANDOM(j, 1, grammar.list[grammar.count - 1].weight, seed);
	int index = 0;
	index += grammar.list[0].weight < j;
	index += grammar.list[1].weight < j;
	index += grammar.list[2].weight < j;
	index += grammar.list[3].weight < j;
	cptr = grammar.list[index].text;

	if (index == 0) {
		dest = gen_np(dest, seed);
		dest = gen_vp(dest, seed);
		dest = gen_terminator(dest, seed);
	} else if (index == 1) {
		dest = gen_np(dest, seed);
		dest = gen_vp(dest, seed);
		dest = gen_preposition(dest, seed);
		dest = gen_terminator(dest, seed);
	} else if (index == 2) {
		dest = gen_np(dest, seed);
		dest = gen_vp(dest, seed);
		dest = gen_np(dest, seed);
		dest = gen_terminator(dest, seed);
	} else if (index == 3) {
		dest = gen_np(dest, seed);
		dest = gen_preposition(dest, seed);
		dest = gen_vp(dest, seed);
		dest = gen_np(dest, seed);
		dest = gen_terminator(dest, seed);
	} else {
		dest = gen_np(dest, seed);
		dest = gen_preposition(dest, seed);
		dest = gen_vp(dest, seed);
		dest = gen_preposition(dest, seed);
		dest = gen_terminator(dest, seed);
	}
	*dest = ' ';
	return dest + 1;
}

/*
 * init_text_pool() --
 *	allocate and initialize the internal text pool buffer (szTextPool).
 *	Make sure to call it before using dbg_text().
 */
void init_text_pool(long bSize, DBGenContext *ctx) {
	gen_index(noun_index, &nouns);
	gen_index(adjectives_index, &adjectives);
	gen_index(adverbs_index, &adverbs);
	gen_index(auxillaries_index, &auxillaries);
	gen_index(verbs_index, &verbs);
	gen_index(prepositions_index, &prepositions);

	txtBufferSize = bSize;
	szTextPool = (char*)malloc(bSize + 1 + 100);

	char *ptr = szTextPool;
	char *endptr = szTextPool + bSize + 1;
	while (ptr < endptr) {
		ptr = gen_sentence(ptr, &ctx->Seed[5]);
	}
	szTextPool[bSize] = '\0';
}

void free_text_pool() {
	free(szTextPool);
}

/*
 * dbg_text() --
 *	produce ELIZA-like text of random, bounded length, truncating the last
 *	generated sentence as required
 */
void dbg_text(char *tgt, int min, int max, seed_t *seed) {
	DSS_HUGE hgLength = 0, hgOffset, wordlen = 0, s_len, needed;
	char sentence[MAX_SENT_LEN + 1], *cp;

	RANDOM(hgOffset, 0, txtBufferSize - max, seed);
	RANDOM(hgLength, min, max, seed);
	strncpy(&tgt[0], &szTextPool[hgOffset], (int)hgLength);
	tgt[hgLength] = '\0';

	return;
}
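
/*
 * A minimal usage sketch, assuming the caller has set up a DBGenContext whose
 * distributions and Seed array are already loaded; the pool size, field length,
 * and seed index below are illustrative assumptions, not values taken from
 * this file:
 *
 *	char comment[128 + 1];
 *	DBGenContext ctx;                          // assumed initialized elsewhere
 *	init_text_pool(300 * 1024 * 1024, &ctx);   // pool size is an assumption
 *	dbg_text(comment, 10, 128, &ctx.Seed[5]);  // one 10..128 character text field
 *	free_text_pool();
 */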