should be it
external/duckdb/extension/tpch/dbgen/text.cpp (vendored, new file, 442 lines added)
@@ -0,0 +1,442 @@
/*
 * Copyright owned by the Transaction Processing Performance Council.
 *
 * A copy of the license is included under extension/tpch/dbgen/LICENSE
 * in this repository.
 *
 * You may not use this file except in compliance with the License.
 *
 * THE TPC SOFTWARE IS AVAILABLE WITHOUT CHARGE FROM TPC.
 */
/*
 * text.c --- pseudo text generator for use in DBGEN 2.0
 *
 * Defined Routines:
 *	dbg_text() -- select and translate a sentence form
 */

#ifdef TEXT_TEST
#define DECLARER
#endif /* TEXT_TEST */

#include "dbgen/config.h"

#include <stdlib.h>
#ifndef _WIN32
/* Change for Windows NT */
#include <unistd.h>
#endif /* _WIN32 */
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#ifdef HP
#include <strings.h>
#endif
#if (defined(WIN32) && !defined(_POSIX_))
#include <process.h>
#pragma warning(disable : 4201)
#pragma warning(disable : 4214)
#pragma warning(disable : 4514)
#define WIN32_LEAN_AND_MEAN
#define NOATOM
#define NOGDICAPMASKS
#define NOMETAFILE
#define NOMINMAX
#define NOMSG
#define NOOPENFILE
#define NORASTEROPS
#define NOSCROLL
#define NOSOUND
#define NOSYSMETRICS
#define NOTEXTMETRIC
#define NOWH
#define NOCOMM
#define NOKANJI
#define NOMCX
#include <windows.h>
#pragma warning(default : 4201)
#pragma warning(default : 4214)
#endif

#include "dbgen/dss.h"
#include "dbgen/dsstypes.h"

/*
 * txt_vp() --
 *	generate a verb phrase by
 *	1) selecting a verb phrase form
 *	2) parsing it to select parts of speech
 *	3) selecting appropriate words
 *	4) adding punctuation as required
 *
 *	Returns: length of generated phrase
 *	Called By: txt_sentence()
 *	Calls: pick_str()
 */
static int txt_vp(char *dest, seed_t *seed) {
	char syntax[MAX_GRAMMAR_LEN + 1], *cptr, *parse_target;
	distribution *src;
	int i, res = 0;

	pick_str(&vp, seed, &syntax[0]);
	parse_target = syntax;
	while ((cptr = strtok(parse_target, " ")) != NULL) {
		src = NULL;
		switch (*cptr) {
		case 'D':
			src = &adverbs;
			break;
		case 'V':
			src = &verbs;
			break;
		case 'X':
			src = &auxillaries;
			break;
		} /* end of POS switch statement */
		i = pick_str(src, seed, dest);
		i = (int)strlen(DIST_MEMBER(src, i));
		dest += i;
		res += i;
		if (*(++cptr)) /* miscellaneous filigree, like punctuation */
		{
			dest += 1;
			res += 1;
			*dest = *cptr;
		}
		*dest = ' ';
		dest++;
		res++;
		parse_target = NULL;
	} /* end of while loop */

	return (res);
}

/*
 * txt_np() --
 *	generate a noun phrase by
 *	1) selecting a noun phrase form
 *	2) parsing it to select parts of speech
 *	3) selecting appropriate words
 *	4) adding punctuation as required
 *
 *	Returns: length of generated phrase
 *	Called By: txt_sentence()
 *	Calls: pick_str()
 */
static int txt_np(char *dest, seed_t *seed) {
	char syntax[MAX_GRAMMAR_LEN + 1], *cptr, *parse_target;
	distribution *src;
	int i, res = 0;

	pick_str(&np, seed, &syntax[0]);
	parse_target = syntax;
	while ((cptr = strtok(parse_target, " ")) != NULL) {
		src = NULL;
		switch (*cptr) {
		case 'A':
			src = &articles;
			break;
		case 'J':
			src = &adjectives;
			break;
		case 'D':
			src = &adverbs;
			break;
		case 'N':
			src = &nouns;
			break;
		} /* end of POS switch statement */
		i = pick_str(src, seed, dest);
		i = (int)strlen(DIST_MEMBER(src, i));
		dest += i;
		res += i;
		if (*(++cptr)) /* miscellaneous filigree, like punctuation */
		{
			*dest = *cptr;
			dest += 1;
			res += 1;
		}
		*dest = ' ';
		dest++;
		res++;
		parse_target = NULL;
	} /* end of while loop */

	return (res);
}

/*
 * txt_sentence() --
 *	generate a sentence by
 *	1) selecting a sentence form
 *	2) parsing it to select parts of speech or phrase types
 *	3) selecting appropriate words
 *	4) adding punctuation as required
 *
 *	Returns: length of generated sentence
 *	Called By: dbg_text()
 *	Calls: pick_str(), txt_np(), txt_vp()
 */
static int txt_sentence(char *dest, seed_t *seed) {
	char syntax[MAX_GRAMMAR_LEN + 1], *cptr;
	int i, res = 0, len = 0;

	pick_str(&grammar, seed, syntax);
	cptr = syntax;

next_token: /* I hate goto's, but can't seem to have parent and child use strtok() */
	while (*cptr && *cptr == ' ')
		cptr++;
	if (*cptr == '\0')
		goto done;
	switch (*cptr) {
	case 'V':
		len = txt_vp(dest, seed);
		break;
	case 'N':
		len = txt_np(dest, seed);
		break;
	case 'P':
		i = pick_str(&prepositions, seed, dest);
		len = (int)strlen(DIST_MEMBER(&prepositions, i));
		strcpy((dest + len), " the ");
		len += 5;
		len += txt_np(dest + len, seed);
		break;
	case 'T':
		i = pick_str(&terminators, seed, --dest); /* terminators should abut the previous word */
		len = (int)strlen(DIST_MEMBER(&terminators, i));
		break;
	} /* end of POS switch statement */
	dest += len;
	res += len;
	cptr++;
	if (*cptr && *cptr != ' ') /* miscellaneous filigree, like punctuation */
	{
		dest += 1;
		res += 1;
		*dest = *cptr;
	}
	goto next_token;
done:
	*dest = '\0';
	return (--res);
}
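
/*
 * gen_text() --
 *	copy a weighted random member of distribution *s into dest, followed by
 *	a single space; returns a pointer just past that space.
 */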
static char *gen_text(char *dest, seed_t *seed, distribution *s) {
	long i = 0;
	DSS_HUGE j;

	RANDOM(j, 1, s->list[s->count - 1].weight, seed);
	while (s->list[i].weight < j)
		i++;
	char *src = s->list[i].text;
	int ind = 0;
	while (src[ind]) {
		dest[ind] = src[ind];
		ind++;
	}
	dest[ind] = ' ';
	return dest + ind + 1;
}

#define NOUN_MAX_WEIGHT 340
#define ADJECTIVES_MAX_WEIGHT 289
#define ADVERBS_MAX_WEIGHT 262
#define AUXILLARIES_MAX_WEIGHT 18
#define VERBS_MAX_WEIGHT 174
#define PREPOSITIONS_MAX_WEIGHT 456

static char *noun_index[NOUN_MAX_WEIGHT + 1];
static char *adjectives_index[ADJECTIVES_MAX_WEIGHT + 1];
static char *adverbs_index[ADVERBS_MAX_WEIGHT + 1];
static char *auxillaries_index[AUXILLARIES_MAX_WEIGHT + 1];
static char *verbs_index[VERBS_MAX_WEIGHT + 1];
static char *prepositions_index[PREPOSITIONS_MAX_WEIGHT + 1];

static char *szTextPool = NULL;
static long txtBufferSize = 0;

// generate a lookup table for weight -> str
static void gen_index(char **index, distribution *s) {
	for (size_t w = 0; w <= s->list[s->count - 1].weight; w++) {
		long i = 0;
		while (s->list[i].weight < w)
			i++;
		index[w] = s->list[i].text;
	}
}
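
/*
 * gen_text_index() --
 *	same as gen_text(), but uses the precomputed weight -> string table
 *	built by gen_index() instead of scanning the distribution linearly.
 */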
static char *gen_text_index(char *dest, seed_t *seed, char **index, distribution *s) {
	long i = 0;
	DSS_HUGE j;

	RANDOM(j, 1, s->list[s->count - 1].weight, seed);
	char *src = index[j];
	int ind = 0;
	while (src[ind]) {
		dest[ind] = src[ind];
		ind++;
	}
	dest[ind] = ' ';
	return dest + ind + 1;
}
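
/*
 * gen_vp() --
 *	emit one verb phrase: a weighted choice among the four forms of the vp
 *	distribution (verb; auxiliary verb; verb adverb; auxiliary verb adverb).
 */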
static char *gen_vp(char *dest, seed_t *seed) {
	DSS_HUGE j;
	RANDOM(j, 1, vp.list[vp.count - 1].weight, seed);
	int index = 0;
	index += vp.list[0].weight < j;
	index += vp.list[1].weight < j;
	index += vp.list[2].weight < j;

	if (index == 0) {
		dest = gen_text_index(dest, seed, verbs_index, &verbs);
	} else if (index == 1) {
		dest = gen_text_index(dest, seed, auxillaries_index, &auxillaries);
		dest = gen_text_index(dest, seed, verbs_index, &verbs);
	} else if (index == 2) {
		dest = gen_text_index(dest, seed, verbs_index, &verbs);
		dest = gen_text_index(dest, seed, adverbs_index, &adverbs);
	} else {
		dest = gen_text_index(dest, seed, auxillaries_index, &auxillaries);
		dest = gen_text_index(dest, seed, verbs_index, &verbs);
		dest = gen_text_index(dest, seed, adverbs_index, &adverbs);
	}
	return dest;
}
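
/*
 * gen_np() --
 *	emit one noun phrase: a weighted choice among the four forms of the np
 *	distribution (noun; adjective noun; adjective, adjective noun;
 *	adverb adjective noun).
 */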
static char *gen_np(char *dest, seed_t *seed) {
	DSS_HUGE j;
	RANDOM(j, 1, np.list[np.count - 1].weight, seed);
	int index = 0;
	index += np.list[0].weight < j;
	index += np.list[1].weight < j;
	index += np.list[2].weight < j;

	if (index == 0) {
		dest = gen_text_index(dest, seed, noun_index, &nouns);
	} else if (index == 1) {
		dest = gen_text_index(dest, seed, adjectives_index, &adjectives);
		dest = gen_text_index(dest, seed, noun_index, &nouns);
	} else if (index == 2) {
		dest = gen_text_index(dest, seed, adjectives_index, &adjectives);
		dest[-1] = ',';
		*(dest++) = ' ';
		dest = gen_text_index(dest, seed, adjectives_index, &adjectives);
		dest = gen_text_index(dest, seed, noun_index, &nouns);
	} else {
		dest = gen_text_index(dest, seed, adverbs_index, &adverbs);
		dest = gen_text_index(dest, seed, adjectives_index, &adjectives);
		dest = gen_text_index(dest, seed, noun_index, &nouns);
	}
	return dest;
}
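
/*
 * gen_preposition() --
 *	emit a prepositional phrase: a preposition, the literal "the ", then a
 *	noun phrase.
 */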
static char *gen_preposition(char *dest, seed_t *seed) {
	dest = gen_text_index(dest, seed, prepositions_index, &prepositions);
	*(dest++) = 't';
	*(dest++) = 'h';
	*(dest++) = 'e';
	*(dest++) = ' ';
	return gen_np(dest, seed);
}
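
/*
 * gen_terminator() --
 *	back up over the trailing space left by the previous word, write a
 *	sentence terminator there, and return a pointer just past it.
 */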
static char *gen_terminator(char *dest, seed_t *seed) {
	dest = gen_text(--dest, seed, &terminators);
	return dest - 1;
}
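
/*
 * gen_sentence() --
 *	emit one sentence: a weighted choice among the five grammar forms, each
 *	built from gen_np()/gen_vp()/gen_preposition() and closed by
 *	gen_terminator(); appends a single space and returns the next write
 *	position.
 */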
static char *gen_sentence(char *dest, seed_t *seed) {
	const char *cptr;
	int i;

	DSS_HUGE j;
	RANDOM(j, 1, grammar.list[grammar.count - 1].weight, seed);
	int index = 0;
	index += grammar.list[0].weight < j;
	index += grammar.list[1].weight < j;
	index += grammar.list[2].weight < j;
	index += grammar.list[3].weight < j;
	cptr = grammar.list[index].text;

	if (index == 0) {
		dest = gen_np(dest, seed);
		dest = gen_vp(dest, seed);
		dest = gen_terminator(dest, seed);
	} else if (index == 1) {
		dest = gen_np(dest, seed);
		dest = gen_vp(dest, seed);
		dest = gen_preposition(dest, seed);
		dest = gen_terminator(dest, seed);
	} else if (index == 2) {
		dest = gen_np(dest, seed);
		dest = gen_vp(dest, seed);
		dest = gen_np(dest, seed);
		dest = gen_terminator(dest, seed);
	} else if (index == 3) {
		dest = gen_np(dest, seed);
		dest = gen_preposition(dest, seed);
		dest = gen_vp(dest, seed);
		dest = gen_np(dest, seed);
		dest = gen_terminator(dest, seed);
	} else {
		dest = gen_np(dest, seed);
		dest = gen_preposition(dest, seed);
		dest = gen_vp(dest, seed);
		dest = gen_preposition(dest, seed);
		dest = gen_terminator(dest, seed);
	}
	*dest = ' ';
	return dest + 1;
}

/*
 * init_text_pool() --
 *	allocate and initialize the internal text pool buffer (szTextPool).
 *	Make sure to call it before using dbg_text().
 */
void init_text_pool(long bSize, DBGenContext *ctx) {
	gen_index(noun_index, &nouns);
	gen_index(adjectives_index, &adjectives);
	gen_index(adverbs_index, &adverbs);
	gen_index(auxillaries_index, &auxillaries);
	gen_index(verbs_index, &verbs);
	gen_index(prepositions_index, &prepositions);

	txtBufferSize = bSize;
	szTextPool = (char*)malloc(bSize + 1 + 100);

	char *ptr = szTextPool;
	char *endptr = szTextPool + bSize + 1;
	while (ptr < endptr) {
		ptr = gen_sentence(ptr, &ctx->Seed[5]);
	}
	szTextPool[bSize] = '\0';
}

void free_text_pool() {
	free(szTextPool);
}

/*
 * dbg_text() --
 *	produce ELIZA-like text of random, bounded length, truncating the last
 *	generated sentence as required
 */
void dbg_text(char *tgt, int min, int max, seed_t *seed) {
	DSS_HUGE hgLength = 0, hgOffset, wordlen = 0, s_len, needed;
	char sentence[MAX_SENT_LEN + 1], *cp;

	RANDOM(hgOffset, 0, txtBufferSize - max, seed);
	RANDOM(hgLength, min, max, seed);
	strncpy(&tgt[0], &szTextPool[hgOffset], (int)hgLength);
	tgt[hgLength] = '\0';

	return;
}
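
/*
 * A minimal usage sketch, assuming the caller has set up a DBGenContext whose
 * distributions and Seed array are already loaded; the pool size, field length,
 * and seed index below are illustrative assumptions, not values taken from
 * this file:
 *
 *	char comment[128 + 1];
 *	DBGenContext ctx;                          // assumed initialized elsewhere
 *	init_text_pool(300 * 1024 * 1024, &ctx);   // pool size is an assumption
 *	dbg_text(comment, 10, 128, &ctx.Seed[5]);  // one 10..128 character text field
 *	free_text_pool();
 */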