/* * Copyright owned by the Transaction Processing Performance Council. * * A copy of the license is included under extension/tpch/dbgen/LICENSE * in this repository. * * You may not use this file except in compliance with the License. * * THE TPC SOFTWARE IS AVAILABLE WITHOUT CHARGE FROM TPC. */ /* * text.c --- pseaudo text generator for use in DBGEN 2.0 * * Defined Routines: * dbg_text() -- select and translate a sentance form */ #ifdef TEXT_TEST #define DECLARER #endif /* TEST */ #include "dbgen/config.h" #include #ifndef _WIN32 /* Change for Windows NT */ #include #endif /* _WIN32 */ #include #include #include #include #include #include /* */ #include #ifdef HP #include #endif #if (defined(WIN32) && !defined(_POSIX_)) #include #pragma warning(disable : 4201) #pragma warning(disable : 4214) #pragma warning(disable : 4514) #define WIN32_LEAN_AND_MEAN #define NOATOM #define NOGDICAPMASKS #define NOMETAFILE #define NOMINMAX #define NOMSG #define NOOPENFILE #define NORASTEROPS #define NOSCROLL #define NOSOUND #define NOSYSMETRICS #define NOTEXTMETRIC #define NOWH #define NOCOMM #define NOKANJI #define NOMCX #include #pragma warning(default : 4201) #pragma warning(default : 4214) #endif #include "dbgen/dss.h" #include "dbgen/dsstypes.h" /* * txt_vp() -- * generate a verb phrase by * 1) selecting a verb phrase form * 2) parsing it to select parts of speech * 3) selecting appropriate words * 4) adding punctuation as required * * Returns: length of generated phrase * Called By: txt_sentence() * Calls: pick_str() */ static int txt_vp(char *dest, seed_t *seed) { char syntax[MAX_GRAMMAR_LEN + 1], *cptr, *parse_target; distribution *src; int i, res = 0; pick_str(&vp, seed, &syntax[0]); parse_target = syntax; while ((cptr = strtok(parse_target, " ")) != NULL) { src = NULL; switch (*cptr) { case 'D': src = &adverbs; break; case 'V': src = &verbs; break; case 'X': src = &auxillaries; break; } /* end of POS switch statement */ i = pick_str(src, seed, dest); i = (int)strlen(DIST_MEMBER(src, i)); dest += i; res += i; if (*(++cptr)) /* miscelaneous fillagree, like punctuation */ { dest += 1; res += 1; *dest = *cptr; } *dest = ' '; dest++; res++; parse_target = NULL; } /* end of while loop */ return (res); } /* * txt_np() -- * generate a noun phrase by * 1) selecting a noun phrase form * 2) parsing it to select parts of speech * 3) selecting appropriate words * 4) adding punctuation as required * * Returns: length of generated phrase * Called By: txt_sentence() * Calls: pick_str(), */ static int txt_np(char *dest, seed_t *seed) { char syntax[MAX_GRAMMAR_LEN + 1], *cptr, *parse_target; distribution *src; int i, res = 0; pick_str(&np, seed, &syntax[0]); parse_target = syntax; while ((cptr = strtok(parse_target, " ")) != NULL) { src = NULL; switch (*cptr) { case 'A': src = &articles; break; case 'J': src = &adjectives; break; case 'D': src = &adverbs; break; case 'N': src = &nouns; break; } /* end of POS switch statement */ i = pick_str(src, seed, dest); i = (int)strlen(DIST_MEMBER(src, i)); dest += i; res += i; if (*(++cptr)) /* miscelaneous fillagree, like punctuation */ { *dest = *cptr; dest += 1; res += 1; } *dest = ' '; dest++; res++; parse_target = NULL; } /* end of while loop */ return (res); } /* * txt_sentence() -- * generate a sentence by * 1) selecting a sentence form * 2) parsing it to select parts of speech or phrase types * 3) selecting appropriate words * 4) adding punctuation as required * * Returns: length of generated sentence * Called By: dbg_text() * Calls: pick_str(), txt_np(), txt_vp() */ static int txt_sentence(char *dest, seed_t *seed) { char syntax[MAX_GRAMMAR_LEN + 1], *cptr; int i, res = 0, len = 0; pick_str(&grammar, seed, syntax); cptr = syntax; next_token: /* I hate goto's, but can't seem to have parent and child use strtok() */ while (*cptr && *cptr == ' ') cptr++; if (*cptr == '\0') goto done; switch (*cptr) { case 'V': len = txt_vp(dest, seed); break; case 'N': len = txt_np(dest, seed); break; case 'P': i = pick_str(&prepositions, seed, dest); len = (int)strlen(DIST_MEMBER(&prepositions, i)); strcpy((dest + len), " the "); len += 5; len += txt_np(dest + len, seed); break; case 'T': i = pick_str(&terminators, seed, --dest); /*terminators should abut previous word */ len = (int)strlen(DIST_MEMBER(&terminators, i)); break; } /* end of POS switch statement */ dest += len; res += len; cptr++; if (*cptr && *cptr != ' ') /* miscelaneous fillagree, like punctuation */ { dest += 1; res += 1; *dest = *cptr; } goto next_token; done: *dest = '\0'; return (--res); } static char *gen_text(char *dest, seed_t *seed, distribution *s) { long i = 0; DSS_HUGE j; RANDOM(j, 1, s->list[s->count - 1].weight, seed); while (s->list[i].weight < j) i++; char *src = s->list[i].text; int ind = 0; while (src[ind]) { dest[ind] = src[ind]; ind++; } dest[ind] = ' '; return dest + ind + 1; } #define NOUN_MAX_WEIGHT 340 #define ADJECTIVES_MAX_WEIGHT 289 #define ADVERBS_MAX_WEIGHT 262 #define AUXILLARIES_MAX_WEIGHT 18 #define VERBS_MAX_WEIGHT 174 #define PREPOSITIONS_MAX_WEIGHT 456 static char *noun_index[NOUN_MAX_WEIGHT + 1]; static char *adjectives_index[ADJECTIVES_MAX_WEIGHT + 1]; static char *adverbs_index[ADVERBS_MAX_WEIGHT + 1]; static char *auxillaries_index[AUXILLARIES_MAX_WEIGHT + 1]; static char *verbs_index[VERBS_MAX_WEIGHT + 1]; static char *prepositions_index[PREPOSITIONS_MAX_WEIGHT + 1]; static char *szTextPool = NULL; static long txtBufferSize = 0; // generate a lookup table for weight -> str static void gen_index(char **index, distribution *s) { for (size_t w = 0; w <= s->list[s->count - 1].weight; w++) { long i = 0; while (s->list[i].weight < w) i++; index[w] = s->list[i].text; } } static char *gen_text_index(char *dest, seed_t *seed, char **index, distribution *s) { long i = 0; DSS_HUGE j; RANDOM(j, 1, s->list[s->count - 1].weight, seed); char *src = index[j]; int ind = 0; while (src[ind]) { dest[ind] = src[ind]; ind++; } dest[ind] = ' '; return dest + ind + 1; } static char *gen_vp(char *dest, seed_t *seed) { DSS_HUGE j; RANDOM(j, 1, vp.list[vp.count - 1].weight, seed); int index = 0; index += vp.list[0].weight < j; index += vp.list[1].weight < j; index += vp.list[2].weight < j; if (index == 0) { dest = gen_text_index(dest, seed, verbs_index, &verbs); } else if (index == 1) { dest = gen_text_index(dest, seed, auxillaries_index, &auxillaries); dest = gen_text_index(dest, seed, verbs_index, &verbs); } else if (index == 2) { dest = gen_text_index(dest, seed, verbs_index, &verbs); dest = gen_text_index(dest, seed, adverbs_index, &adverbs); } else { dest = gen_text_index(dest, seed, auxillaries_index, &auxillaries); dest = gen_text_index(dest, seed, verbs_index, &verbs); dest = gen_text_index(dest, seed, adverbs_index, &adverbs); } return dest; } static char *gen_np(char *dest, seed_t *seed) { DSS_HUGE j; RANDOM(j, 1, np.list[np.count - 1].weight, seed); int index = 0; index += np.list[0].weight < j; index += np.list[1].weight < j; index += np.list[2].weight < j; if (index == 0) { dest = gen_text_index(dest, seed, noun_index, &nouns); } else if (index == 1) { dest = gen_text_index(dest, seed, adjectives_index, &adjectives); dest = gen_text_index(dest, seed, noun_index, &nouns); } else if (index == 2) { dest = gen_text_index(dest, seed, adjectives_index, &adjectives); dest[-1] = ','; *(dest++) = ' '; dest = gen_text_index(dest, seed, adjectives_index, &adjectives); dest = gen_text_index(dest, seed, noun_index, &nouns); } else { dest = gen_text_index(dest, seed, adverbs_index, &adverbs); dest = gen_text_index(dest, seed, adjectives_index, &adjectives); dest = gen_text_index(dest, seed, noun_index, &nouns); } return dest; } static char *gen_preposition(char *dest, seed_t *seed) { dest = gen_text_index(dest, seed, prepositions_index, &prepositions); *(dest++) = 't'; *(dest++) = 'h'; *(dest++) = 'e'; *(dest++) = ' '; return gen_np(dest, seed); } static char *gen_terminator(char *dest, seed_t *seed) { dest = gen_text(--dest, seed, &terminators); return dest - 1; } static char *gen_sentence(char *dest, seed_t *seed) { const char *cptr; int i; DSS_HUGE j; RANDOM(j, 1, grammar.list[grammar.count - 1].weight, seed); int index = 0; index += grammar.list[0].weight < j; index += grammar.list[1].weight < j; index += grammar.list[2].weight < j; index += grammar.list[3].weight < j; cptr = grammar.list[index].text; if (index == 0) { dest = gen_np(dest, seed); dest = gen_vp(dest, seed); dest = gen_terminator(dest, seed); } else if (index == 1) { dest = gen_np(dest, seed); dest = gen_vp(dest, seed); dest = gen_preposition(dest, seed); dest = gen_terminator(dest, seed); } else if (index == 2) { dest = gen_np(dest, seed); dest = gen_vp(dest, seed); dest = gen_np(dest, seed); dest = gen_terminator(dest, seed); } else if (index == 3) { dest = gen_np(dest, seed); dest = gen_preposition(dest, seed); dest = gen_vp(dest, seed); dest = gen_np(dest, seed); dest = gen_terminator(dest, seed); } else { dest = gen_np(dest, seed); dest = gen_preposition(dest, seed); dest = gen_vp(dest, seed); dest = gen_preposition(dest, seed); dest = gen_terminator(dest, seed); } *dest = ' '; return dest + 1; } /* * init_text_pool() -- * allocate and initialize the internal text pool buffer (szTextPool). * Make sure to call it before using dbg_text(). */ void init_text_pool(long bSize, DBGenContext *ctx) { gen_index(noun_index, &nouns); gen_index(adjectives_index, &adjectives); gen_index(adverbs_index, &adverbs); gen_index(auxillaries_index, &auxillaries); gen_index(verbs_index, &verbs); gen_index(prepositions_index, &prepositions); txtBufferSize = bSize; szTextPool = (char*)malloc(bSize + 1 + 100); char *ptr = szTextPool; char *endptr = szTextPool + bSize + 1; while (ptr < endptr) { ptr = gen_sentence(ptr, &ctx->Seed[5]); } szTextPool[bSize] = '\0'; } void free_text_pool() { free(szTextPool); } /* * dbg_text() -- * produce ELIZA-like text of random, bounded length, truncating the last * generated sentence as required */ void dbg_text(char *tgt, int min, int max, seed_t *seed) { DSS_HUGE hgLength = 0, hgOffset, wordlen = 0, s_len, needed; char sentence[MAX_SENT_LEN + 1], *cp; RANDOM(hgOffset, 0, txtBufferSize - max, seed); RANDOM(hgLength, min, max, seed); strncpy(&tgt[0], &szTextPool[hgOffset], (int)hgLength); tgt[hgLength] = '\0'; return; }