should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

View File

@@ -0,0 +1,67 @@
# name: benchmark/micro/join/iejoin_employees.benchmark
# description: Range join between integers
# group: [join]
name Range Join
group join
# (1) Employees. A dataset that contains employees' salary and tax information [3] with eight attributes:
# state, married, dependents, salary, tax, and three others for notes.
# The relation has been populated with real-life data: tax rates, income brackets, and exemptions
# for each state in the USA have been manually collected to generate synthetic tax records.
# We used the following self-join query to identify anomalies [7]:
# Q1 : SELECT r.id, s.id
# FROM Employees r, Employees s
# WHERE r.salary < s.salary AND r.tax > s.tax;
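# The above query returns a set of employee pairs, where one employee earns a higher salary than the other but pays less tax.
# To make sure that we generate output for Q1, we selected 10% random rows and increased their tax values.
# NOTE: the 10% figure describes the original dataset. The synthetic table built in the load step of this
# benchmark instead perturbs roughly 1% of rows (random() <= 0.01) by lowering their tax by 11.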
# Employees2 is a group of larger input datasets with up to 6 Billion records, but with only 0.001% random changes to tax values.
# The higher selectivity is used to test the distributed algorithm on large input files.
load
/* Pool of 20 surnames. The declaration order below is the order in which enum_range() lists the labels. */
CREATE TYPE surname_t AS ENUM (
    'Smith',    'Johnson',  'Williams', 'Jones',    'Brown',
    'Davis',    'Miller',   'Wilson',   'Moore',    'Taylor',
    'Anderson', 'Thomas',   'Jackson',  'White',    'Harris',
    'Martin',   'Thompson', 'Garcia',   'Martinez', 'Robinson'
);
/* Seed the random number generator before the table is generated. */
SELECT SETSEED(0.8675309);
/*
 * employees: one synthetic row per id produced by range(1, 10000000).
 *   salary = 100 * id, so salary strictly increases with id.
 *   tax    = salary / 10, lowered by 11 for roughly 1% of rows
 *            (random() <= 0.01). These lowered rows are what lets the
 *            predicate r.salary < s.salary AND r.tax > s.tax match.
 *   dept   = round(random() * 5) cast to INTEGER.
 *   name   = a label of surname_t picked by list index. List indexes
 *            are 1-based, so the index is 1 + floor(random() * 20),
 *            which covers 1..20 uniformly. An index built from
 *            round(random() * 19) would cover 0..19 instead, giving
 *            NULL at index 0 and never reaching the 20th label.
 */
CREATE TABLE employees AS
    SELECT
        facts.id AS id,
        facts.surname AS "name",
        facts.dept AS dept,
        facts.salary AS salary,
        (facts.salary / 10 - CASE WHEN random() <= 0.01 THEN (10 + 1) ELSE 0 END)::INTEGER AS tax
    FROM (
        SELECT
            id,
            enum_range(NULL::surname_t)[(1 + floor(random() * 20))::INTEGER] AS surname,
            round(random() * 5)::INTEGER AS dept,
            100 * id AS salary
        FROM (SELECT UNNEST(range(1, 10000000))) tbl(id)
    ) facts
;
run
/* Q1: count the employee pairs in which r has the lower salary but the higher tax. */
SELECT COUNT(*)
FROM (
    SELECT
        r.id AS r_id,
        s.id AS s_id
    FROM employees AS r
    INNER JOIN employees AS s
        ON r.salary < s.salary
        AND r.tax > s.tax
) AS q1;