68 lines
2.0 KiB
Plaintext
68 lines
2.0 KiB
Plaintext
# name: benchmark/micro/join/iejoin_employees.benchmark
|
||
# description: Range join between integers
|
||
# group: [join]
|
||
|
||
name Range Join
|
||
group join
|
||
|
||
# (1) Employees. A dataset that contains employees’ salary and tax information [3] with eight attributes:
|
||
# state, married, dependents, salary, tax, and three others for notes.
|
||
# The relation has been populated with real-life data: tax rates, income brackets, and exemptions
|
||
# for each state in the USA have been manually collected to generate synthetic tax records.
|
||
# We used the following self-join query to identify anomalies [7]:
|
||
# Q1 : SELECT r.id, s.id
|
||
# FROM Employees r, Employees s
|
||
# WHERE r.salary < s.salary AND r.tax > s.tax;
|
||
# The above query returns a set of employee pairs, where one employee earns higher salary than the other but pays less tax.
|
||
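# To make sure that we generate output for Q1, we selected 10% random rows and increased their tax values.
# (Note for this benchmark: the load section below perturbs the data differently. It lowers the tax of roughly 1% of the rows by 11.)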
|
||
# Employees2 is a group of larger input datasets with up to 6 Billion records, but with only 0.001% random changes to tax values.
|
||
# The higher selectivity is used to test the distributed algorithm on large input files.
|
||
|
||
load
|
||
/* Enum of the twenty surnames used for the employees name column. Label order matters because the load query picks a label by its position in enum_range. */
CREATE TYPE surname_t AS ENUM (
    'Smith', 'Johnson', 'Williams', 'Jones', 'Brown',
    'Davis', 'Miller', 'Wilson', 'Moore', 'Taylor',
    'Anderson', 'Thomas', 'Jackson', 'White', 'Harris',
    'Martin', 'Thompson', 'Garcia', 'Martinez', 'Robinson'
);
|
||
/* Fix the seed used by the random() calls in the table definition below. */
SELECT SETSEED(0.8675309);
/*
 * employees: one row per id produced by range(1, 10000000), whose upper bound is exclusive.
 *   id     - sequential integer key
 *   name   - surname picked from the surname_t enum
 *   dept   - round(random() * 5) cast to INTEGER, an integer from 0 to 5
 *   salary - 100 * id, so salary grows strictly with id
 *   tax    - salary / 10, except that about 1% of the rows have 11 subtracted
 * Tax normally rises by 10 from one id to the next, so subtracting 11 pushes a
 * row below the tax of the row with the next-lower salary. Those are the pairs
 * that the run query (r.salary < s.salary AND r.tax > s.tax) counts.
 */
CREATE TABLE employees AS
SELECT
    facts.id AS id,
    facts.surname AS "name",
    facts.dept AS dept,
    facts.salary AS salary,
    (facts.salary / 10 - CASE WHEN random() <= 0.01 THEN (10 + 1) ELSE 0 END)::INTEGER AS tax
FROM (
    SELECT
        tbl.id AS id,
        /* List subscripts are 1-based, so the position is drawn from 1 to the number of labels. This covers every label and never yields position 0, which would give a NULL surname. */
        enum_range(NULL::surname_t)[
            (1 + floor(random() * len(enum_range(NULL::surname_t))))::INTEGER
        ] AS surname,
        round(random() * 5)::INTEGER AS dept,
        100 * tbl.id AS salary
    FROM (SELECT UNNEST(range(1, 10000000))) tbl(id)
) facts
;
|
||
|
||
run
|
||
/* Count the employee pairs in which r earns the lower salary yet pays the higher tax. */
SELECT COUNT(*)
FROM (
    SELECT
        r.id AS r_id,
        s.id AS s_id
    FROM employees AS r
    INNER JOIN employees AS s
        ON r.salary < s.salary
        AND r.tax > s.tax
) AS q1;
|