# name: benchmark/micro/join/iejoin_employees.benchmark
# description: Range join between integers
# group: [join]

name Range Join
group join

# (1) Employees. A dataset that contains employees’ salary and tax information [3] with eight attributes:
# state, married, dependents, salary, tax, and three others for notes.
# The relation has been populated with real-life data: tax rates, income brackets, and exemptions
# for each state in the USA have been manually collected to generate synthetic tax records.
# We used the following self-join query to identify anomalies [7]:
# Q1 : SELECT r.id, s.id
#      FROM Employees r, Employees s
#      WHERE r.salary < s.salary AND r.tax > s.tax;
# The above query returns a set of employee pairs, where one employee earns higher salary than the other but pays less tax.
# To make sure that we generate output for Q1, we selected 10% random rows and increased their tax values.
# Employees2 is a group of larger input datasets with up to 6 Billion records, but with only 0.001% random changes to tax values.
# The higher selectivity is used to test the distributed algorithm on large input files.

# Notes on the synthetic data generated below (this differs from the paper's description above):
#  * ids come from range(1, 10000000); salary = 100 * id, and the baseline tax is salary / 10.
#  * Rows where random() <= 0.01 (about 1%) have their tax LOWERED by 11. Because consecutive
#    baseline taxes differ by 10, such a row pays less tax than the row with the preceding id
#    (when that row is unperturbed) while earning more, which is the anomaly Q1 detects.
#  * surname is picked uniformly from the 20 surname_t values. List indexing is 1-based, so the
#    index is floor(random() * 20) + 1, i.e. 1..20; an index of 0 would extract NULL.
#  * dept is round(random() * 5), i.e. an integer in 0..5.
#  * SETSEED is called before the table is built so random() starts from a fixed seed.

load
CREATE TYPE surname_t AS ENUM (
    'Smith', 'Johnson', 'Williams', 'Jones', 'Brown',
    'Davis', 'Miller', 'Wilson', 'Moore', 'Taylor',
    'Anderson', 'Thomas', 'Jackson', 'White', 'Harris',
    'Martin', 'Thompson', 'Garcia', 'Martinez', 'Robinson'
);
SELECT SETSEED(0.8675309);
CREATE TABLE employees AS
SELECT
    facts.id AS id,
    surname AS "name",
    dept,
    salary,
    (salary / 10 - CASE WHEN random() <= 0.01 THEN (10 + 1) ELSE 0 END)::INTEGER AS tax
FROM (
    SELECT
        id,
        enum_range(NULL::surname_t)[(floor(random() * 20))::INTEGER + 1] AS surname,
        round(random() * 5)::INTEGER AS dept,
        100 * id AS salary
    FROM (SELECT UNNEST(range(1, 10000000))) tbl(id)
) facts
;

run
SELECT COUNT(*)
FROM (
    SELECT r.id, s.id
    FROM employees r, employees s
    WHERE r.salary < s.salary
      AND r.tax > s.tax
) q1;