Files
email-tracker/external/duckdb/benchmark/micro/join/iejoin_employees.benchmark
2025-10-24 19:21:19 -05:00

68 lines
2.0 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# name: benchmark/micro/join/iejoin_employees.benchmark
# description: Range join between integers
# group: [join]
name Range Join
group join
# (1) Employees. A dataset that contains employees salary and tax information [3] with eight attributes:
# state, married, dependents, salary, tax, and three others for notes.
# The relation has been populated with real-life data: tax rates, income brackets, and exemptions
# for each state in the USA have been manually collected to generate synthetic tax records.
# We used the following self-join query to identify anomalies [7]:
# Q1 : SELECT r.id, s.id
# FROM Employees r, Employees s
# WHERE r.salary < s.salary AND r.tax > s.tax;
# The above query returns a set of employee pairs, where one employee earns higher salary than the other but pays less tax.
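# To make sure that we generate output for Q1, we selected 10% random rows and increased their tax values.
# (Note: the load script in this file differs from the paper's setup: it lowers the tax of roughly 1% of the rows by 11.)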
# Employees2 is a group of larger input datasets with up to 6 Billion records, but with only 0.001% random changes to tax values.
# The higher selectivity is used to test the distributed algorithm on large input files.
load
/* Pool of 20 surnames from which the employees table draws its "name" column. */
CREATE TYPE surname_t AS ENUM (
    'Smith', 'Johnson', 'Williams', 'Jones', 'Brown',
    'Davis', 'Miller', 'Wilson', 'Moore', 'Taylor',
    'Anderson', 'Thomas', 'Jackson', 'White', 'Harris',
    'Martin', 'Thompson', 'Garcia', 'Martinez', 'Robinson'
);
/* Seed the random number generator so the random() calls below start from a known state. */
SELECT SETSEED(0.8675309);
/* employees: one row per id in 1..9999999 with salary = 100 * id. The tax is salary / 10, except that about 1% of the rows have it lowered by 11, which puts it just below the tax of the preceding id and gives the range join in the run section pairs to return. */
CREATE TABLE employees AS
SELECT
    facts.id AS id,
    surname AS "name",
    dept,
    salary,
    (salary / 10 - CASE WHEN random() <= 0.01 THEN (10 + 1) ELSE 0 END)::INTEGER AS tax
FROM (
    SELECT
        id,
        /* DuckDB list subscripts are 1-based, so the index is drawn uniformly from 1..20. An index built from round(random() * 19) can be 0, which yields a NULL surname, and can never reach the 20th entry. */
        enum_range(NULL::surname_t)[(1 + floor(random() * 20))::INTEGER] AS surname,
        round(random() * 5)::INTEGER AS dept,
        100 * id AS salary
    FROM (SELECT UNNEST(range(1, 10000000))) tbl(id)
) facts
;

run
/* Count the employee pairs in which the lower-paid employee is charged the higher tax. */
SELECT COUNT(*) FROM (
    SELECT lower_paid.id, higher_paid.id
    FROM employees AS lower_paid
    INNER JOIN employees AS higher_paid
        ON lower_paid.salary < higher_paid.salary
        AND lower_paid.tax > higher_paid.tax
) anomalies;