# name: test/sql/sample/test_sample.test_slow
# description: Test SAMPLE keyword
# group: [sample]

statement ok
PRAGMA enable_verification;

# fixture: a three-row, two-column table that the SAMPLE checks in this file read from
statement ok
CREATE TABLE test (a INTEGER, b INTEGER);

statement ok
INSERT INTO test VALUES (11, 22), (12, 21), (13, 22)
|
|
|
|
# sampling a fixed number of rows: COUNT(*) equals the requested sample size
query I
SELECT COUNT(*) FROM test USING SAMPLE 0;
----
0

query I
SELECT COUNT(*) FROM test USING SAMPLE 1;
----
1

# same request with the ROWS keyword spelled out
query I
SELECT COUNT(*) FROM test USING SAMPLE 1 ROWS;
----
1

query I
SELECT COUNT(*) FROM test USING SAMPLE 3;
----
3

# a sample size larger than the input returns every row of the input
query I
SELECT COUNT(*) FROM test USING SAMPLE 10;
----
3
|
|
|
|
# name the sampling method explicitly
query I
SELECT COUNT(*) FROM test USING SAMPLE 3 (reservoir)
----
3

# sampling method followed by a seed
query I
SELECT COUNT(*) FROM test USING SAMPLE 3 (reservoir, 3)
----
3

# an oversized sample returns the full table; ORDER BY keeps the row order deterministic
# (the two columns of each expected row are separated by a tab character)
query II
SELECT * FROM test USING SAMPLE 10 ORDER BY a, b
----
11	22
12	21
13	22
|
|
|
|
# a handful of rows drawn from a 10000-row input
query I
SELECT COUNT(*) FROM range(10000) USING SAMPLE 5;
----
5

# large input with a sample size over RESERVOIR_THRESHOLD = 100000
query I
SELECT COUNT(*) FROM range(2000000) USING SAMPLE 1000100;
----
1000100

# tiny sample drawn from the same two-million-row input
query I
SELECT COUNT(*) FROM range(2000000) USING SAMPLE 2;
----
2
|
|
|
|
# sampling with multiple columns: every row of test2 holds the same values,
# so the sampled output is known regardless of which rows are picked
statement ok
CREATE TABLE test2 AS SELECT i a, i::VARCHAR b, CONCAT(i, ' - ', i) c FROM repeat(1, 1000) tbl(i)

# the three columns of each expected row are separated by tab characters;
# the spaces inside "1 - 1" belong to the value of column c
query III
SELECT a, b, c FROM test2 USING SAMPLE 3;
----
1	1	1 - 1
1	1	1 - 1
1	1	1 - 1
|
|
|
|
# SAMPLE inside an uncorrelated scalar subquery
query I
SELECT (SELECT COUNT(*) FROM test USING SAMPLE 1);
----
1

# SAMPLE inside a correlated scalar subquery: the sampled count is 1, so each outer row yields i + 1
query I
SELECT (SELECT COUNT(*) + tbl.i FROM test USING SAMPLE 1) FROM range(3) AS tbl(i) ORDER BY tbl.i;
----
1
2
3
|
|
|
|
# a negative sample size is rejected
statement error
SELECT COUNT(*) FROM test USING SAMPLE -1;
----

# the sample size must be a number: a string literal and a DATE literal are both rejected
statement error
SELECT COUNT(*) FROM test USING SAMPLE 'hello';
----

statement error
SELECT COUNT(*) FROM test USING SAMPLE DATE '1992-01-01';
----
|
|
|
|
# the postgres/sqlserver-style TABLESAMPLE syntax is supported as well
statement ok
CREATE TABLE integers AS SELECT i FROM range(200) tbl(i);

# a bare argument is taken as sample_size (syntax follows the postgres rules): 10 rows here
query I
SELECT COUNT(*) FROM integers TABLESAMPLE reservoir(10);
----
10

# 10% of the 200-row table
query I
SELECT COUNT(*) FROM integers TABLESAMPLE reservoir(10%);
----
20

# PERCENT keyword instead of the % sign
query I
SELECT COUNT(*) FROM integers TABLESAMPLE reservoir(10 PERCENT);
----
20

# explicit ROWS keyword
query I
SELECT COUNT(*) FROM integers TABLESAMPLE reservoir(10 ROWS);
----
10
|
|
|
|
# the sampling method may be left out, falling back to the default method
query I
SELECT COUNT(*) FROM integers TABLESAMPLE(10 ROWS);
----
10

# the native form (size first, options in trailing parentheses) also works after TABLESAMPLE
query I
SELECT COUNT(*) FROM integers TABLESAMPLE 10;
----
10

query I
SELECT COUNT(*) FROM integers TABLESAMPLE 10 ROWS (reservoir);
----
10

query I
SELECT COUNT(*) FROM integers TABLESAMPLE 10 ROWS (reservoir, 250);
----
10
|
|
|
|
# TABLESAMPLE applied to a table-producing function: 10% of 200 rows
query I
SELECT COUNT(*) FROM range(200) TABLESAMPLE reservoir(10%);
----
20

# ... and applied to a subquery
query I
SELECT COUNT(*) FROM (SELECT * FROM range(200)) tbl(i) TABLESAMPLE reservoir(10%);
----
20
|
|
|
|
# a fixed seed gives repeatable output: each labelled query below is re-run
# on every iteration of the loop, once per sampling method
loop i 0 10

query I nosort reservoirseed
SELECT * FROM range(100) TABLESAMPLE reservoir(10 ROWS) REPEATABLE(250);
----

query I nosort bernoulliseed
SELECT * FROM range(100) TABLESAMPLE bernoulli(10%) REPEATABLE(250);
----

query I nosort systemseed
SELECT * FROM range(100) TABLESAMPLE system(10%) REPEATABLE(250);
----

endloop
|
|
|
|
# fractional percentages of a 1000-row input: 0.01% yields no rows, 0.1% yields one row
query I
SELECT COUNT(*) FROM range(1000) USING SAMPLE reservoir(0.01%);
----
0

query I
SELECT COUNT(*) FROM range(1000) USING SAMPLE reservoir(0.1%);
----
1
|
|
|
|
# bernoulli and system sampling cannot be combined with a fixed number of rows
statement error
SELECT * FROM integers USING SAMPLE bernoulli(5 ROWS);
----

statement error
SELECT * FROM integers USING SAMPLE system(5 ROWS);
----

# a sample_size of 10000% is out of range
statement error
SELECT * FROM integers USING SAMPLE 10000%;
----
|
|
|
|
# REPEATABLE with a fixed seed: each query is issued twice under the same
# label so that both runs can be compared against each other.
# The sort mode is spelled `nosort`, as in the seeded loop earlier in this file.
# NOTE(review): expected rows are written out for only one query of each
# labelled pair; confirm how the runner treats listed values on labelled
# queries before relying on them.
query I nosort repeatable_seed_0
select i from integers using sample (1 rows) repeatable (0);
----
96

query I nosort repeatable_seed_0
select i from integers using sample (1 rows) repeatable (0);
----

# 1% of the 200-row table; ORDER BY keeps the output order stable
query I nosort repeatable_seed_1
select i from integers using sample reservoir(1%) repeatable (0) order by i;
----

query I nosort repeatable_seed_1
select i from integers using sample reservoir(1%) repeatable (0) order by i;
----
58
127
|