should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

View File

@@ -0,0 +1,52 @@
# name: test/sql/sample/bernoulli_sampling.test_slow
# description: Test that 1% bernoulli samples of 1000 rows are never empty and that a seeded bernoulli sample is repeatable
# group: [sample]
# collects the number of rows returned by each sample drawn in the loop below
statement ok
create table output (num_rows INT);
# seed the random number generator so the loop below is deterministic
set seed 0.3
# draw a 1% bernoulli sample of 1000 distinct ids 500 times and record each sample's row count
loop i 0 500
statement ok
WITH some_tab AS (
SELECT UNNEST(range(1000)) AS id
),
some_tab_unq AS (
SELECT distinct(id) AS id FROM some_tab
),
sampled AS (
select id from some_tab_unq
USING SAMPLE 1% (bernoulli)
)
INSERT INTO output select count(*) as n_rows FROM sampled;
endloop
# no iteration may have produced an empty sample
query II
select min(num_rows) > 0, count(*) FILTER (num_rows = 0) = 0 from output;
----
true true
# cross-check: no row has num_rows = 0, so aggregating rowid over that filter yields only NULLs
query III
select avg(rowid), min(rowid), max(rowid) from output where num_rows = 0;
----
NULL NULL NULL
statement ok
create table t1 as select range id from range(1000);
set seed 0.6
# the same bernoulli sample with an explicit seed (5) is run twice under one label,
# so both runs must produce the same result hash
query I nosort result_1
select id from t1 USING SAMPLE 1% (bernoulli, 5);
----
query I nosort result_1
select id from t1 USING SAMPLE 1% (bernoulli, 5);
----

View File

@@ -0,0 +1,44 @@
# name: test/sql/sample/can_sample_from_ingested_files.test
# description: Test that tables ingested from CSV and Parquet files can be created and read back identically
# group: [sample]
require parquet
statement ok
PRAGMA enable_verification;
# source table: every column of test_all_types() except the enum, union and bit columns
statement ok
create table all_types as select * exclude(small_enum, medium_enum, large_enum, "union", bit) from test_all_types();
# test csv: write the table out once and ingest it twice
statement ok
copy all_types to '__TEST_DIR__/sample_all_types.csv' (FORMAT CSV);
statement ok
Create table all_types_csv_1 as select * from read_csv_auto('__TEST_DIR__/sample_all_types.csv');
statement ok
Create table all_types_csv_2 as select * from read_csv_auto('__TEST_DIR__/sample_all_types.csv');
# both CSV-backed tables share the label result_1, so their result hashes must match
query T nosort result_1
select * from all_types_csv_1;
query T nosort result_1
select * from all_types_csv_2;
statement ok
copy (SELECT * from all_types) to '__TEST_DIR__/sample_all_types.parquet' (FORMAT PARQUET);
# test parquet
statement ok
Create table all_types_parquet_1 as select * from read_parquet('__TEST_DIR__/sample_all_types.parquet');
statement ok
Create table all_types_parquet_2 as select * from read_parquet('__TEST_DIR__/sample_all_types.parquet');
# both parquet-backed tables must use the exact same label; a misspelled label
# silently creates a second, never-compared label and the check is lost
query T nosort result_parquet
select * from all_types_parquet_1;
query T nosort result_parquet
select * from all_types_parquet_2;

View File

@@ -0,0 +1,21 @@
# name: test/sql/sample/get_multiple_samples_small.test_slow
# description: Run a sample multiple times (internal#4236)
# group: [sample]
# limit memory for the repeated sampling below
statement ok
pragma memory_limit='10G';
# 10 million row table to sample from
statement ok
CREATE OR REPLACE TABLE blah as (
SELECT *
FROM range(10_000_000)
);
# take a 100-row sample 500 times; every run only has to succeed
loop i 0 500
statement ok
SELECT * FROM blah TABLESAMPLE 100 ROWS;
endloop

View File

@@ -0,0 +1,14 @@
# name: test/sql/sample/large_reservoir_sample.test_slow
# description: Test reservoir sample crash on large data sets
# group: [sample]
statement ok
PRAGMA enable_verification;
# one million row input table
statement ok
create table integers as from range(1000000);
# a 99% reservoir sample of 1,000,000 rows must contain exactly 990,000 rows
query I
select count(*) from integers USING SAMPLE 99.0% (Reservoir);
----
990000

View File

@@ -0,0 +1,40 @@
# name: test/sql/sample/large_sample.test_slow
# description: Test sampling of larger relations
# group: [sample]
statement ok
PRAGMA enable_verification;
# sample on a larger data set
query I
SELECT COUNT(*) FROM range(10000) USING SAMPLE 5
----
5
# test sample with multiple columns
# we insert the same data in the entire column
statement ok
CREATE TABLE test2 AS SELECT i a, i::VARCHAR b, CONCAT(i, ' - ', i) c FROM repeat(1, 1000) tbl(i)
# every row of test2 is identical, so any 3-row sample yields the same three rows
query III
SELECT a, b, c FROM test2 USING SAMPLE 3;
----
1 1 1 - 1
1 1 1 - 1
1 1 1 - 1
# reservoir sample from a larger dataset
# 90% of 200,000 rows = 180,000 rows
query I
select count(*) from range(200000) tablesample reservoir(90%);
----
180000
# the sampled count is computed three times under one label, so every iteration must produce the same result hash
loop i 0 3
# sample_size sampling with a large reservoir
query I nosort reservoirlarge
select count(*) from (select * from range(200000) tbl(i) where i % 997 != 0) tbl(i) using sample 80% (reservoir);
----
endloop

View File

@@ -0,0 +1,85 @@
# name: test/sql/sample/reservoir_testing_percentage.test
# description: Test that percentage reservoir samples return exact row counts for different thread counts
# group: [sample]
# repeat every check once per loop value of i, which is used as the thread count
loop i 1 8
statement ok
pragma threads=${i};
# start each iteration from a fresh 1,000 row table
statement ok
CREATE or replace TABLE t1 as select range a from range(1000);
query I
SELECT count(*) from t1 using sample 0 percent (reservoir);
----
0
query I
SELECT count(*) from t1 using sample 10 percent (reservoir);
----
100
query I
SELECT count(*) from t1 using sample 20 percent (reservoir);
----
200
query I
SELECT count(*) from t1 using sample 80 percent (reservoir);
----
800
query I
SELECT count(*) from t1 using sample 100 percent (reservoir);
----
1000
# append 9,000 rows: t1 now holds 10,000 rows
statement ok
Insert into t1 select range a from range(9000);
query I
select count(*) from t1 using sample 80 percent (reservoir);
----
8000
# append 90,000 + 900,000 rows: t1 now holds 1,000,000 rows
statement ok
Insert into t1 select range a from range(90000);
statement ok
Insert into t1 select range a from range(900000);
query I
select count(*) from t1 using sample 20 percent (reservoir);
----
200000
query I
select count(*) from t1 using sample 30 percent (reservoir);
----
300000
query I
select count(*) from t1 using sample 40 percent (reservoir);
----
400000
query I
select count(*) from t1 using sample 50 percent (reservoir);
----
500000
query I
select count(*) from t1 using sample 60 percent (reservoir);
----
600000
query I
select count(*) from t1 using sample 70 percent (reservoir);
----
700000
endloop

View File

@@ -0,0 +1,79 @@
# name: test/sql/sample/reservoir_testing_percentage.test_slow
# description: Test that percentage reservoir samples of a multi-column table return exact row counts for different thread counts
# group: [sample]
# repeat every check once per loop value of i, which is used as the thread count
loop i 1 8
statement ok
pragma threads=${i};
# 1,000 row table with an integer, a list, a varchar and a timestamp column;
# every column gets its own alias (a, b, c, d) instead of reusing c twice
statement ok
CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(1000);
query I
SELECT count(*) from t1 using sample 0 percent (reservoir);
----
0
query I
SELECT count(*) from t1 using sample 10 percent (reservoir);
----
100
query I
SELECT count(*) from t1 using sample 20 percent (reservoir);
----
200
query I
SELECT count(*) from t1 using sample 80 percent (reservoir);
----
800
query I
SELECT count(*) from t1 using sample 100 percent (reservoir);
----
1000
# same table shape with 10,000 rows
statement ok
CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(10000);
query I
select count(*) from t1 using sample 80 percent (reservoir);
----
8000
# same table shape with 1,000,000 rows
statement ok
CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(1000000);
query I
select count(*) from t1 using sample 20 percent (reservoir);
----
200000
query I
select count(*) from t1 using sample 30 percent (reservoir);
----
300000
query I
select count(*) from t1 using sample 40 percent (reservoir);
----
400000
query I
select count(*) from t1 using sample 50 percent (reservoir);
----
500000
query I
select count(*) from t1 using sample 60 percent (reservoir);
----
600000
query I
select count(*) from t1 using sample 70 percent (reservoir);
----
700000
endloop

View File

@@ -0,0 +1,96 @@
# name: test/sql/sample/reservoir_testing_rows_value.test_slow
# description: Test that fixed-row-count samples return exactly the requested number of rows for different thread counts
# group: [sample]
# repeat every check once per loop value of i, which is used as the thread count
loop i 1 8
statement ok
pragma threads=${i};
# 1,000 row table
statement ok
CREATE or replace TABLE t1 as select range a from range(1000);
query I
SELECT count(*) from t1 using sample 0;
----
0
query I
SELECT count(*) from t1 using sample 100;
----
100
query I
SELECT count(*) from t1 using sample 200;
----
200
query I
SELECT count(*) from t1 using sample 800;
----
800
query I
SELECT count(*) from t1 using sample 1000;
----
1000
# 10,000 row table
statement ok
create or replace table t1 as select * from range(10000);
query I
select count(*) from t1 using sample 1000;
----
1000
query I
select count(*) from t1 using sample 3000;
----
3000
query I
select count(*) from t1 using sample 6000;
----
6000
query I
select count(*) from t1 using sample 8000;
----
8000
# 1,000,000 row table
statement ok
Create or replace table t1 as select range a from range(1000000);
query I
select count(*) from t1 using sample 200000;
----
200000
query I
select count(*) from t1 using sample 300000;
----
300000
query I
select count(*) from t1 using sample 400000;
----
400000
query I
select count(*) from t1 using sample 500000;
----
500000
query I
select count(*) from t1 using sample 600000;
----
600000
query I
select count(*) from t1 using sample 700000;
----
700000
endloop

View File

@@ -0,0 +1,27 @@
# name: test/sql/sample/same_seed_same_sample.test
# description: Test same seed same sample for system and bernoulli samples
# group: [sample]
# the expected values below are pinned to this vector size
require vector_size 2048
statement ok
CREATE OR REPLACE TABLE test AS SELECT UNNEST(RANGE(100000)) as x;
# a system sample with REPEATABLE (42) must return the same count and minimum on each of the 20 runs
loop i 0 20
query II
SELECT COUNT(*), min(x) FROM test TABLESAMPLE system (25 PERCENT) REPEATABLE (42);
----
20480 12288
endloop
# the same holds for a bernoulli sample with REPEATABLE (42)
loop i 0 20
query II
SELECT COUNT(*), min(x) FROM test TABLESAMPLE BERNOULLI (25 PERCENT) REPEATABLE (42);
----
24903 6
endloop

View File

@@ -0,0 +1,73 @@
# name: test/sql/sample/same_seed_same_sample.test_slow
# description: Test that reservoir samples with a repeatable seed return the same sample
# group: [sample]
# testing a table with less cardinality than the standard vector size
# the four columns carry distinct aliases (a, b, c, d); SELECT * therefore returns four columns
statement ok
CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(1000);
# all runs share the label result_1, so every thread count must produce the same result hash
loop i 1 8
statement ok
pragma threads=${i};
query IIII nosort result_1
SELECT * from t1 using sample reservoir(100) repeatable (1) order by a;
----
query IIII nosort result_1
SELECT * from t1 using sample reservoir(100) repeatable (1) order by a;
----
endloop
# testing a table with greater cardinality than the standard vector size, and greater than a row group size.
statement ok
CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(100000);
# samples are only equal when threads = 1
statement ok
set threads=1;
query IIII nosort result_2
SELECT * from t1 using sample reservoir(6000) repeatable (1) order by a;
----
query IIII nosort result_2
SELECT * from t1 using sample reservoir(6000) repeatable (1) order by a;
----
statement ok
CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(1000000);
# NOTE(review): this loop never uses ${i}; threads stays at the value set above (1) for all iterations
loop i 1 8
query IIII nosort result_3
SELECT * from t1 using sample reservoir(60) repeatable (1) order by a;
----
query IIII nosort result_3
SELECT * from t1 using sample reservoir(60) repeatable (1) order by a;
----
endloop
# with no repeatable, then sample is not the same
statement ok
create table sample1 as SELECT * from t1 using sample reservoir(10) order by a;
statement ok
create table sample2 as SELECT * from t1 using sample reservoir(10) order by a;
# two unseeded 10-row samples must not be identical: fewer than 10 rows may be shared
query I
select count(*) < 10 from (select * from sample1 intersect select * from sample2);
----
true

View File

@@ -0,0 +1,28 @@
# name: test/sql/sample/same_seed_same_sample_vec_size_2.test
# description: Test same seed same sample for system and bernoulli samples with a vector size of 2
# group: [sample]
# the expected values below are pinned to this exact vector size
require exact_vector_size 2
statement ok
CREATE OR REPLACE TABLE test AS SELECT UNNEST(RANGE(100000)) as x;
# a system sample with REPEATABLE (42) must return the same count and minimum on each of the 20 runs
loop i 0 20
query II
SELECT COUNT(*), min(x) FROM test TABLESAMPLE system (25 PERCENT) REPEATABLE (42);
----
24952 12
endloop
# the same holds for a bernoulli sample with REPEATABLE (42)
loop i 0 20
query II
SELECT COUNT(*), min(x) FROM test TABLESAMPLE BERNOULLI (25 PERCENT) REPEATABLE (42);
----
24903 6
endloop

View File

@@ -0,0 +1,43 @@
# name: test/sql/sample/sample_verification.test_slow
# description: Test SAMPLE keyword: sampling without replacement, seeded repeatability and exact reservoir counts
# group: [sample]
statement ok
PRAGMA enable_verification;
# verify that it is a sample without replacement (i.e. the same row will never occur more than once in the result)
loop i 0 10
query I
select count(distinct i) from range(10) tbl(i) using sample 5;
----
5
endloop
# specifying a seed leads to repeatable behavior
# each sampling method has its own label, so across the 10 iterations every method must keep producing the same result hash
loop i 0 10
query I nosort reservoirseed
select * from range(100) using sample 10 (reservoir, 250)
----
query I nosort bernoulliseed
select * from range(100) using sample 10% (bernoulli, 250)
----
query I nosort systemseed
select * from range(100) using sample 10% (system, 250)
----
endloop
# specify as sample_size, with reservoir sampling this should give us an exact count (i.e. always 10)
loop i 0 10
query I
select count(*) from range(100) using sample 10% (reservoir)
----
10
endloop

View File

@@ -0,0 +1,88 @@
# name: test/sql/sample/table_samples/basic_sample_tests.test
# description: Test basic sample sizes and the stored table sample exposed by duckdb_table_sample
# group: [table_samples]
# the statements below are skipped while skip mode is active
mode skip
# currently require fixed vector size since the "randomness" of the sample depends on
# the vector size. If the vector size decreases, the randomness of the sample decreases
# This is especially noticeable for small tables and their samples
require vector_size 2048
statement ok
PRAGMA enable_verification
# use an on-disk database so the sample can be checked again after the restart at the end
load __TEST_DIR__/test_samples_basic.db
# fixed-size samples return exactly the requested number of rows
query I
select count(*) from range(100000) using sample (10000);
----
10000
query I
select count(*) from range(100) using sample (10);
----
10
query I
select count(*) from range(205000) using sample (10000);
----
10000
statement ok
create table t1 as select range a from range(204800);
# the stored sample of a table can be queried
statement ok
select * from duckdb_table_sample('t1');
statement ok
create or replace table t1 as select range a from range(1000);
# the sample average of values 0..999 must lie between 200 and 800
query II
select avg(a) > 200, avg(a) < 800 from duckdb_table_sample('t1');
----
true true
statement ok
create or replace table t1 as select range a from range(204800);
# average is not skewed
query II
select avg(a) > (0.2*204800), avg(a) < (0.8*204800) from duckdb_table_sample('t1');
----
true true
# about half the samples are below 102400 and half above
query I
select count(*) from duckdb_table_sample('t1') where a < 102400;
----
1069
query I
select count(*) from duckdb_table_sample('t1') where a > 102400;
----
979
query I
select count(*) from t1 using sample (200000);
----
200000
statement ok
create or replace table materialized_range as select * from range(100);
statement ok
create or replace table integers_1 as (select range b from materialized_range);
# the sample of the 100-row table holds exactly one row
query I
select count(b) from duckdb_table_sample('integers_1') where b in (select * from materialized_range);
----
1
# sample exists after restart
restart
query I
select count(b) from duckdb_table_sample('integers_1') where b in (select * from materialized_range);
----
1

View File

@@ -0,0 +1,40 @@
# name: test/sql/sample/table_samples/sample_stores_rows_from_later_on.test_slow
# description: Test that rows inserted one at a time, with a restart after each insert, still end up in the table sample
# group: [table_samples]
# the statements below are skipped while skip mode is active
mode skip
# required when testing table samples. See basic_sample_tests.test
require vector_size 2048
load __TEST_DIR__/test_sample_conversion.db
statement ok
PRAGMA enable_verification
# 200,000 rows that all hold the value 1
statement ok
create table t1 as select 1 a from range(200000);
# insert one row with a value greater than 1 per iteration and restart the database each time
loop i 1 4805
statement ok
INSERT INTO t1 VALUES(${i} + 1);
restart
endloop
# the stored sample holds 2048 rows ...
query I
select count(*) from duckdb_table_sample('t1');
----
2048
# ... of which 48 come from the rows inserted in the loop (a > 1)
query I
select count(*) from duckdb_table_sample('t1') where a > 1;
----
48
# reference: a 1% reservoir sample over 204800 generated rows, where only the last 4800 rows map to a >= 1,
# yields the same number of late rows
query I
select count(*) from (select (floor(range/200000))::INT a from range(204800) using sample reservoir (1%)) t1 where a >= 1;
----
48

View File

@@ -0,0 +1,57 @@
# name: test/sql/sample/table_samples/table_sample_converts_to_block_sample.test
# description: Test that the table sample grows at 1% of the table until it holds 2048 rows and then stays at 2048
# group: [table_samples]
# the statements below are skipped while skip mode is active
mode skip
# required when testing table samples. See basic_sample_tests.test
require vector_size 2048
# table samples first collect only 1% of the table, until the sample has a cardinality of 2048.
# then the sample stays at a fixed 2048 values.
load __TEST_DIR__/test_sample_converts_after_load.db
statement ok
create table materialized_range as select 1 a from range(102400);
# only 1% of 102400
query I
select count(*) from duckdb_table_sample('materialized_range');
----
1024
restart
statement ok
insert into materialized_range select 2 a from range(102400);
# collect another 1% of 102400
query I
select count(*) from duckdb_table_sample('materialized_range');
----
2048
# each of the two inserted batches contributes exactly half of the sample
query II
select a, count(*) from duckdb_table_sample('materialized_range') group by all order by a;
----
1 1024
2 1024
# insert another
statement ok
insert into materialized_range select 3 a from range(102400);
# sample remains at 2048 values
query I
select count(*) from duckdb_table_sample('materialized_range');
----
2048
# 2048 / 3 = 682, so each value should occur more than 650 times
query II
select a, count(*) > 650 from duckdb_table_sample('materialized_range') group by all order by a;
----
1 1
2 1
3 1

View File

@@ -0,0 +1,148 @@
# name: test/sql/sample/table_samples/table_sample_is_stored.test_slow
# description: Test that table samples are stored, survive restarts and are updated by later inserts
# group: [table_samples]
# the statements below are skipped while skip mode is active
mode skip
# required when testing table samples. See basic_sample_tests.test
require vector_size 2048
require icu
load __TEST_DIR__/test_samples.db
statement ok
PRAGMA enable_verification
statement ok
create table materialized_range as select * from range(5000000);
statement ok
create table integers_1 as (select (range + 5) a, range b, get_current_time() as time from materialized_range);
# the time column is left out of the comparison; only a and b are hashed under the label result_1
query II nosort result_1
select a::INT, b from duckdb_table_sample('integers_1') order by all;
----
statement ok
create table integers_2 as (select (range + 5) a, range b, get_current_time() as time from materialized_range);
## samples should be the same given the same table and the same contents.
query II nosort result_1
select a::INT, b from duckdb_table_sample('integers_2') order by all;
----
statement ok
create or replace table integers_1 as (select (range + 5) a, range b from materialized_range);
statement ok
create or replace table integers_2 as (select (range + 5) a, range b from materialized_range);
# sample only has values in the table it was sampled from
query I
select count(*) from (select b from duckdb_table_sample('integers_1') intersect (select b from integers_1));
----
2048
query I
select count(*) from (select b from duckdb_table_sample('integers_2') intersect (select b from integers_2));
----
2048
# sample exists after restart
restart
query I
select count(*) from duckdb_table_sample('integers_1');
----
2048
query I
select count(*) from duckdb_table_sample('integers_2');
----
2048
# distribution of the sample over the five 1,000,000-wide intervals of b
query II
select floor(b / 1000000) as interval, count(*) as frequency from duckdb_table_sample('integers_1') group by interval order by all;
----
0.0 453
1.0 408
2.0 406
3.0 404
4.0 377
# adding another interval should subtract an equal number from the rest of the intervals
statement ok
insert into integers_1 (select (range + 5) a, range b from range(5000000,6000000));
query II
select floor(b / 1000000) as interval, count(*) as frequency from duckdb_table_sample('integers_1') group by interval order by all;
----
0.0 374
1.0 334
2.0 332
3.0 334
4.0 311
5.0 363
# If double the table count is appended, around half the sample should account for the new values.
statement ok
insert into integers_1 (select -1, -1 from range(6000000));
query I
select count(*) from integers_1;
----
12000000
## about half of the samples should have the pair (-1, -1).
# on latest storage test it's something like 997
query I
select count(*) from duckdb_table_sample('integers_1') where a = -1 and b = -1;
----
914
restart
# updated sample is also newly serialized
query I
select count(*) from duckdb_table_sample('integers_1') where a = -1 and b = -1;
----
914
# create a view on top of the sample
statement ok
create view sample_view as select * from duckdb_table_sample('integers_1');
# update the sample
statement ok
insert into integers_1 (select -2, -2 from range(6000000));
# 2048 / 3 = 682 (639 is good)
query I
select count(*) from sample_view where a = -2 and b = -2;
----
639
# the view must return the same count after a restart
restart
query I
select count(*) from sample_view where a = -2 and b = -2;
----
639
# currently have 18_000_000 values in the table.
# to try and get 1 value in the sample, we should add
# 18000000 / 2048 = 8789 values to see 1
# NOTE(review): only 7000 rows are inserted below, fewer than the 8789 computed above — confirm this is intended
statement ok
insert into integers_1 (select -3, -3 from range(7000));
# 1 value makes it
query I
select count(*) from sample_view where a = -3 and b = -3;
----
1

View File

@@ -0,0 +1,125 @@
# name: test/sql/sample/table_samples/test_sample_is_destroyed_on_updates.test
# description: Test that the table sample is destroyed by deletes, updates and schema changes
# group: [table_samples]
# the statements below are skipped while skip mode is active
mode skip
# required when testing table samples. See basic_sample_tests.test
require vector_size 2048
load __TEST_DIR__/test_sample_is_destroyed_on_update.db
statement ok
create or replace table integers_1 as select range a, range+1 b from range(102400);
# the sample holds 1024 rows for the 102400-row table
query I
select count(*) from duckdb_table_sample('integers_1') order by all;
----
1024
statement ok
delete from integers_1 where a = 3;
# sample no longer exists
query I
select count(*) from duckdb_table_sample('integers_1') order by all;
----
0
# test that an update destroys the sample
statement ok
create or replace table integers_1 as select range a, range+1 b from range(102400);
query I
select count(*) from duckdb_table_sample('integers_1');
----
1024
statement ok
update integers_1 set a = 5 where a = 1;
# the sample is expected to be empty after the update
query II
select * from duckdb_table_sample('integers_1');
----
# test adding columns destroys the sample.
statement ok
create or replace table integers_1 as select range a, range+1 b from range(204800);
query I
select count(*) from duckdb_table_sample('integers_1');
----
2048
statement ok
Alter table integers_1 add column c DOUBLE;
query III
select * from duckdb_table_sample('integers_1');
----
# test altering types destroys the sample
statement ok
create or replace table integers_1 as select range a, range+1 b from range(102400);
# 102400 rows give a sample of 1024 rows
query I
select count(*) from duckdb_table_sample('integers_1');
----
1024
statement ok
Alter table integers_1 alter b TYPE VARCHAR
query II
select * from duckdb_table_sample('integers_1');
----
# test dropping a column
statement ok
create or replace table integers_1 as select range a, range+1 b from range(102400);
query I
select count(*) from duckdb_table_sample('integers_1');
----
1024
statement ok
Alter table integers_1 drop b;
query I
select * from duckdb_table_sample('integers_1');
----
# test sample stays destroyed after a restart
statement ok
create or replace table integers_1 as select range a, range+1 b from range(500);
# 500 rows give a sample of 5 rows
query I
select count(*) from duckdb_table_sample('integers_1');
----
5
statement ok
Alter table integers_1 drop b;
# sample is destroyed
query I
select * from duckdb_table_sample('integers_1');
----
restart
statement ok
insert into integers_1 select range a from range(500);
# sample is still destroyed
query I
select * from duckdb_table_sample('integers_1');
----

View File

@@ -0,0 +1,80 @@
# name: test/sql/sample/table_samples/test_sample_types.test
# description: Test which column types are stored in table samples
# group: [table_samples]
# the statements below are skipped while skip mode is active
mode skip
# test valid sampling types (for now only numeric types; other types are expected to be NULL in the sample)
statement ok
pragma enable_verification;
# varchar values are expected to be NULL in all 2048 sample rows
statement ok
create table string_samples as select range::Varchar a from range(204800);
query I
select count(*) from duckdb_table_sample('string_samples') where a is NULL;
----
2048
# struct values are expected to be NULL in the sample
statement ok
create table struct_samples as select {'key1': 'quack-a-lack', 'key2': range} a from range(204800);
query I
select count(*) from duckdb_table_sample('struct_samples') where a is null;
----
2048
# blob values are expected to be NULL in the sample
statement ok
create table blob_samples as select '\xAA\xAB\xAC'::BLOB a from range(204800);
query I
select count(*) from duckdb_table_sample('blob_samples') where a is NULL;
----
2048
# BIGINT, DOUBLE, FLOAT and HUGEINT columns keep their values in the sample; the INTERVAL column does not
statement ok
create table integral_samples as select range::BIGINT a, range::DOUBLE b, range::FLOAT c, range::HUGEINT d, INTERVAL 1 YEAR e from range(204800);
query I
select count(*) from duckdb_table_sample('integral_samples') where a IS NOT NULL;
----
2048
query I
select count(*) from duckdb_table_sample('integral_samples') where b IS NOT NULL;
----
2048
query I
select count(*) from duckdb_table_sample('integral_samples') where c IS NOT NULL;
----
2048
query I
select count(*) from duckdb_table_sample('integral_samples') where d IS NOT NULL;
----
2048
query I
select count(*) from duckdb_table_sample('integral_samples') where e IS null;
----
2048
# mixed table: the list, varchar and timestamp columns are all expected to be NULL in the sample
statement ok
CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(1000000);
query I
select count(*) from duckdb_table_sample('t1') where b is null;
----
2048
query I
select count(*) from duckdb_table_sample('t1') where c is null;
----
2048
query I
select count(*) from duckdb_table_sample('t1') where d is null;
----
2048

View File

@@ -0,0 +1,21 @@
# name: test/sql/sample/table_samples/test_table_sample_errors.test
# description: test table sample errors
# group: [table_samples]
# the statements below are skipped while skip mode is active
mode skip
statement ok
create table t1 as select range a from range(204800);
statement ok
create view v1 as select * from t1;
# requesting the sample of a view must fail with an invalid catalog type error
statement error
select * from duckdb_table_sample('v1');
----
<REGEX>:.*Invalid Catalog type.*
# requesting the sample of a table that does not exist must fail with a catalog error
statement error
select * from duckdb_table_sample('a');
----
<REGEX>:.*Catalog Error:.*Table.*does not exist.*

View File

@@ -0,0 +1,235 @@
# name: test/sql/sample/test_sample.test_slow
# description: Test SAMPLE keyword
# group: [sample]
statement ok
PRAGMA enable_verification;
# three-row base table used by the small-sample checks
statement ok
CREATE TABLE test (a INTEGER, b INTEGER);
statement ok
INSERT INTO test VALUES (11, 22), (12, 21), (13, 22)
# test various limits using count
query I
SELECT COUNT(*) FROM test USING SAMPLE 0
----
0
query I
SELECT COUNT(*) FROM test USING SAMPLE 1
----
1
query I
SELECT COUNT(*) FROM test USING SAMPLE 1 ROWS
----
1
query I
SELECT COUNT(*) FROM test USING SAMPLE 3
----
3
# sample size exceeds input
query I
SELECT COUNT(*) FROM test USING SAMPLE 10
----
3
# specify sample
query I
SELECT COUNT(*) FROM test USING SAMPLE 3 (reservoir)
----
3
# specify seed
query I
SELECT COUNT(*) FROM test USING SAMPLE 3 (reservoir, 3)
----
3
# a sample larger than the table returns every row
query II
SELECT * FROM test USING SAMPLE 10 ORDER BY a, b
----
11 22
12 21
13 22
# sample on a larger data set
query I
SELECT COUNT(*) FROM range(10000) USING SAMPLE 5
----
5
# sample on a large data set over RESERVOIR_THRESHOLD = 100000
query I
SELECT COUNT(*) FROM range(2000000) USING SAMPLE 1000100
----
1000100
query I
SELECT COUNT(*) FROM range(2000000) USING SAMPLE 2
----
2
# test sample with multiple columns
# we insert the same data in the entire column
statement ok
CREATE TABLE test2 AS SELECT i a, i::VARCHAR b, CONCAT(i, ' - ', i) c FROM repeat(1, 1000) tbl(i)
query III
SELECT a, b, c FROM test2 USING SAMPLE 3;
----
1 1 1 - 1
1 1 1 - 1
1 1 1 - 1
# sample in scalar subqueries
query I
SELECT (SELECT COUNT(*) FROM test USING SAMPLE 1);
----
1
query I
SELECT (SELECT COUNT(*) + tbl.i FROM test USING SAMPLE 1) FROM range(3) tbl(i) ORDER BY i;
----
1
2
3
# negative sample size not allowed
statement error
SELECT COUNT(*) FROM test USING SAMPLE -1
----
# must be a number
statement error
SELECT COUNT(*) FROM test USING SAMPLE 'hello'
----
statement error
SELECT COUNT(*) FROM test USING SAMPLE DATE '1992-01-01'
----
# we can also use postgres/sqlserver-style tablesample syntax
statement ok
create table integers as select i from range(200) tbl(i);
# default is sample_size, which follows postgres syntax rules
query I
select count(*) from integers tablesample reservoir(10);
----
10
query I
select count(*) from integers tablesample reservoir(10%);
----
20
query I
select count(*) from integers tablesample reservoir(10 percent);
----
20
query I
select count(*) from integers tablesample reservoir(10 rows);
----
10
# we can also use the default sampling method
query I
select count(*) from integers tablesample(10 rows);
----
10
# we can use our sampling syntax here as well
query I
select count(*) from integers tablesample 10;
----
10
query I
select count(*) from integers tablesample 10 rows (reservoir);
----
10
query I
select count(*) from integers tablesample 10 rows (reservoir, 250);
----
10
# we can also use this with table-producing functions
query I
select count(*) from range(200) tablesample reservoir(10%);
----
20
# and subqueries
query I
select count(*) from (select * from range(200)) tbl(i) tablesample reservoir(10%);
----
20
# specifying a seed leads to repeatable behavior
loop i 0 10
query I nosort reservoirseed
select * from range(100) tablesample reservoir(10 rows) repeatable(250)
----
query I nosort bernoulliseed
select * from range(100) tablesample bernoulli(10%) repeatable(250)
----
query I nosort systemseed
select * from range(100) tablesample system(10%) repeatable(250)
----
endloop
# percentages that round to less than one row / exactly one row of the 1000-row input
query I
select count(*) from range(1000) using sample reservoir(0.01%);
----
0
query I
select count(*) from range(1000) using sample reservoir(0.1%);
----
1
# cannot use bernoulli or system sampling with X number of rows
statement error
select * from integers using sample bernoulli(5 rows);
----
statement error
select * from integers using sample system(5 rows);
----
# sample_size is out of range
statement error
select * from integers using sample 10000%;
----
# a repeatable seed must give the same sample on every run: each pair of queries shares a label,
# so their result hashes are compared. The sort mode is spelled `nosort`, the documented keyword.
query I nosort repeatable_seed_0
select i from integers using sample (1 rows) repeatable (0);
----
96
query I nosort repeatable_seed_0
select i from integers using sample (1 rows) repeatable (0);
----
query I nosort repeatable_seed_1
select i from integers using sample reservoir(1%) repeatable (0) order by i;
----
query I nosort repeatable_seed_1
select i from integers using sample reservoir(1%) repeatable (0) order by i;
----
58
127

View File

@@ -0,0 +1,35 @@
# name: test/sql/sample/test_sampling_stats.test_slow
# description: Test that summary statistics of reservoir samples stay within expected bounds
# group: [sample]
statement ok
PRAGMA enable_verification;
# seed the random number generator so the sampled statistics are deterministic
set seed 0.42
# 1,500-row sample of 1..1,200,000: wide tolerance bands around the mean, quartiles and standard deviation
query I
select (avg::DOUBLE between 550000 and 650000) and (min::INT < 10000) and (max::INT > 1190000)
and (q25::INT between 230000 and 370000) and (q50::INT between 530000 and 670000)
and (q75::INT between 830000 and 970000) and (count::INT = 1500)
and (std::DOUBLE between 300000 and 400000)
from (summarize select * from generate_series(1,1200000) using sample 1500 (reservoir));
----
true
# 100,000-row sample: tighter tolerance bands
query I
select (avg::DOUBLE between 580000 and 620000) and (min::INT < 1000) and (max::INT > 1199000)
and (q25::INT between 280000 and 320000) and (q50::INT between 580000 and 620000)
and (q75::INT between 880000 and 920000) and (count::INT = 100000)
and (std::DOUBLE between 320000 and 370000)
from (summarize select * from generate_series(1,1200000) using sample 100000 (reservoir));
----
true
# 400,000-row sample: same tolerance bands as the 100,000-row sample
query I
select (avg::DOUBLE between 580000 and 620000) and (min::INT < 1000) and (max::INT > 1199000)
and (q25::INT between 280000 and 320000) and (q50::INT between 580000 and 620000)
and (q75::INT between 880000 and 920000) and (count::INT = 400000)
and (std::DOUBLE between 320000 and 370000)
from (summarize select * from generate_series(1,1200000) using sample 400000 (reservoir));
----
true