should be it
This commit is contained in:
88
external/duckdb/test/sql/sample/table_samples/basic_sample_tests.test
vendored
Normal file
88
external/duckdb/test/sql/sample/table_samples/basic_sample_tests.test
vendored
Normal file
@@ -0,0 +1,88 @@
|
||||
# name: test/sql/sample/table_samples/basic_sample_tests.test
|
||||
# group: [table_samples]
|
||||
|
||||
mode skip
|
||||
|
||||
# currently requires a fixed vector size since the "randomness" of the sample depends on
|
||||
# the vector size. If the vector size decreases, the randomness of the sample decreases
|
||||
# This is especially noticeable for small tables and their samples
|
||||
require vector_size 2048
|
||||
|
||||
statement ok
|
||||
PRAGMA enable_verification
|
||||
|
||||
load __TEST_DIR__/test_samples_basic.db
|
||||
|
||||
query I
|
||||
select count(*) from range(100000) using sample (10000);
|
||||
----
|
||||
10000
|
||||
|
||||
query I
|
||||
select count(*) from range(100) using sample (10);
|
||||
----
|
||||
10
|
||||
|
||||
query I
|
||||
select count(*) from range(205000) using sample (10000);
|
||||
----
|
||||
10000
|
||||
|
||||
statement ok
|
||||
create table t1 as select range a from range(204800);
|
||||
|
||||
statement ok
|
||||
select * from duckdb_table_sample('t1');
|
||||
|
||||
statement ok
|
||||
create or replace table t1 as select range a from range(1000);
|
||||
|
||||
query II
|
||||
select avg(a) > 200, avg(a) < 800 from duckdb_table_sample('t1');
|
||||
----
|
||||
true true
|
||||
|
||||
statement ok
|
||||
create or replace table t1 as select range a from range(204800);
|
||||
|
||||
# average is not skewed
|
||||
query II
|
||||
select avg(a) > (0.2*204800), avg(a) < (0.8*204800) from duckdb_table_sample('t1');
|
||||
----
|
||||
true true
|
||||
|
||||
# about half the samples are below 102400 and half above
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('t1') where a < 102400;
|
||||
----
|
||||
1069
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('t1') where a > 102400;
|
||||
----
|
||||
979
|
||||
|
||||
query I
|
||||
select count(*) from t1 using sample (200000);
|
||||
----
|
||||
200000
|
||||
|
||||
statement ok
|
||||
create or replace table materialized_range as select * from range(100);
|
||||
|
||||
statement ok
|
||||
create or replace table integers_1 as (select range b from materialized_range);
|
||||
|
||||
query I
|
||||
select count(b) from duckdb_table_sample('integers_1') where b in (select * from materialized_range);
|
||||
----
|
||||
1
|
||||
|
||||
# sample exists after restart
|
||||
restart
|
||||
|
||||
query I
|
||||
select count(b) from duckdb_table_sample('integers_1') where b in (select * from materialized_range);
|
||||
----
|
||||
1
|
||||
|
||||
40
external/duckdb/test/sql/sample/table_samples/sample_stores_rows_from_later_on.test_slow
vendored
Normal file
40
external/duckdb/test/sql/sample/table_samples/sample_stores_rows_from_later_on.test_slow
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
# name: test/sql/sample/table_samples/sample_stores_rows_from_later_on.test_slow
|
||||
# description: Test sampling of larger relations
|
||||
# group: [table_samples]
|
||||
|
||||
mode skip
|
||||
|
||||
# required when testing table samples. See basic_sample_tests.test
|
||||
require vector_size 2048
|
||||
|
||||
load __TEST_DIR__/test_sample_conversion.db
|
||||
|
||||
statement ok
|
||||
PRAGMA enable_verification
|
||||
|
||||
statement ok
|
||||
create table t1 as select 1 a from range(200000);
|
||||
|
||||
loop i 1 4805
|
||||
|
||||
statement ok
|
||||
INSERT INTO t1 VALUES(${i} + 1);
|
||||
|
||||
restart
|
||||
|
||||
endloop
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('t1');
|
||||
----
|
||||
2048
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('t1') where a > 1;
|
||||
----
|
||||
48
|
||||
|
||||
query I
|
||||
select count(*) from (select (floor(range/200000))::INT a from range(204800) using sample reservoir (1%)) t1 where a >= 1;
|
||||
----
|
||||
48
|
||||
57
external/duckdb/test/sql/sample/table_samples/table_sample_converts_to_block_sample.test
vendored
Normal file
57
external/duckdb/test/sql/sample/table_samples/table_sample_converts_to_block_sample.test
vendored
Normal file
@@ -0,0 +1,57 @@
|
||||
# name: test/sql/sample/table_samples/table_sample_converts_to_block_sample.test
|
||||
# description: Test that a table sample grows at 1% of the table until it is capped at 2048 values
|
||||
# group: [table_samples]
|
||||
|
||||
mode skip
|
||||
|
||||
# required when testing table samples. See basic_sample_tests.test
|
||||
require vector_size 2048
|
||||
|
||||
# table samples first collect only 1% of the table, until the sample has a cardinality of 2048.
|
||||
# then the sample stays at a fixed 2048 values.
|
||||
|
||||
load __TEST_DIR__/test_sample_converts_after_load.db
|
||||
|
||||
statement ok
|
||||
create table materialized_range as select 1 a from range(102400);
|
||||
|
||||
# only 1% of 102400
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('materialized_range');
|
||||
----
|
||||
1024
|
||||
|
||||
restart
|
||||
|
||||
statement ok
|
||||
insert into materialized_range select 2 a from range(102400);
|
||||
|
||||
# collect another 1% of 102400
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('materialized_range');
|
||||
----
|
||||
2048
|
||||
|
||||
query II
|
||||
select a, count(*) from duckdb_table_sample('materialized_range') group by all order by a;
|
||||
----
|
||||
1 1024
|
||||
2 1024
|
||||
|
||||
# insert another
|
||||
statement ok
|
||||
insert into materialized_range select 3 a from range(102400);
|
||||
|
||||
# sample remains at 2048 values
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('materialized_range');
|
||||
----
|
||||
2048
|
||||
|
||||
# 2048 / 3 = 682, so each value should appear more than 650 times
|
||||
query II
|
||||
select a, count(*) > 650 from duckdb_table_sample('materialized_range') group by all order by a;
|
||||
----
|
||||
1 1
|
||||
2 1
|
||||
3 1
|
||||
148
external/duckdb/test/sql/sample/table_samples/table_sample_is_stored.test_slow
vendored
Normal file
148
external/duckdb/test/sql/sample/table_samples/table_sample_is_stored.test_slow
vendored
Normal file
@@ -0,0 +1,148 @@
|
||||
# name: test/sql/sample/table_samples/table_sample_is_stored.test_slow
|
||||
# description: Test sampling of larger relations
|
||||
# group: [table_samples]
|
||||
|
||||
mode skip
|
||||
|
||||
# required when testing table samples. See basic_sample_tests.test
|
||||
require vector_size 2048
|
||||
|
||||
require icu
|
||||
|
||||
load __TEST_DIR__/test_samples.db
|
||||
|
||||
statement ok
|
||||
PRAGMA enable_verification
|
||||
|
||||
statement ok
|
||||
create table materialized_range as select * from range(5000000);
|
||||
|
||||
statement ok
|
||||
create table integers_1 as (select (range + 5) a, range b, get_current_time() as time from materialized_range);
|
||||
|
||||
query II nosort result_1
|
||||
select a::INT, b from duckdb_table_sample('integers_1') order by all;
|
||||
----
|
||||
|
||||
statement ok
|
||||
create table integers_2 as (select (range + 5) a, range b, get_current_time() as time from materialized_range);
|
||||
|
||||
## samples should be the same given the same table and the same contents.
|
||||
query II nosort result_1
|
||||
select a::INT, b from duckdb_table_sample('integers_2') order by all;
|
||||
----
|
||||
|
||||
statement ok
|
||||
create or replace table integers_1 as (select (range + 5) a, range b from materialized_range);
|
||||
|
||||
statement ok
|
||||
create or replace table integers_2 as (select (range + 5) a, range b from materialized_range);
|
||||
|
||||
# sample only has values in the table it was sampled from
|
||||
query I
|
||||
select count(*) from (select b from duckdb_table_sample('integers_1') intersect (select b from integers_1));
|
||||
----
|
||||
2048
|
||||
|
||||
query I
|
||||
select count(*) from (select b from duckdb_table_sample('integers_2') intersect (select b from integers_2));
|
||||
----
|
||||
2048
|
||||
|
||||
# sample exists after restart
|
||||
restart
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integers_1');
|
||||
----
|
||||
2048
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integers_2');
|
||||
----
|
||||
2048
|
||||
|
||||
|
||||
query II
|
||||
select floor(b / 1000000) as interval, count(*) as frequency from duckdb_table_sample('integers_1') group by interval order by all;
|
||||
----
|
||||
0.0 453
|
||||
1.0 408
|
||||
2.0 406
|
||||
3.0 404
|
||||
4.0 377
|
||||
|
||||
|
||||
# adding another interval should subtract an equal number from the rest of the intervals
|
||||
statement ok
|
||||
insert into integers_1 (select (range + 5) a, range b from range(5000000,6000000));
|
||||
|
||||
query II
|
||||
select floor(b / 1000000) as interval, count(*) as frequency from duckdb_table_sample('integers_1') group by interval order by all;
|
||||
----
|
||||
0.0 374
|
||||
1.0 334
|
||||
2.0 332
|
||||
3.0 334
|
||||
4.0 311
|
||||
5.0 363
|
||||
|
||||
# If the append doubles the table count, around half the sample should account for the new values.
|
||||
statement ok
|
||||
insert into integers_1 (select -1, -1 from range(6000000));
|
||||
|
||||
query I
|
||||
select count(*) from integers_1;
|
||||
----
|
||||
12000000
|
||||
|
||||
|
||||
## about half of the samples should have the pair '-1', '-1'.
|
||||
# on the latest storage test it's something like 997
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integers_1') where a = -1 and b = -1;
|
||||
----
|
||||
914
|
||||
|
||||
restart
|
||||
|
||||
# updated sample is also newly serialized
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integers_1') where a = -1 and b = -1;
|
||||
----
|
||||
914
|
||||
|
||||
# create a view on top of the sample
|
||||
statement ok
|
||||
create view sample_view as select * from duckdb_table_sample('integers_1');
|
||||
|
||||
# update the sample
|
||||
statement ok
|
||||
insert into integers_1 (select -2, -2 from range(6000000));
|
||||
|
||||
|
||||
# 2048 / 3 = 682 (639 is good)
|
||||
query I
|
||||
select count(*) from sample_view where a = -2 and b = -2;
|
||||
----
|
||||
639
|
||||
|
||||
restart
|
||||
|
||||
query I
|
||||
select count(*) from sample_view where a = -2 and b = -2;
|
||||
----
|
||||
639
|
||||
|
||||
# currently have 18_000_000 values in the table.
|
||||
# to try and get 1 value in the sample, we should add
|
||||
# 18000000 / 2048 = 8789 values to see 1
|
||||
|
||||
statement ok
|
||||
insert into integers_1 (select -3, -3 from range(7000));
|
||||
|
||||
# 1 value makes it
|
||||
query I
|
||||
select count(*) from sample_view where a = -3 and b = -3;
|
||||
----
|
||||
1
|
||||
125
external/duckdb/test/sql/sample/table_samples/test_sample_is_destroyed_on_updates.test
vendored
Normal file
125
external/duckdb/test/sql/sample/table_samples/test_sample_is_destroyed_on_updates.test
vendored
Normal file
@@ -0,0 +1,125 @@
|
||||
# name: test/sql/sample/table_samples/test_sample_is_destroyed_on_updates.test
|
||||
# description: Test that table samples are destroyed by deletes, updates and schema changes
|
||||
# group: [table_samples]
|
||||
|
||||
mode skip
|
||||
|
||||
# required when testing table samples. See basic_sample_tests.test
|
||||
require vector_size 2048
|
||||
|
||||
load __TEST_DIR__/test_sample_is_destroyed_on_update.db
|
||||
|
||||
statement ok
|
||||
create or replace table integers_1 as select range a, range+1 b from range(102400);
|
||||
|
||||
# the sample holds 1% of the 102400 rows
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integers_1') order by all;
|
||||
----
|
||||
1024
|
||||
|
||||
statement ok
|
||||
delete from integers_1 where a = 3;
|
||||
|
||||
# sample no longer exists
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integers_1') order by all;
|
||||
----
|
||||
0
|
||||
|
||||
statement ok
|
||||
create or replace table integers_1 as select range a, range+1 b from range(102400);
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integers_1');
|
||||
----
|
||||
1024
|
||||
|
||||
statement ok
|
||||
update integers_1 set a = 5 where a = 1;
|
||||
|
||||
query II
|
||||
select * from duckdb_table_sample('integers_1');
|
||||
----
|
||||
|
||||
# test adding columns destroys the sample.
|
||||
statement ok
|
||||
create or replace table integers_1 as select range a, range+1 b from range(204800);
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integers_1');
|
||||
----
|
||||
2048
|
||||
|
||||
statement ok
|
||||
Alter table integers_1 add column c DOUBLE;
|
||||
|
||||
query III
|
||||
select * from duckdb_table_sample('integers_1');
|
||||
----
|
||||
|
||||
|
||||
# test altering types destroys the sample
|
||||
statement ok
|
||||
create or replace table integers_1 as select range a, range+1 b from range(102400);
|
||||
|
||||
|
||||
# don't have enough samples yet.
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integers_1');
|
||||
----
|
||||
1024
|
||||
|
||||
statement ok
|
||||
Alter table integers_1 alter b TYPE VARCHAR
|
||||
|
||||
query II
|
||||
select * from duckdb_table_sample('integers_1');
|
||||
----
|
||||
|
||||
# test dropping a column
|
||||
statement ok
|
||||
create or replace table integers_1 as select range a, range+1 b from range(102400);
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integers_1');
|
||||
----
|
||||
1024
|
||||
|
||||
statement ok
|
||||
Alter table integers_1 drop b;
|
||||
|
||||
query I
|
||||
select * from duckdb_table_sample('integers_1');
|
||||
----
|
||||
|
||||
# test sample is destroyed after a restart
|
||||
statement ok
|
||||
create or replace table integers_1 as select range a, range+1 b from range(500);
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integers_1');
|
||||
----
|
||||
5
|
||||
|
||||
statement ok
|
||||
Alter table integers_1 drop b;
|
||||
|
||||
# sample is destroyed
|
||||
query I
|
||||
select * from duckdb_table_sample('integers_1');
|
||||
----
|
||||
|
||||
restart
|
||||
|
||||
statement ok
|
||||
insert into integers_1 select range a from range(500);
|
||||
|
||||
# sample is still destroyed
|
||||
query I
|
||||
select * from duckdb_table_sample('integers_1');
|
||||
----
|
||||
|
||||
|
||||
|
||||
|
||||
80
external/duckdb/test/sql/sample/table_samples/test_sample_types.test
vendored
Normal file
80
external/duckdb/test/sql/sample/table_samples/test_sample_types.test
vendored
Normal file
@@ -0,0 +1,80 @@
|
||||
# name: test/sql/sample/table_samples/test_sample_types.test
|
||||
# description: Test table samples over columns of different types
|
||||
# group: [table_samples]
|
||||
|
||||
mode skip
|
||||
|
||||
# test valid sampling types (for now only integral types)
|
||||
|
||||
statement ok
|
||||
pragma enable_verification;
|
||||
|
||||
statement ok
|
||||
create table string_samples as select range::Varchar a from range(204800);
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('string_samples') where a is NULL;
|
||||
----
|
||||
2048
|
||||
|
||||
statement ok
|
||||
create table struct_samples as select {'key1': 'quack-a-lack', 'key2': range} a from range(204800);
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('struct_samples') where a is null;
|
||||
----
|
||||
2048
|
||||
|
||||
statement ok
|
||||
create table blob_samples as select '\xAA\xAB\xAC'::BLOB a from range(204800);
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('blob_samples') where a is NULL;
|
||||
----
|
||||
2048
|
||||
|
||||
statement ok
|
||||
create table integral_samples as select range::BIGINT a, range::DOUBLE b, range::FLOAT c, range::HUGEINT d, INTERVAL 1 YEAR e from range(204800);
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integral_samples') where a NOT null;
|
||||
----
|
||||
2048
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integral_samples') where b NOT null;
|
||||
----
|
||||
2048
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integral_samples') where c NOT null;
|
||||
----
|
||||
2048
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integral_samples') where d NOT null;
|
||||
----
|
||||
2048
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('integral_samples') where e IS null;
|
||||
----
|
||||
2048
|
||||
|
||||
statement ok
|
||||
CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(1000000);
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('t1') where b is null;
|
||||
----
|
||||
2048
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('t1') where c is null;
|
||||
----
|
||||
2048
|
||||
|
||||
query I
|
||||
select count(*) from duckdb_table_sample('t1') where d is null;
|
||||
----
|
||||
2048
|
||||
21
external/duckdb/test/sql/sample/table_samples/test_table_sample_errors.test
vendored
Normal file
21
external/duckdb/test/sql/sample/table_samples/test_table_sample_errors.test
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
# name: test/sql/sample/table_samples/test_table_sample_errors.test
|
||||
# description: test table sample errors
|
||||
# group: [table_samples]
|
||||
|
||||
mode skip
|
||||
|
||||
statement ok
|
||||
create table t1 as select range a from range(204800);
|
||||
|
||||
statement ok
|
||||
create view v1 as select * from t1;
|
||||
|
||||
statement error
|
||||
select * from duckdb_table_sample('v1');
|
||||
----
|
||||
<REGEX>:.*Invalid Catalog type.*
|
||||
|
||||
statement error
|
||||
select * from duckdb_table_sample('a');
|
||||
----
|
||||
<REGEX>:.*Catalog Error:.*Table.*does not exist.*
|
||||
Reference in New Issue
Block a user