should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

View File

@@ -0,0 +1,88 @@
# name: test/sql/sample/table_samples/basic_sample_tests.test
# description: Basic checks for USING SAMPLE row counts and for the table sample returned by duckdb_table_sample
# group: [table_samples]
mode skip
# NOTE(review): no `mode unskip` follows in this file, so presumably every record below is skipped - confirm this is intentional
# currently require fixed vector size since the "randomness" of the sample depends on
# the vector size. If the vector size decreases, the randomness of the sample decreases
# This is especially noticeable for small tables and their samples
require vector_size 2048
statement ok
PRAGMA enable_verification
load __TEST_DIR__/test_samples_basic.db
# USING SAMPLE (n) with n smaller than the input is expected to return exactly n rows
query I
select count(*) from range(100000) using sample (10000);
----
10000
query I
select count(*) from range(100) using sample (10);
----
10
query I
select count(*) from range(205000) using sample (10000);
----
10000
# the table sample of a freshly created 204800-row table can be queried without error
statement ok
create table t1 as select range a from range(204800);
statement ok
select * from duckdb_table_sample('t1');
# for a 1000-row table holding 0..999 the sample average is expected to land between 200 and 800
statement ok
create or replace table t1 as select range a from range(1000);
query II
select avg(a) > 200, avg(a) < 800 from duckdb_table_sample('t1');
----
true true
statement ok
create or replace table t1 as select range a from range(204800);
# average is not skewed
query II
select avg(a) > (0.2*204800), avg(a) < (0.8*204800) from duckdb_table_sample('t1');
----
true true
# about half the samples are below 102400 and half above
# (the two expected counts below add up to 2048; the value 102400 itself is excluded by both filters)
query I
select count(*) from duckdb_table_sample('t1') where a < 102400;
----
1069
query I
select count(*) from duckdb_table_sample('t1') where a > 102400;
----
979
# USING SAMPLE (200000) on the 204800-row table is expected to return exactly 200000 rows
query I
select count(*) from t1 using sample (200000);
----
200000
# a 100-row table: the sample is expected to hold a single row, and that row must exist in the source data
statement ok
create or replace table materialized_range as select * from range(100);
statement ok
create or replace table integers_1 as (select range b from materialized_range);
query I
select count(b) from duckdb_table_sample('integers_1') where b in (select * from materialized_range);
----
1
# sample exists after restart
restart
query I
select count(b) from duckdb_table_sample('integers_1') where b in (select * from materialized_range);
----
1

View File

@@ -0,0 +1,40 @@
# name: test/sql/sample/table_samples/sample_stores_rows_from_later_on.test_slow
# description: Test that rows appended one at a time, with a restart after each insert, still make it into the table sample
# group: [table_samples]
mode skip
# NOTE(review): no `mode unskip` follows in this file, so presumably every record below is skipped - confirm this is intentional
# required when testing table samples. See basic_sample_tests.test
require vector_size 2048
load __TEST_DIR__/test_sample_conversion.db
statement ok
PRAGMA enable_verification
# start with 200000 rows that all hold the value 1
statement ok
create table t1 as select 1 a from range(200000);
# append single rows with values greater than 1, restarting the database after every insert
# (presumably the loop end is exclusive, i.e. 4804 inserts with values 2..4805 - TODO confirm)
loop i 1 4805
statement ok
INSERT INTO t1 VALUES(${i} + 1);
restart
endloop
# the sample is expected to hold 2048 rows in total
query I
select count(*) from duckdb_table_sample('t1');
----
2048
# 48 of the sampled rows are expected to come from the later single-row inserts (a > 1)
query I
select count(*) from duckdb_table_sample('t1') where a > 1;
----
48
# reference query: 204800 rows where the first 200000 map to 0 and the last 4800 map to 1,
# sampled with a 1% reservoir sample, is expected to give the same count of 48
query I
select count(*) from (select (floor(range/200000))::INT a from range(204800) using sample reservoir (1%)) t1 where a >= 1;
----
48

View File

@@ -0,0 +1,57 @@
# name: test/sql/sample/table_samples/table_sample_converts_to_block_sample.test
# description: Test that the table sample grows by 1% of the appended rows until it holds 2048 rows, then stays at 2048
# group: [table_samples]
mode skip
# NOTE(review): no `mode unskip` follows in this file, so presumably every record below is skipped - confirm this is intentional
# required when testing table samples. See basic_sample_tests.test
require vector_size 2048
# table samples first collect only 1% of the table, until the sample holds 2048 values
# (with 1% sampling that is a table cardinality of 204800, as exercised below).
# then the sample stays at a fixed 2048 values.
load __TEST_DIR__/test_sample_converts_after_load.db
statement ok
create table materialized_range as select 1 a from range(102400);
# only 1% of 102400
query I
select count(*) from duckdb_table_sample('materialized_range');
----
1024
restart
statement ok
insert into materialized_range select 2 a from range(102400);
# collect another 1% of 102400
query I
select count(*) from duckdb_table_sample('materialized_range');
----
2048
# each of the two batches is expected to contribute exactly 1024 sampled rows
query II
select a, count(*) from duckdb_table_sample('materialized_range') group by all order by a;
----
1 1024
2 1024
# insert another 102400 rows, this time with the value 3
statement ok
insert into materialized_range select 3 a from range(102400);
# sample remains at 2048 values
query I
select count(*) from duckdb_table_sample('materialized_range');
----
2048
# 2048 / 3 ~ 682, so each of the three values should appear more than 650 times in the sample
query II
select a, count(*) > 650 from duckdb_table_sample('materialized_range') group by all order by a;
----
1 1
2 1
3 1

View File

@@ -0,0 +1,148 @@
# name: test/sql/sample/table_samples/table_sample_is_stored.test_slow
# description: Test that table samples are deterministic, survive a restart, and are updated by later appends
# group: [table_samples]
mode skip
# NOTE(review): no `mode unskip` follows in this file, so presumably every record below is skipped - confirm this is intentional
# required when testing table samples. See basic_sample_tests.test
require vector_size 2048
# presumably needed for get_current_time() used below - TODO confirm
require icu
load __TEST_DIR__/test_samples.db
statement ok
PRAGMA enable_verification
statement ok
create table materialized_range as select * from range(5000000);
statement ok
create table integers_1 as (select (range + 5) a, range b, get_current_time() as time from materialized_range);
# both queries labelled result_1 are presumably compared against each other by the test runner - TODO confirm
query II nosort result_1
select a::INT, b from duckdb_table_sample('integers_1') order by all;
----
statement ok
create table integers_2 as (select (range + 5) a, range b, get_current_time() as time from materialized_range);
## samples should be the same given the same table and the same contents.
query II nosort result_1
select a::INT, b from duckdb_table_sample('integers_2') order by all;
----
statement ok
create or replace table integers_1 as (select (range + 5) a, range b from materialized_range);
statement ok
create or replace table integers_2 as (select (range + 5) a, range b from materialized_range);
# sample only has values in the table it was sampled from
# (intersecting the sample with its table is expected to keep all 2048 sampled b values)
query I
select count(*) from (select b from duckdb_table_sample('integers_1') intersect (select b from integers_1));
----
2048
query I
select count(*) from (select b from duckdb_table_sample('integers_2') intersect (select b from integers_2));
----
2048
# sample exists after restart
restart
query I
select count(*) from duckdb_table_sample('integers_1');
----
2048
query I
select count(*) from duckdb_table_sample('integers_2');
----
2048
# distribution of the sample over the five 1,000,000-wide intervals of b (the frequencies add up to 2048)
query II
select floor(b / 1000000) as interval, count(*) as frequency from duckdb_table_sample('integers_1') group by interval order by all;
----
0.0 453
1.0 408
2.0 406
3.0 404
4.0 377
# adding another interval should subtract a roughly equal number from the rest of the intervals
statement ok
insert into integers_1 (select (range + 5) a, range b from range(5000000,6000000));
query II
select floor(b / 1000000) as interval, count(*) as frequency from duckdb_table_sample('integers_1') group by interval order by all;
----
0.0 374
1.0 334
2.0 332
3.0 334
4.0 311
5.0 363
# If as many rows as the table already holds are appended (doubling it from 6000000 to 12000000),
# around half the sample should account for the new values.
statement ok
insert into integers_1 (select -1, -1 from range(6000000));
query I
select count(*) from integers_1;
----
12000000
## about half of the samples should have the pair (-1, -1).
# on the latest storage test it is something like 997
query I
select count(*) from duckdb_table_sample('integers_1') where a = -1 and b = -1;
----
914
restart
# updated sample is also newly serialized
query I
select count(*) from duckdb_table_sample('integers_1') where a = -1 and b = -1;
----
914
# create a view on top of the sample
statement ok
create view sample_view as select * from duckdb_table_sample('integers_1');
# update the sample
statement ok
insert into integers_1 (select -2, -2 from range(6000000));
# the new rows are a third of the 18000000-row table; 2048 / 3 ~ 682, so 639 is close enough
query I
select count(*) from sample_view where a = -2 and b = -2;
----
639
# the view is expected to return the same count after a restart
restart
query I
select count(*) from sample_view where a = -2 and b = -2;
----
639
# currently have 18_000_000 values in the table.
# to try and get 1 value in the sample, we should add
# 18000000 / 2048 = 8789 values to see 1
# NOTE(review): only 7000 rows are inserted below, fewer than the 8789 computed above - confirm which number is intended
statement ok
insert into integers_1 (select -3, -3 from range(7000));
# 1 value makes it
query I
select count(*) from sample_view where a = -3 and b = -3;
----
1

View File

@@ -0,0 +1,125 @@
# name: test/sql/sample/table_samples/test_sample_is_destroyed_on_updates.test
# description: Test that deletes, updates and ALTER TABLE statements leave the table sample empty
# group: [table_samples]
mode skip
# NOTE(review): no `mode unskip` follows in this file, so presumably every record below is skipped - confirm this is intentional
# required when testing table samples. See basic_sample_tests.test
require vector_size 2048
load __TEST_DIR__/test_sample_is_destroyed_on_update.db
statement ok
create or replace table integers_1 as select range a, range+1 b from range(102400);
# the sample is expected to hold 1024 rows (1% of the 102400 rows)
query I
select count(*) from duckdb_table_sample('integers_1') order by all;
----
1024
# test deleting a row destroys the sample
statement ok
delete from integers_1 where a = 3;
# sample no longer exists
query I
select count(*) from duckdb_table_sample('integers_1') order by all;
----
0
# test updating a row destroys the sample
statement ok
create or replace table integers_1 as select range a, range+1 b from range(102400);
query I
select count(*) from duckdb_table_sample('integers_1');
----
1024
statement ok
update integers_1 set a = 5 where a = 1;
# the sample query is expected to return no rows after the update
query II
select * from duckdb_table_sample('integers_1');
----
# test adding columns destroys the sample.
statement ok
create or replace table integers_1 as select range a, range+1 b from range(204800);
query I
select count(*) from duckdb_table_sample('integers_1');
----
2048
statement ok
Alter table integers_1 add column c DOUBLE;
# three columns now (a, b, c); the sample is expected to be empty
query III
select * from duckdb_table_sample('integers_1');
----
# test altering types destroys the sample
statement ok
create or replace table integers_1 as select range a, range+1 b from range(102400);
# don't have enough samples yet (1024 = 1% of 102400).
query I
select count(*) from duckdb_table_sample('integers_1');
----
1024
statement ok
Alter table integers_1 alter b TYPE VARCHAR
query II
select * from duckdb_table_sample('integers_1');
----
# test dropping a column destroys the sample
statement ok
create or replace table integers_1 as select range a, range+1 b from range(102400);
query I
select count(*) from duckdb_table_sample('integers_1');
----
1024
statement ok
Alter table integers_1 drop b;
# only column a is left; the sample is expected to be empty
query I
select * from duckdb_table_sample('integers_1');
----
# test sample stays destroyed after a restart
statement ok
create or replace table integers_1 as select range a, range+1 b from range(500);
# 500 rows give an expected sample of 5 rows
query I
select count(*) from duckdb_table_sample('integers_1');
----
5
statement ok
Alter table integers_1 drop b;
# sample is destroyed
query I
select * from duckdb_table_sample('integers_1');
----
restart
statement ok
insert into integers_1 select range a from range(500);
# sample is still destroyed
query I
select * from duckdb_table_sample('integers_1');
----

View File

@@ -0,0 +1,80 @@
# name: test/sql/sample/table_samples/test_sample_types.test
# description: Test which column types keep their values in the table sample and which come back as NULL
# group: [table_samples]
mode skip
# NOTE(review): no `mode unskip` follows in this file, so presumably every record below is skipped - confirm this is intentional
# test valid sampling types (for now only numeric types keep their values;
# the other types below are expected to come back as NULL in all 2048 sampled rows)
statement ok
pragma enable_verification;
# VARCHAR column: expected to be NULL in every sampled row
statement ok
create table string_samples as select range::Varchar a from range(204800);
query I
select count(*) from duckdb_table_sample('string_samples') where a is NULL;
----
2048
# STRUCT column: expected to be NULL in every sampled row
statement ok
create table struct_samples as select {'key1': 'quack-a-lack', 'key2': range} a from range(204800);
query I
select count(*) from duckdb_table_sample('struct_samples') where a is null;
----
2048
# BLOB column: expected to be NULL in every sampled row
statement ok
create table blob_samples as select '\xAA\xAB\xAC'::BLOB a from range(204800);
query I
select count(*) from duckdb_table_sample('blob_samples') where a is NULL;
----
2048
# BIGINT, DOUBLE, FLOAT and HUGEINT columns (a-d) are expected to be non-NULL in every sampled row;
# the INTERVAL column (e) is expected to be NULL
statement ok
create table integral_samples as select range::BIGINT a, range::DOUBLE b, range::FLOAT c, range::HUGEINT d, INTERVAL 1 YEAR e from range(204800);
query I
select count(*) from duckdb_table_sample('integral_samples') where a NOT null;
----
2048
query I
select count(*) from duckdb_table_sample('integral_samples') where b NOT null;
----
2048
query I
select count(*) from duckdb_table_sample('integral_samples') where c NOT null;
----
2048
query I
select count(*) from duckdb_table_sample('integral_samples') where d NOT null;
----
2048
query I
select count(*) from duckdb_table_sample('integral_samples') where e IS null;
----
2048
# mixed table: the list (b), string (c) and get_current_timestamp() (d) columns are expected to be NULL in the sample
statement ok
CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(1000000);
query I
select count(*) from duckdb_table_sample('t1') where b is null;
----
2048
query I
select count(*) from duckdb_table_sample('t1') where c is null;
----
2048
query I
select count(*) from duckdb_table_sample('t1') where d is null;
----
2048

View File

@@ -0,0 +1,21 @@
# name: test/sql/sample/table_samples/test_table_sample_errors.test
# description: test table sample errors
# group: [table_samples]
mode skip
# NOTE(review): no `mode unskip` follows in this file, so presumably every record below is skipped - confirm this is intentional
statement ok
create table t1 as select range a from range(204800);
statement ok
create view v1 as select * from t1;
# asking for the sample of a view is expected to fail with an invalid catalog type error
statement error
select * from duckdb_table_sample('v1');
----
<REGEX>:.*Invalid Catalog type.*
# asking for the sample of a name that is not a table (here 'a') is expected to fail with a catalog error
statement error
select * from duckdb_table_sample('a');
----
<REGEX>:.*Catalog Error:.*Table.*does not exist.*