should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions
--- a/external/duckdb/test/sql/sample/bernoulli_sampling.test_slow
+++ b/external/duckdb/test/sql/sample/bernoulli_sampling.test_slow
@@ -0,0 +1,52 @@
+# name: test/sql/sample/bernoulli_sampling.test_slow
+# description: Test that repeated 1% bernoulli samples are never empty and that a seeded bernoulli sample is repeatable
+# group: [sample]
+
+statement ok
+create table output (num_rows INT);
+
+set seed 0.3
+
+loop i 0 500
+
+statement ok
+WITH some_tab AS (
+    SELECT UNNEST(range(1000)) AS id
+),
+some_tab_unq AS (
+    SELECT DISTINCT id FROM some_tab
+),
+sampled AS (
+    SELECT id FROM some_tab_unq
+    USING SAMPLE 1% (bernoulli)
+)
+INSERT INTO output SELECT count(*) AS n_rows FROM sampled;
+
+endloop
+
+
+query II
+select min(num_rows) > 0, count(*) FILTER (num_rows = 0) = 0 from output;
+----
+true	true
+
+query III
+select avg(rowid), min(rowid), max(rowid) from output where num_rows = 0;
+----
+NULL	NULL	NULL
+
+statement ok
+create table t1 as select range id from range(1000);
+
+set seed 0.6
+
+query I nosort result_1
+select id from t1 USING SAMPLE 1% (bernoulli, 5);
+----
+
+query I nosort result_1
+select id from t1 USING SAMPLE 1% (bernoulli, 5);
+----
+
+
+
--- a/external/duckdb/test/sql/sample/can_sample_from_ingested_files.test
+++ b/external/duckdb/test/sql/sample/can_sample_from_ingested_files.test
@@ -0,0 +1,44 @@
+# name: test/sql/sample/can_sample_from_ingested_files.test
+# description: Test that tables ingested from the same CSV or Parquet file return identical results
+# group: [sample]
+
+require parquet
+
+statement ok
+PRAGMA enable_verification;
+
+statement ok
+create table all_types as select * exclude(small_enum, medium_enum, large_enum, "union", bit) from test_all_types();
+
+statement ok
+copy all_types to '__TEST_DIR__/sample_all_types.csv' (FORMAT CSV);
+
+statement ok
+Create table all_types_csv_1 as select * from read_csv_auto('__TEST_DIR__/sample_all_types.csv');
+
+statement ok
+Create table all_types_csv_2 as select * from read_csv_auto('__TEST_DIR__/sample_all_types.csv');
+
+query T nosort result_1
+select * from all_types_csv_1;
+
+query T nosort result_1
+select * from all_types_csv_2;
+
+
+statement ok
+copy (SELECT * from all_types) to '__TEST_DIR__/sample_all_types.parquet' (FORMAT PARQUET);
+
+# test parquet: both tables are loaded from the same file and must return identical results
+statement ok
+Create table all_types_parquet_1 as select * from read_parquet('__TEST_DIR__/sample_all_types.parquet');
+
+statement ok
+Create table all_types_parquet_2 as select * from read_parquet('__TEST_DIR__/sample_all_types.parquet');
+
+query T nosort result_parquet
+select * from all_types_parquet_1;
+
+query T nosort result_parquet
+select * from all_types_parquet_2;
+
--- a/external/duckdb/test/sql/sample/get_multiple_samples_small.test_slow
+++ b/external/duckdb/test/sql/sample/get_multiple_samples_small.test_slow
@@ -0,0 +1,21 @@
+# name: test/sql/sample/get_multiple_samples_small.test_slow
+# description: Run a sample multiple times (internal#4236)
+# group: [sample]
+
+statement ok
+pragma memory_limit='10G';
+
+
+statement ok
+CREATE OR REPLACE TABLE blah AS (
+    SELECT *
+    FROM range(10_000_000)
+);
+
+# draw a 100-row sample from the table on every loop iteration
+loop i 0 500
+
+statement ok
+SELECT * FROM blah TABLESAMPLE 100 ROWS;
+
+endloop
--- a/external/duckdb/test/sql/sample/large_reservoir_sample.test_slow
+++ b/external/duckdb/test/sql/sample/large_reservoir_sample.test_slow
@@ -0,0 +1,14 @@
+# name: test/sql/sample/large_reservoir_sample.test_slow
+# description: Test reservoir sample crash on large data sets
+# group: [sample]
+
+statement ok
+PRAGMA enable_verification;
+
+statement ok
+create table integers as from range(1000000);
+
+query I
+select count(*) from integers USING SAMPLE 99.0% (Reservoir);
+----
+990000
--- a/external/duckdb/test/sql/sample/large_sample.test_slow
+++ b/external/duckdb/test/sql/sample/large_sample.test_slow
@@ -0,0 +1,40 @@
+# name: test/sql/sample/large_sample.test_slow
+# description: Test sampling of larger relations
+# group: [sample]
+
+statement ok
+PRAGMA enable_verification;
+
+# sample on a larger data set
+query I
+SELECT COUNT(*) FROM range(10000) USING SAMPLE 5
+----
+5
+
+# test sample with multiple columns
+# we insert the same data in the entire column
+statement ok
+CREATE TABLE test2 AS SELECT i a, i::VARCHAR b, CONCAT(i, ' - ', i) c FROM repeat(1, 1000) tbl(i)
+
+query III
+SELECT a, b, c FROM test2 USING SAMPLE 3;
+----
+1	1	1 - 1
+1	1	1 - 1
+1	1	1 - 1
+
+# reservoir sample from a larger dataset
+query I
+select count(*) from range(200000) tablesample reservoir(90%);
+----
+180000
+
+loop i 0 3
+
+# sample_size sampling with a large reservoir
+query I nosort reservoirlarge
+select count(*) from (select * from range(200000) tbl(i) where i % 997 != 0) tbl(i) using sample 80% (reservoir);
+----
+
+endloop
+
--- a/external/duckdb/test/sql/sample/reservoir_testing_percentage.test
+++ b/external/duckdb/test/sql/sample/reservoir_testing_percentage.test
@@ -0,0 +1,85 @@
+# name: test/sql/sample/reservoir_testing_percentage.test
+# description: Test SAMPLE keyword
+# group: [sample]
+
+loop i 1 8
+
+statement ok
+pragma threads=${i};
+
+statement ok
+CREATE or replace TABLE t1 as select range a from range(1000);
+
+query I
+SELECT count(*) from t1 using sample 0 percent (reservoir);
+----
+0
+
+query I
+SELECT count(*) from t1 using sample 10 percent (reservoir);
+----
+100
+
+query I
+SELECT count(*) from t1 using sample 20 percent (reservoir);
+----
+200
+
+query I
+SELECT count(*) from t1 using sample 80 percent (reservoir);
+----
+800
+
+query I
+SELECT count(*) from t1 using sample 100 percent (reservoir);
+----
+1000
+
+
+statement ok
+Insert into t1 select range a from range(9000);
+
+query I
+select count(*) from t1 using sample 80 percent (reservoir);
+----
+8000
+
+statement ok
+Insert into t1 select range a from range(90000);
+
+
+statement ok
+Insert into t1 select range a from range(900000);
+
+query I
+select count(*) from t1 using sample 20 percent (reservoir);
+----
+200000
+
+query I
+select count(*) from t1 using sample 30 percent (reservoir);
+----
+300000
+
+query I
+select count(*) from t1 using sample 40 percent (reservoir);
+----
+400000
+
+query I
+select count(*) from t1 using sample 50 percent (reservoir);
+----
+500000
+
+
+query I
+select count(*) from t1 using sample 60 percent (reservoir);
+----
+600000
+
+query I
+select count(*) from t1 using sample 70 percent (reservoir);
+----
+700000
+
+endloop
--- a/external/duckdb/test/sql/sample/reservoir_testing_percentage.test_slow
+++ b/external/duckdb/test/sql/sample/reservoir_testing_percentage.test_slow
@@ -0,0 +1,79 @@
+# name: test/sql/sample/reservoir_testing_percentage.test_slow
+# description: Test SAMPLE keyword
+# group: [sample]
+
+loop i 1 8
+
+statement ok
+pragma threads=${i};
+
+statement ok
+CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(1000);
+
+query I
+SELECT count(*) from t1 using sample 0 percent (reservoir);
+----
+0
+
+query I
+SELECT count(*) from t1 using sample 10 percent (reservoir);
+----
+100
+
+query I
+SELECT count(*) from t1 using sample 20 percent (reservoir);
+----
+200
+
+query I
+SELECT count(*) from t1 using sample 80 percent (reservoir);
+----
+800
+
+query I
+SELECT count(*) from t1 using sample 100 percent (reservoir);
+----
+1000
+
+statement ok
+CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(10000);
+
+query I
+select count(*) from t1 using sample 80 percent (reservoir);
+----
+8000
+
+statement ok
+CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(1000000);
+
+query I
+select count(*) from t1 using sample 20 percent (reservoir);
+----
+200000
+
+query I
+select count(*) from t1 using sample 30 percent (reservoir);
+----
+300000
+
+query I
+select count(*) from t1 using sample 40 percent (reservoir);
+----
+400000
+
+query I
+select count(*) from t1 using sample 50 percent (reservoir);
+----
+500000
+
+query I
+select count(*) from t1 using sample 60 percent (reservoir);
+----
+600000
+
+query I
+select count(*) from t1 using sample 70 percent (reservoir);
+----
+700000
+
+endloop
--- a/external/duckdb/test/sql/sample/reservoir_testing_rows_value.test_slow
+++ b/external/duckdb/test/sql/sample/reservoir_testing_rows_value.test_slow
@@ -0,0 +1,96 @@
+# name: test/sql/sample/reservoir_testing_rows_value.test_slow
+# description: Test SAMPLE keyword
+# group: [sample]
+
+loop i 1 8
+
+statement ok
+pragma threads=${i};
+
+statement ok
+CREATE or replace TABLE t1 as select range a from range(1000);
+
+query I
+SELECT count(*) from t1 using sample 0;
+----
+0
+
+query I
+SELECT count(*) from t1 using sample 100;
+----
+100
+
+query I
+SELECT count(*) from t1 using sample 200;
+----
+200
+
+query I
+SELECT count(*) from t1 using sample 800;
+----
+800
+
+
+query I
+SELECT count(*) from t1 using sample 1000;
+----
+1000
+
+
+statement ok
+create or replace table t1 as select * from range(10000);
+
+query I
+select count(*) from t1 using sample 1000;
+----
+1000
+
+query I
+select count(*) from t1 using sample 3000;
+----
+3000
+
+query I
+select count(*) from t1 using sample 6000;
+----
+6000
+
+query I
+select count(*) from t1 using sample 8000;
+----
+8000
+
+statement ok
+Create or replace table t1  as select range a from range(1000000);
+
+query I
+select count(*) from t1 using sample 200000;
+----
+200000
+
+query I
+select count(*) from t1 using sample 300000;
+----
+300000
+
+query I
+select count(*) from t1 using sample 400000;
+----
+400000
+
+query I
+select count(*) from t1 using sample 500000;
+----
+500000
+
+query I
+select count(*) from t1 using sample 600000;
+----
+600000
+
+query I
+select count(*) from t1 using sample 700000;
+----
+700000
+
+endloop
--- a/external/duckdb/test/sql/sample/same_seed_same_sample.test
+++ b/external/duckdb/test/sql/sample/same_seed_same_sample.test
@@ -0,0 +1,27 @@
+# name: test/sql/sample/same_seed_same_sample.test
+# description: Test same seed same sample for system sample
+# group: [sample]
+
+require vector_size 2048
+
+statement ok
+CREATE OR REPLACE TABLE test AS SELECT UNNEST(RANGE(100000)) as x;
+
+loop i 0 20
+
+query II
+SELECT COUNT(*), min(x) FROM test TABLESAMPLE system (25 PERCENT) REPEATABLE (42);
+----
+20480	12288
+
+endloop
+
+
+loop i 0 20
+
+query II
+SELECT COUNT(*), min(x) FROM test TABLESAMPLE BERNOULLI (25 PERCENT) REPEATABLE (42);
+----
+24903	6
+
+endloop
--- a/external/duckdb/test/sql/sample/same_seed_same_sample.test_slow
+++ b/external/duckdb/test/sql/sample/same_seed_same_sample.test_slow
@@ -0,0 +1,73 @@
+# name: test/sql/sample/same_seed_same_sample.test_slow
+# description: Test SAMPLE keyword
+# group: [sample]
+
+# testing a table with lower cardinality than the standard vector size
+
+statement ok
+CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(1000);
+
+loop i 1 8
+
+statement ok
+pragma threads=${i};
+
+query III nosort result_1
+SELECT * from t1 using sample reservoir(100) repeatable (1) order by a;
+----
+
+
+query III nosort result_1
+SELECT * from t1 using sample reservoir(100) repeatable (1) order by a;
+----
+
+endloop
+
+# testing a table with greater cardinality than the standard vector size, and greater than a row group size.
+
+statement ok
+CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(100000);
+
+# samples are only equal when threads = 1
+
+statement ok
+set threads=1;
+
+query III nosort result_2
+SELECT * from t1 using sample reservoir(6000) repeatable (1) order by a;
+----
+
+
+query III nosort result_2
+SELECT * from t1 using sample reservoir(6000) repeatable (1) order by a;
+----
+
+statement ok
+CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(1000000);
+
+loop i 1 8
+
+query III nosort result_3
+SELECT * from t1 using sample reservoir(60) repeatable (1) order by a;
+----
+
+
+query III nosort result_3
+SELECT * from t1 using sample reservoir(60) repeatable (1) order by a;
+----
+
+endloop
+
+# with no repeatable, then sample is not the same
+
+statement ok
+create table sample1 as SELECT * from t1 using sample reservoir(10) order by a;
+
+statement ok
+create table sample2 as SELECT * from t1 using sample reservoir(10) order by a;
+
+query I
+select count(*) < 10 from (select * from sample1 intersect select * from sample2);
+----
+true
+
--- a/external/duckdb/test/sql/sample/same_seed_same_sample_vec_size_2.test
+++ b/external/duckdb/test/sql/sample/same_seed_same_sample_vec_size_2.test
@@ -0,0 +1,28 @@
+# name: test/sql/sample/same_seed_same_sample_vec_size_2.test
+# description: Test same seed same sample for system sample
+# group: [sample]
+
+require exact_vector_size 2
+
+statement ok
+CREATE OR REPLACE TABLE test AS SELECT UNNEST(RANGE(100000)) as x;
+
+loop i 0 20
+
+query II
+SELECT COUNT(*), min(x) FROM test TABLESAMPLE system (25 PERCENT) REPEATABLE (42);
+----
+24952	12
+
+endloop
+
+
+loop i 0 20
+
+query II
+SELECT COUNT(*), min(x) FROM test TABLESAMPLE BERNOULLI (25 PERCENT) REPEATABLE (42);
+----
+24903	6
+
+endloop
+
--- a/external/duckdb/test/sql/sample/sample_verification.test_slow
+++ b/external/duckdb/test/sql/sample/sample_verification.test_slow
@@ -0,0 +1,43 @@
+# name: test/sql/sample/sample_verification.test_slow
+# description: Test SAMPLE keyword
+# group: [sample]
+
+statement ok
+PRAGMA enable_verification;
+
+# verify that it is a sample without replacement (i.e. the same row will never occur more than once in the result)
+loop i 0 10
+
+query I
+select count(distinct i) from range(10) tbl(i) using sample 5;
+----
+5
+
+endloop
+
+# specifying a seed leads to repeatable behavior
+loop i 0 10
+
+query I nosort reservoirseed
+select * from range(100) using sample 10 (reservoir, 250)
+----
+
+query I nosort bernoulliseed
+select * from range(100) using sample 10% (bernoulli, 250)
+----
+
+query I nosort systemseed
+select * from range(100) using sample 10% (system, 250)
+----
+
+endloop
+
+# specify as sample_size, with reservoir sampling this should give us an exact count (i.e. always 10)
+loop i 0 10
+
+query I
+select count(*) from range(100) using sample 10% (reservoir)
+----
+10
+
+endloop
--- a/external/duckdb/test/sql/sample/table_samples/basic_sample_tests.test
+++ b/external/duckdb/test/sql/sample/table_samples/basic_sample_tests.test
@@ -0,0 +1,88 @@
+# name: test/sql/sample/table_samples/basic_sample_tests.test
+# group: [table_samples]
+
+mode skip
+
+# currently require fixed vector size since the "randomness" of the sample depends on
+# the vector size. If the vector size decreases, the randomness of the sample decreases
+# This is especially noticeable for small tables and their samples
+require vector_size 2048
+
+statement ok
+PRAGMA enable_verification
+
+load __TEST_DIR__/test_samples_basic.db
+
+query I
+select count(*) from range(100000) using sample (10000);
+----
+10000
+
+query I
+select count(*) from range(100) using sample (10);
+----
+10
+
+query I
+select count(*) from range(205000) using sample (10000);
+----
+10000
+
+statement ok
+create table t1 as select range a from range(204800);
+
+statement ok
+select * from duckdb_table_sample('t1');
+
+statement ok
+create or replace table t1 as select range a from range(1000);
+
+query II
+select avg(a) > 200, avg(a) < 800 from duckdb_table_sample('t1');
+----
+true	true
+
+statement ok
+create or replace table t1 as select range a from range(204800);
+
+# average is not skewed
+query II
+select avg(a) > (0.2*204800), avg(a) < (0.8*204800) from duckdb_table_sample('t1');
+----
+true	true
+
+# about half the samples are below 102400 and half above
+query I
+select count(*) from duckdb_table_sample('t1') where a < 102400;
+----
+1069
+
+query I
+select count(*) from duckdb_table_sample('t1') where a > 102400;
+----
+979
+
+query I
+select count(*) from t1 using sample (200000);
+----
+200000
+
+statement ok
+create or replace table materialized_range as select * from range(100);
+
+statement ok
+create or replace table integers_1 as (select range b from materialized_range);
+
+query I
+select count(b) from duckdb_table_sample('integers_1') where b in (select * from materialized_range);
+----
+1
+
+# sample exists after restart
+restart
+
+query I
+select count(b) from duckdb_table_sample('integers_1') where b in (select * from materialized_range);
+----
+1
+
--- a/external/duckdb/test/sql/sample/table_samples/sample_stores_rows_from_later_on.test_slow
+++ b/external/duckdb/test/sql/sample/table_samples/sample_stores_rows_from_later_on.test_slow
@@ -0,0 +1,40 @@
+# name: test/sql/sample/table_samples/sample_stores_rows_from_later_on.test_slow
+# description: Test that the table sample also stores rows that are inserted later on
+# group: [table_samples]
+
+mode skip
+
+# required when testing table samples. See basic_sample_tests.test
+require vector_size 2048
+
+load __TEST_DIR__/test_sample_conversion.db
+
+statement ok
+PRAGMA enable_verification
+
+statement ok
+create table t1 as select 1 a from range(200000);
+
+loop i 1 4805
+
+statement ok
+INSERT INTO t1 VALUES(${i} + 1);
+
+restart
+
+endloop
+
+query I
+select count(*) from duckdb_table_sample('t1');
+----
+2048
+
+query I
+select count(*) from duckdb_table_sample('t1') where a > 1;
+----
+48
+
+query I
+select count(*) from (select (floor(range/200000))::INT a from range(204800) using sample reservoir (1%)) t1 where a >= 1;
+----
+48
--- a/external/duckdb/test/sql/sample/table_samples/table_sample_converts_to_block_sample.test
+++ b/external/duckdb/test/sql/sample/table_samples/table_sample_converts_to_block_sample.test
@@ -0,0 +1,57 @@
+# name: test/sql/sample/table_samples/table_sample_converts_to_block_sample.test
+# description: Test that a table sample collects 1% of the table until it holds 2048 values and then stays at that size
+# group: [table_samples]
+
+mode skip
+
+# required when testing table samples. See basic_sample_tests.test
+require vector_size 2048
+
+# table samples first collect only 1% of the table, until the table has a cardinality of 2048.
+# then the sample stays at a fixed 2048 values.
+
+load __TEST_DIR__/test_sample_converts_after_load.db
+
+statement ok
+create table materialized_range as select 1 a from range(102400);
+
+# only 1% of 102400
+query I
+select count(*) from duckdb_table_sample('materialized_range');
+----
+1024
+
+restart
+
+statement ok
+insert into materialized_range select 2 a from range(102400);
+
+# collect another 1% of 102400
+query I
+select count(*) from duckdb_table_sample('materialized_range');
+----
+2048
+
+query II
+select a, count(*) from duckdb_table_sample('materialized_range') group by all order by a;
+----
+1	1024
+2	1024
+
+# insert another
+statement ok
+insert into materialized_range select 3 a from range(102400);
+
+# sample remains at 2048 values
+query I
+select count(*) from duckdb_table_sample('materialized_range');
+----
+2048
+
+# 2048 / 3 = 682, so each value should occur more than 650 times
+query II
+select a, count(*) > 650 from duckdb_table_sample('materialized_range') group by all order by a;
+----
+1	1
+2	1
+3	1
--- a/external/duckdb/test/sql/sample/table_samples/table_sample_is_stored.test_slow
+++ b/external/duckdb/test/sql/sample/table_samples/table_sample_is_stored.test_slow
@@ -0,0 +1,148 @@
+# name: test/sql/sample/table_samples/table_sample_is_stored.test_slow
+# description: Test that table samples are stored, updated on inserts and still present after a restart
+# group: [table_samples]
+
+mode skip
+
+# required when testing table samples. See basic_sample_tests.test
+require vector_size 2048
+
+require icu
+
+load __TEST_DIR__/test_samples.db
+
+statement ok
+PRAGMA enable_verification
+
+statement ok
+create table materialized_range as select * from range(5000000);
+
+statement ok
+create table integers_1 as (select (range + 5) a, range b, get_current_time() as time from materialized_range);
+
+query II nosort result_1
+select a::INT, b from duckdb_table_sample('integers_1') order by all;
+----
+
+statement ok
+create table integers_2 as (select (range + 5) a, range b, get_current_time() as time from materialized_range);
+
+## samples should be the same given the same table and the same contents.
+query II nosort result_1
+select a::INT, b from duckdb_table_sample('integers_2') order by all;
+----
+
+statement ok
+create or replace table integers_1 as (select (range + 5) a, range b from materialized_range);
+
+statement ok
+create or replace table integers_2 as (select (range + 5) a, range b from materialized_range);
+
+# sample only has values in the table it was sampled from
+query I
+select count(*) from (select b from duckdb_table_sample('integers_1') intersect (select b from integers_1));
+----
+2048
+
+query I
+select count(*) from (select b from duckdb_table_sample('integers_2') intersect (select b from integers_2));
+----
+2048
+
+# sample exists after restart
+restart
+
+query I
+select count(*) from duckdb_table_sample('integers_1');
+----
+2048
+
+query I
+select count(*) from duckdb_table_sample('integers_2');
+----
+2048
+
+
+query II
+select floor(b / 1000000) as interval, count(*) as frequency from duckdb_table_sample('integers_1') group by interval order by all;
+----
+0.0	453
+1.0	408
+2.0	406
+3.0	404
+4.0	377
+
+
+# adding another interval should subtract an equal number from the rest of the intervals
+statement ok
+insert into integers_1 (select (range + 5) a, range b from range(5000000,6000000));
+
+query II
+select floor(b / 1000000) as interval, count(*) as frequency from duckdb_table_sample('integers_1') group by interval order by all;
+----
+0.0	374
+1.0	334
+2.0	332
+3.0	334
+4.0	311
+5.0	363
+
+# If double the table count is appended, around half the sample should account for the new values.
+statement ok
+insert into integers_1 (select -1, -1 from range(6000000));
+
+query I
+select count(*) from integers_1;
+----
+12000000
+
+
+## about half of the samples should have the pair '-1', '-1'.
+# on the latest storage test it's something like 997
+query I
+select count(*) from duckdb_table_sample('integers_1') where a = -1 and b = -1;
+----
+914
+
+restart
+
+# updated sample is also newly serialized
+query I
+select count(*) from duckdb_table_sample('integers_1') where a = -1 and b = -1;
+----
+914
+
+# create a view on top of the sample
+statement ok
+create view sample_view as select * from duckdb_table_sample('integers_1');
+
+# update the sample
+statement ok
+insert into integers_1 (select -2, -2 from range(6000000));
+
+
+# 2048 / 3 = 682 (639 is good)
+query I
+select count(*) from sample_view where a = -2 and b = -2;
+----
+639
+
+restart
+
+query I
+select count(*)  from sample_view where a = -2 and b = -2;
+----
+639
+
+# currently have 18_000_000 values in the table.
+# to try and get 1 value in the sample, we should add
+# 18000000 / 2048 = 8789 values to see 1
+
+statement ok
+insert into integers_1 (select -3, -3 from range(7000));
+
+# 1 value makes it
+query I
+select count(*) from sample_view where a = -3 and b = -3;
+----
+1
--- a/external/duckdb/test/sql/sample/table_samples/test_sample_is_destroyed_on_updates.test
+++ b/external/duckdb/test/sql/sample/table_samples/test_sample_is_destroyed_on_updates.test
@@ -0,0 +1,125 @@
+# name: test/sql/sample/table_samples/test_sample_is_destroyed_on_updates.test
+# description: Test that table samples are destroyed on deletes, updates and schema changes
+# group: [table_samples]
+
+mode skip
+
+# required when testing table samples. See basic_sample_tests.test
+require vector_size 2048
+
+load __TEST_DIR__/test_sample_is_destroyed_on_update.db
+
+statement ok
+create or replace table integers_1 as select range a, range+1 b  from range(102400);
+
+# the sample holds 1% of the table (1024 of 102400 rows)
+query I
+select count(*) from duckdb_table_sample('integers_1') order by all;
+----
+1024
+
+statement ok
+delete from integers_1 where a = 3;
+
+# sample no longer exists
+query I
+select count(*) from duckdb_table_sample('integers_1') order by all;
+----
+0
+
+statement ok
+create or replace table integers_1 as select range a, range+1 b  from range(102400);
+
+query I
+select count(*) from duckdb_table_sample('integers_1');
+----
+1024
+
+statement ok
+update integers_1 set a = 5 where a = 1;
+
+query II
+select * from duckdb_table_sample('integers_1');
+----
+
+# test adding columns destroys the sample.
+statement ok
+create or replace table integers_1 as select range a, range+1 b  from range(204800);
+
+query I
+select count(*) from duckdb_table_sample('integers_1');
+----
+2048
+
+statement ok
+Alter table integers_1 add column c DOUBLE;
+
+query III
+select * from duckdb_table_sample('integers_1');
+----
+
+
+# test altering types destroys the sample
+statement ok
+create or replace table integers_1 as select range a, range+1 b  from range(102400);
+
+
+# don't have enough samples yet.
+query I
+select count(*) from duckdb_table_sample('integers_1');
+----
+1024
+
+statement ok
+Alter table integers_1 alter b TYPE VARCHAR
+
+query II
+select * from duckdb_table_sample('integers_1');
+----
+
+# test dropping a column
+statement ok
+create or replace table integers_1 as select range a, range+1 b  from range(102400);
+
+query I
+select count(*) from duckdb_table_sample('integers_1');
+----
+1024
+
+statement ok
+Alter table integers_1 drop b;
+
+query I
+select * from duckdb_table_sample('integers_1');
+----
+
+# test that a destroyed sample stays destroyed after a restart
+statement ok
+create or replace table integers_1 as select range a, range+1 b  from range(500);
+
+query I
+select count(*) from duckdb_table_sample('integers_1');
+----
+5
+
+statement ok
+Alter table integers_1 drop b;
+
+# sample is destroyed
+query I
+select * from duckdb_table_sample('integers_1');
+----
+
+restart
+
+statement ok
+insert into integers_1 select range a from range(500);
+
+# sample is still destroyed
+query I
+select * from duckdb_table_sample('integers_1');
+----
+
+
+
+
--- a/external/duckdb/test/sql/sample/table_samples/test_sample_types.test
+++ b/external/duckdb/test/sql/sample/table_samples/test_sample_types.test
@@ -0,0 +1,80 @@
+# name: test/sql/sample/table_samples/test_sample_types.test
+# description: Test table samples on tables with different column types
+# group: [table_samples]
+
+mode skip
+
+# test valid sampling types (for now only integral types)
+
+statement ok
+pragma enable_verification;
+
+statement ok
+create table string_samples as select range::Varchar a from range(204800);
+
+query I
+select count(*) from duckdb_table_sample('string_samples') where a is NULL;
+----
+2048
+
+statement ok
+create table struct_samples as select {'key1': 'quack-a-lack', 'key2': range} a from range(204800);
+
+query I
+select count(*) from duckdb_table_sample('struct_samples') where a is null;
+----
+2048
+
+statement ok
+create table blob_samples as select '\xAA\xAB\xAC'::BLOB a from range(204800);
+
+query I
+select count(*) from duckdb_table_sample('blob_samples') where a is NULL;
+----
+2048
+
+statement ok
+create table integral_samples as select range::BIGINT a, range::DOUBLE b, range::FLOAT c, range::HUGEINT d, INTERVAL 1 YEAR e from range(204800);
+
+query I
+select count(*) from duckdb_table_sample('integral_samples') where a NOT null;
+----
+2048
+
+query I
+select count(*) from duckdb_table_sample('integral_samples') where b NOT null;
+----
+2048
+
+query I
+select count(*) from duckdb_table_sample('integral_samples') where c NOT null;
+----
+2048
+
+query I
+select count(*) from duckdb_table_sample('integral_samples') where d NOT null;
+----
+2048
+
+query I
+select count(*) from duckdb_table_sample('integral_samples') where e IS null;
+----
+2048
+
+statement ok
+CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(1000000);
+
+query I
+select count(*) from duckdb_table_sample('t1') where b is null;
+----
+2048
+
+query I
+select count(*) from duckdb_table_sample('t1') where c is null;
+----
+2048
+
+query I
+select count(*) from duckdb_table_sample('t1') where d is null;
+----
+2048
--- a/external/duckdb/test/sql/sample/table_samples/test_table_sample_errors.test
+++ b/external/duckdb/test/sql/sample/table_samples/test_table_sample_errors.test
@@ -0,0 +1,21 @@
+# name: test/sql/sample/table_samples/test_table_sample_errors.test
+# description: test table sample errors
+# group: [table_samples]
+
+mode skip
+
+statement ok
+create table t1 as select range a from range(204800);
+
+statement ok
+create view v1 as select * from t1;
+
+statement error
+select * from duckdb_table_sample('v1');
+----
+<REGEX>:.*Invalid Catalog type.*
+
+statement error
+select * from duckdb_table_sample('a');
+----
+<REGEX>:.*Catalog Error:.*Table.*does not exist.*
--- a/external/duckdb/test/sql/sample/test_sample.test_slow
+++ b/external/duckdb/test/sql/sample/test_sample.test_slow
@@ -0,0 +1,235 @@
+# name: test/sql/sample/test_sample.test_slow
+# description: Test SAMPLE keyword
+# group: [sample]
+
+statement ok
+PRAGMA enable_verification;
+
+statement ok
+CREATE TABLE test (a INTEGER, b INTEGER);
+
+statement ok
+INSERT INTO test VALUES (11, 22), (12, 21), (13, 22)
+
+# test various limits using count
+query I
+SELECT COUNT(*) FROM test USING SAMPLE 0
+----
+0
+
+query I
+SELECT COUNT(*) FROM test USING SAMPLE 1
+----
+1
+
+query I
+SELECT COUNT(*) FROM test USING SAMPLE 1 ROWS
+----
+1
+
+query I
+SELECT COUNT(*) FROM test USING SAMPLE 3
+----
+3
+
+# sample size exceeds input
+query I
+SELECT COUNT(*) FROM test USING SAMPLE 10
+----
+3
+
+# specify sample
+query I
+SELECT COUNT(*) FROM test USING SAMPLE 3 (reservoir)
+----
+3
+
+# specify seed
+query I
+SELECT COUNT(*) FROM test USING SAMPLE 3 (reservoir, 3)
+----
+3
+
+query II
+SELECT * FROM test USING SAMPLE 10 ORDER BY a, b
+----
+11	22
+12	21
+13	22
+
+# sample on a larger data set
+query I
+SELECT COUNT(*) FROM range(10000) USING SAMPLE 5
+----
+5
+
+# sample on a large data set over RESERVOIR_THRESHOLD = 100000
+query I
+SELECT COUNT(*) FROM range(2000000) USING SAMPLE 1000100
+----
+1000100
+
+
+query I
+SELECT COUNT(*) FROM range(2000000) USING SAMPLE 2
+----
+2
+
+# test sample with multiple columns
+# we insert the same data in the entire column
+statement ok
+CREATE TABLE test2 AS SELECT i a, i::VARCHAR b, CONCAT(i, ' - ', i) c FROM repeat(1, 1000) tbl(i)
+
+query III
+SELECT a, b, c FROM test2 USING SAMPLE 3;
+----
+1	1	1 - 1
+1	1	1 - 1
+1	1	1 - 1
+
+# sample in scalar subqueries
+query I
+SELECT (SELECT COUNT(*) FROM test USING SAMPLE 1);
+----
+1
+
+query I
+SELECT (SELECT COUNT(*) + tbl.i FROM test USING SAMPLE 1) FROM range(3) tbl(i) ORDER BY i;
+----
+1
+2
+3
+
+# negative sample size not allowed
+statement error
+SELECT COUNT(*) FROM test USING SAMPLE -1
+----
+
+# must be a number
+statement error
+SELECT COUNT(*) FROM test USING SAMPLE 'hello'
+----
+
+statement error
+SELECT COUNT(*) FROM test USING SAMPLE DATE '1992-01-01'
+----
+
+# we can also use postgres/sqlserver-style tablesample syntax
+statement ok
+create table integers as select i from range(200) tbl(i);
+
+# default is sample_size, which follows postgres syntax rules
+query I
+select count(*) from integers tablesample reservoir(10);
+----
+10
+
+query I
+select count(*) from integers tablesample reservoir(10%);
+----
+20
+
+query I
+select count(*) from integers tablesample reservoir(10 percent);
+----
+20
+
+query I
+select count(*) from integers tablesample reservoir(10 rows);
+----
+10
+
+# we can also use the default sampling method
+query I
+select count(*) from integers tablesample(10 rows);
+----
+10
+
+# we can use our sampling syntax here as well
+query I
+select count(*) from integers tablesample 10;
+----
+10
+
+query I
+select count(*) from integers tablesample 10 rows (reservoir);
+----
+10
+
+query I
+select count(*) from integers tablesample 10 rows (reservoir, 250);
+----
+10
+
+# we can also use this with table-producing functions
+query I
+select count(*) from range(200) tablesample reservoir(10%);
+----
+20
+
+# and subqueries
+query I
+select count(*) from (select * from range(200)) tbl(i) tablesample reservoir(10%);
+----
+20
+
+# specifying a seed leads to repeatable behavior
+loop i 0 10
+
+query I nosort reservoirseed
+select * from range(100) tablesample reservoir(10 rows) repeatable(250)
+----
+
+query I nosort bernoulliseed
+select * from range(100) tablesample bernoulli(10%) repeatable(250)
+----
+
+query I nosort systemseed
+select * from range(100) tablesample system(10%) repeatable(250)
+----
+
+endloop
+
+query I
+select count(*) from range(1000) using sample reservoir(0.01%);
+----
+0
+
+query I
+select count(*) from range(1000) using sample reservoir(0.1%);
+----
+1
+
+# cannot use bernoulli or system sampling with X number of rows
+statement error
+select * from integers using sample bernoulli(5 rows);
+----
+
+statement error
+select * from integers using sample system(5 rows);
+----
+
+# sample_size is out of range
+statement error
+select * from integers using sample 10000%;
+----
+
+query I noresult repeatable_seed_0
+select i from integers using sample (1 rows) repeatable (0);
+----
+96
+
+query I noresult repeatable_seed_0
+select i from integers using sample (1 rows) repeatable (0);
+----
+
+
+query I noresult repeatable_seed_1
+select i from integers using sample reservoir(1%) repeatable (0) order by i;
+----
+
+query I noresult repeatable_seed_1
+select i from integers using sample reservoir(1%) repeatable (0) order by i;
+----
+58
+127
--- a/external/duckdb/test/sql/sample/test_sampling_stats.test_slow
+++ b/external/duckdb/test/sql/sample/test_sampling_stats.test_slow
@@ -0,0 +1,35 @@
+# name: test/sql/sample/test_sampling_stats.test_slow
+# description: Test SAMPLE keyword
+# group: [sample]
+
+statement ok
+PRAGMA enable_verification;
+
+set seed 0.42
+
+query I
+select (avg::DOUBLE between 550000 and 650000) and (min::INT < 10000) and (max::INT > 1190000)
+  and (q25::INT between 230000 and 370000) and (q50::INT between 530000 and 670000) 
+  and (q75::INT between 830000 and 970000) and (count::INT = 1500)
+  and (std::DOUBLE between 300000 and 400000)
+from (summarize select * from generate_series(1,1200000) using sample 1500 (reservoir));
+----
+true
+
+query I
+select (avg::DOUBLE between 580000 and 620000) and (min::INT < 1000) and (max::INT > 1199000)
+  and (q25::INT between 280000 and 320000) and (q50::INT between 580000 and 620000) 
+  and (q75::INT between 880000 and 920000) and (count::INT = 100000)
+  and (std::DOUBLE between 320000 and 370000)
+from (summarize select * from generate_series(1,1200000) using sample 100000 (reservoir));
+----
+true
+
+query I
+select (avg::DOUBLE between 580000 and 620000) and (min::INT < 1000) and (max::INT > 1199000)
+  and (q25::INT between 280000 and 320000) and (q50::INT between 580000 and 620000) 
+  and (q75::INT between 880000 and 920000) and (count::INT = 400000)
+  and (std::DOUBLE between 320000 and 370000)
+from (summarize select * from generate_series(1,1200000) using sample 400000 (reservoir));
+----
+true