Commit f09560c7b1 ("should be it")
Parent: a4b23fc57c
Date: 2025-10-24 19:21:19 -05:00
14047 changed files with 3161551 additions and 1 deletion


@@ -0,0 +1,66 @@
# name: test/sql/storage/compression/zstd/fetch_row.test
# group: [zstd]
statement ok
SET storage_compatibility_version='v1.2.0'
# load the DB from disk
load __TEST_DIR__/test_zstd_compression_fetch_row.db
statement ok
CREATE TABLE big_string (
a VARCHAR,
id INT
);
statement ok
pragma force_compression='zstd'
statement ok
INSERT INTO big_string values (repeat('a', 8000), 1);
INSERT INTO big_string values (repeat('b', 10), 2);
INSERT INTO big_string values (repeat('c', 8000), 3);
INSERT INTO big_string values (repeat('d', 10), 4);
INSERT INTO big_string values (repeat('a', 8000), 1);
INSERT INTO big_string values (repeat('b', 10), 2);
INSERT INTO big_string values (repeat('c', 8000), 3);
INSERT INTO big_string values (repeat('d', 10), 4);
INSERT INTO big_string values (repeat('a', 8000), 1);
INSERT INTO big_string values (repeat('b', 10), 2);
INSERT INTO big_string values (repeat('c', 8000), 3);
INSERT INTO big_string values (repeat('d', 10), 4);
# Test with a string that is larger than a block size after compression
# uncompressed size: 3888890 compressed size: 1068813
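# (the inserted value is the list [0, 1, ..., 499999] cast to VARCHAR:
#  2888890 digits + 999998 separator characters + 2 brackets = 3888890 characters)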
statement ok
INSERT INTO big_string values (concat(range(0,500000)::VARCHAR), 5);
statement ok
INSERT INTO big_string values (repeat('f', 1), 6);
INSERT INTO big_string values (repeat('g', 8000), 7);
INSERT INTO big_string values (repeat('h', 10000), 8);
statement ok
checkpoint
restart
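# After the restart the data is read back from disk, so the values below should come from the zstd-compressed segments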
query II
SELECT a[1], strlen(a) from big_string;
----
a 8000
b 10
c 8000
d 10
a 8000
b 10
c 8000
d 10
a 8000
b 10
c 8000
d 10
[ 3888890
f 1
g 8000
h 10000


@@ -0,0 +1,33 @@
# name: test/sql/storage/compression/zstd/nulls.test
# group: [zstd]
load __TEST_DIR__/zstd_nulls.db readwrite v1.2.0
statement ok
create table tbl (
a varchar
);
statement ok
set variable my_string = (
select concat(range(0,1000)::VARCHAR)
);
statement ok
INSERT INTO tbl (a)
SELECT CASE
WHEN (i % 7) = 0 THEN NULL
ELSE getvariable('my_string') || i
END
FROM range(5000) t(i);
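# i % 7 = 0 holds for i = 0, 7, ..., 4998, i.e. 715 of the 5000 rows are NULL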
statement ok
pragma force_compression='zstd'
statement ok
checkpoint
query I
select count(*) from tbl where a IS NULL;
----
715


@@ -0,0 +1,89 @@
# name: test/sql/storage/compression/zstd/page_flushing_test1.test_slow
# group: [zstd]
load __TEST_DIR__/zstd_page_flushing_test1.db readwrite v1.2.0
statement ok
create table tbl (
a VARCHAR
);
statement ok
set variable my_string = (
select concat(range(0,500000)::VARCHAR)
);
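# getvariable('my_string') is the list [0, 1, ..., 499999] rendered as a single VARCHAR of 3888890 characters
# (the concatenated inserts below therefore produce 7777780-character values)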
statement ok
SET checkpoint_threshold = '10.0 GB';
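# The large checkpoint threshold should prevent automatic checkpoints, so compression only happens at the explicit checkpoint at the end of the test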
# Feed it enough data to actually use zstd (enough to train a dictionary)
statement ok
insert into tbl values ('aaaabbbb');
insert into tbl values ('aaaabbbb');
insert into tbl values ('aaaabbbb');
insert into tbl values ('aaaabbbb');
insert into tbl values ('aaaabbbb');
insert into tbl values ('aaaabbbb');
insert into tbl values ('aaaabbbb');
insert into tbl values ('aaaabbbb');
insert into tbl values ('aaaabbbb');
insert into tbl values ('aaaabbbb');
insert into tbl values ('aaaabbbb');
# ------- VECTOR 1 -------
# This will start on the segment page and write the remainder to the 'extra_pages[0]' buffer
statement ok
insert into tbl select getvariable('my_string');
# ------- VECTOR 2 -------
# This starts on 'extra_pages[0]' and writes onto 'extra_pages[1]'
statement ok
insert into tbl select getvariable('my_string');
# Insert 2036 (STANDARD_VECTOR_SIZE - 12) to finish the vector of values
statement ok
insert into tbl select 'bbbbaaaa' from range(2036)
# ------- VECTOR 3 -------
# This starts on 'extra_pages[1]' and writes onto 'extra_pages[0]'
statement ok
insert into tbl select getvariable('my_string');
statement ok
insert into tbl select 'bbbbaaaa' from range(2047)
# ------- VECTOR 4 -------
# This starts on 'extra_pages[0]', overflows to 'extra_pages[1]' twice
statement ok
insert into tbl select concat(getvariable('my_string'), getvariable('my_string'));
statement ok
insert into tbl select 'bbbbaaaa' from range(2047)
# ------- VECTOR 5 -------
# This starts on 'extra_pages[1]', overflows to 'extra_pages[0]' twice
statement ok
insert into tbl select concat(getvariable('my_string'), getvariable('my_string'));
statement ok
insert into tbl select 'bbbbaaaa' from range(2047)
# ------------------------
statement ok
pragma force_compression='zstd'
statement ok
checkpoint
query II
select strlen(a), count(a) from tbl group by strlen(a) order by all;
----
8 8188
3888890 3
7777780 2


@@ -0,0 +1,96 @@
# name: test/sql/storage/compression/zstd/reclaim_space_column.test_slow
# description: Test that we reclaim space when dropping columns containing overflow strings
# group: [zstd]
statement ok
SET storage_compatibility_version='v1.2.0'
load __TEST_DIR__/reclaim_space_drop_column_overflow_strings.db
statement ok
PRAGMA force_compression='ZSTD';
statement ok
PRAGMA force_checkpoint;
statement ok
CREATE TABLE strings AS SELECT i, repeat('X', case when i%17=0 then 5000 else i%7 end) AS s FROM generate_series(0,150000) tbl(i);
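# i % 17 = 0 holds for 8824 of the 150001 generated rows, so the table contains 8824 of the 5000-character strings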
statement ok
CHECKPOINT;
statement ok
CHECKPOINT;
query IIIIII
SELECT AVG(STRLEN(s)), MIN(STRLEN(S)), MAX(STRLEN(S)), SUM(STRLEN(S)), MIN(S[1]), MAX(S[1]) FROM strings
----
296.955 0 5000 44543527 (empty) X
# For smaller block sizes (16 KB), the total block count alternates between a few values across loop iterations,
# so we need to compare against a range of total block counts.
statement ok
CREATE TABLE total_blocks_tbl AS SELECT total_blocks FROM pragma_database_size();
statement ok
create type test_result as UNION(
ok BOOL,
err STRUCT(
old BIGINT,
allowed_max DECIMAL(21,1),
actual BIGINT
)
);
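# On success the query below returns a plain 'true'; on failure the struct branch of test_result reports the old, allowed, and actual block counts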
loop i 0 30
statement ok
ALTER TABLE strings DROP COLUMN s;
statement ok
ALTER TABLE strings ADD COLUMN s VARCHAR;
statement ok
UPDATE strings SET s=repeat('X', case when i%17=0 then 5000 else i%7 end);
query IIIIII
SELECT AVG(STRLEN(s)), MIN(STRLEN(S)), MAX(STRLEN(S)), SUM(STRLEN(S)), MIN(S[1]), MAX(S[1]) FROM strings
----
296.955 0 5000 44543527 (empty) X
statement ok
CHECKPOINT;
# Ensure that the total block count doesn't exceed the count recorded after the warm-up iterations by more than a factor of 1.2.
query I
SELECT
CASE WHEN ${i} < 10
THEN True::test_result
WHEN current.total_blocks <= total_blocks_tbl.total_blocks * 1.2
THEN True::test_result
ELSE {
'old': total_blocks_tbl.total_blocks,
'allowed_max': total_blocks_tbl.total_blocks * 1.2,
'actual': current.total_blocks
}::test_result
END
FROM pragma_database_size() AS current, total_blocks_tbl;
----
true
# Adjust total_blocks_tbl to the count after 10 warm-up iterations.
statement ok
UPDATE total_blocks_tbl SET total_blocks = (
SELECT CASE WHEN ${i} < 10 THEN (SELECT current.total_blocks FROM pragma_database_size() AS current)
ELSE (total_blocks) END);
restart
query IIIIII
SELECT AVG(STRLEN(s)), MIN(STRLEN(S)), MAX(STRLEN(S)), SUM(STRLEN(S)), MIN(S[1]), MAX(S[1]) FROM strings
----
296.955 0 5000 44543527 (empty) X
endloop


@@ -0,0 +1,133 @@
# name: test/sql/storage/compression/zstd/reclaim_space_table.test_slow
# description: Test that we reclaim space when dropping tables containing overflow strings
# group: [zstd]
statement ok
SET storage_compatibility_version='v1.2.0'
load __TEST_DIR__/reclaim_space_overflow_strings.db
statement ok
PRAGMA force_compression='ZSTD';
statement ok
PRAGMA force_checkpoint;
# Every 17th value has length 5000
# The other values have a length between 0 and 6
statement ok
CREATE TABLE strings AS SELECT
repeat('X', case when i%17=0 then 5000 else i%7 end) AS s
FROM generate_series(0,150000) tbl(i);
statement ok
CHECKPOINT;
statement ok
CHECKPOINT;
query IIIIII
SELECT
AVG(STRLEN(s)),
MIN(STRLEN(S)),
MAX(STRLEN(S)),
SUM(STRLEN(S)),
MIN(S[1]),
MAX(S[1])
FROM strings
----
296.955 0 5000 44543527 (empty) X
# for smaller block sizes (16 KB), the total block count alternates between a few values across loop iterations,
# so we need to compare against a range of total block counts
statement ok
CREATE TABLE total_blocks_tbl AS SELECT
total_blocks
FROM pragma_database_size();
statement ok
create type test_result as UNION(
ok BOOL,
err STRUCT(
old BIGINT,
allowed_max DECIMAL(21,1),
actual BIGINT
)
);
loop i 0 10
statement ok
DROP TABLE strings;
# Recreate the table
statement ok
CREATE TABLE strings AS SELECT
repeat('X', case when i%17=0 then 5000 else i%7 end) AS s
FROM generate_series(0,150000) tbl(i);
query IIIIII
SELECT
AVG(STRLEN(s)),
MIN(STRLEN(S)),
MAX(STRLEN(S)),
SUM(STRLEN(S)),
MIN(S[1]),
MAX(S[1])
FROM strings
----
296.955 0 5000 44543527 (empty) X
statement ok
CHECKPOINT;
# ensure that the total block count doesn't exceed the count after the first iteration
# by more than a factor of 1.2
query I
SELECT
CASE WHEN ${i} = 0
THEN True::test_result
WHEN current.total_blocks <= total_blocks_tbl.total_blocks * 1.2
THEN True::test_result
ELSE {
'old': total_blocks_tbl.total_blocks,
'allowed_max': total_blocks_tbl.total_blocks * 1.2,
'actual': current.total_blocks
}::test_result
END
FROM pragma_database_size() AS current, total_blocks_tbl;
----
true
# adjust total_blocks_tbl once to the count after the first iteration
statement ok
UPDATE total_blocks_tbl SET total_blocks = (
SELECT
CASE WHEN ${i} = 0
THEN (SELECT current.total_blocks FROM pragma_database_size() AS current)
ELSE
(total_blocks)
END
);
statement ok
CHECKPOINT;
restart
query IIIIII
SELECT
AVG(STRLEN(s)),
MIN(STRLEN(S)),
MAX(STRLEN(S)),
SUM(STRLEN(S)),
MIN(S[1]),
MAX(S[1])
FROM strings
----
296.955 0 5000 44543527 (empty) X
# i
endloop


@@ -0,0 +1,42 @@
# name: test/sql/storage/compression/zstd/test_giant_lists.test_slow
# group: [zstd]
# With the per-vector metadata taking between 24 and 32 bytes (because of padding), this equates to around 584 vectors per segment,
# which is 1_196_032 values
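# (assuming the default STANDARD_VECTOR_SIZE of 2048: 584 * 2048 = 1_196_032 values per segment)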
statement ok
SET default_block_size = '16384';
statement ok
SET storage_compatibility_version='v1.2.0'
statement ok
attach '__TEST_DIR__/giant_list.db' as db2
statement ok
USE db2;
statement ok
pragma force_compression='zstd';
statement ok
pragma checkpoint_threshold='1gb';
statement ok
set zstd_min_string_length=1;
# With 1.2m values per segment, this roughly creates 25 segments
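# (30_000_000 / 1_196_032 ≈ 25)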
statement ok
create table tbl as
select repeat('a', 30_000_000).split('') lst;
statement ok
force checkpoint;
query III
select len(lst), lst[1], lst[-1] from tbl;
----
30_000_000 a a
query I
SELECT compression FROM pragma_storage_info('tbl') WHERE segment_type = 'VARCHAR' AND compression != 'ZSTD';
----


@@ -0,0 +1,36 @@
# name: test/sql/storage/compression/zstd/test_skipping.test
# group: [zstd]
statement ok
SET storage_compatibility_version='v1.2.0'
load __TEST_DIR__/zstd_vector_skipping.db
statement ok
pragma force_compression='zstd';
set seed 0.42
statement ok
create table tbl as
select
i // 5_000 as num,
num::VARCHAR || list_reduce([uuid()::varchar for x in range(10)], lambda x, y: concat(x, y)) str
from range(20_000) t(i) order by num
statement ok
force checkpoint;
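# Each value of num covers 5_000 consecutive rows and every str starts with its num,
# so the filter on num = 1 should only return strings beginning with '1' while the scan skips the other vectors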
query I
select str[0:1]::BIGINT from tbl where num = 1 limit 10;
----
1
1
1
1
1
1
1
1
1
1


@@ -0,0 +1,30 @@
# name: test/sql/storage/compression/zstd/zstd.test
# description: Test zstd storage
# group: [zstd]
statement ok
SET storage_compatibility_version='v1.2.0'
# load the DB from disk
load __TEST_DIR__/test_zstd_storage.db
statement ok
PRAGMA force_compression = 'zstd'
statement ok
CREATE TABLE test (a VARCHAR);
statement ok
INSERT INTO test VALUES ('11'), ('11'), ('12'), (NULL);
statement ok
checkpoint;
query I
SELECT * FROM test;
----
11
11
12
NULL


@@ -0,0 +1,132 @@
# name: test/sql/storage/compression/zstd/zstd_compression_ratio.test_slow
# description: Assert that the zstd compression ratio is within reasonable margins
# group: [zstd]
require block_size 262144
statement ok
SET storage_compatibility_version='v1.2.0'
load __TEST_DIR__/test_zstd_compression_ratio.db
statement ok
set enable_logging=true;
statement ok
set logging_level='info';
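# Logging is enabled so the checkpointer's FinalAnalyze messages land in duckdb_logs and can be queried below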
statement ok
PRAGMA force_compression='zstd'
set seed 0.42
statement ok
set variable dataset_size = 100_000;
# strings of 4608 characters (128 concatenated 36-character UUIDs), making them semi-random
statement ok
create table test_compressed as (
with cte as (
select list_reduce([uuid()::varchar for x in range(128)], lambda x, y: concat(x, y)) str
from range(getvariable('dataset_size'))
)
select
str
from cte
);
statement ok
checkpoint
# We measured the ZSTD-compressed data at 230.8 MB (230801376 bytes)
query I
SELECT message FROM duckdb_logs where message.starts_with('ColumnDataCheckpointer FinalAnalyze') and message.contains('test_compressed') and message.contains('VARCHAR') order by timestamp;
----
ColumnDataCheckpointer FinalAnalyze(COMPRESSION_ZSTD) result for main.test_compressed.0(VARCHAR): 230801376
statement ok
PRAGMA force_compression='uncompressed'
set seed 0.42
statement ok
CREATE TABLE test_uncompressed as (
with cte as (
select list_reduce([uuid()::varchar for x in range(128)], lambda x, y: concat(x, y)) str
from range(getvariable('dataset_size'))
)
select
str
from cte
);
statement ok
checkpoint
# The uncompressed data would have been 462.4 MB (462400000 bytes)
query I
SELECT message FROM duckdb_logs where message.starts_with('ColumnDataCheckpointer FinalAnalyze') and message.contains('test_uncompressed') and message.contains('VARCHAR') order by timestamp;
----
ColumnDataCheckpointer FinalAnalyze(COMPRESSION_UNCOMPRESSED) result for main.test_uncompressed.0(VARCHAR): 462400000
query I
SELECT compression FROM pragma_storage_info('test_compressed') WHERE segment_type != 'VALIDITY' AND compression != 'ZSTD';
----
query I
SELECT compression FROM pragma_storage_info('test_uncompressed') WHERE segment_type != 'VALIDITY' AND compression != 'Uncompressed';
----
statement ok
CREATE TYPE test_result AS UNION (
ok BOOL,
err STRUCT(
uncompressed HUGEINT,
compressed HUGEINT,
allowed_minimum_ratio DECIMAL(2,1),
allowed_maximum_ratio DECIMAL(2,1),
actual_ratio FLOAT
)
);
# This roughly aligns with comparing Uncompressed/ZSTD FinalAnalyze results (which is ~2x)
statement ok
set variable min_ratio = 1.8;
set variable max_ratio = 2.5;
query I
WITH compressed_intermediate AS (
SELECT
1 + sum(len(additional_block_ids)) as blocks
FROM
pragma_storage_info('test_compressed')
WHERE segment_type NOT IN ('VALIDITY')
GROUP BY block_id
),
uncompressed_intermediate AS (
SELECT
1 + sum(len(additional_block_ids)) as blocks
FROM
pragma_storage_info('test_uncompressed')
WHERE segment_type NOT IN ('VALIDITY')
GROUP BY block_id
)
SELECT
CASE
WHEN (uncompressed::FLOAT / compressed::FLOAT) > getvariable('min_ratio') AND (uncompressed::FLOAT / compressed::FLOAT) < getvariable('max_ratio')
THEN True::test_result
ELSE {
'uncompressed': uncompressed,
'compressed': compressed,
'allowed_minimum_ratio': getvariable('min_ratio'),
'allowed_maximum_ratio': getvariable('max_ratio'),
'actual_ratio': uncompressed::FLOAT / compressed::FLOAT
}::test_result
END
FROM (
SELECT
(SELECT sum(blocks) FROM compressed_intermediate) as compressed,
(SELECT sum(blocks) FROM uncompressed_intermediate) AS uncompressed
) AS blocks_tbl;
----
true


@@ -0,0 +1,18 @@
# name: test/sql/storage/compression/zstd/zstd_force_compression.test
# group: [zstd]
load __TEST_DIR__/zstd_force.test readwrite v1.2.0
statement ok
SET force_compression='zstd';
statement ok
CREATE TABLE zstd_data AS SELECT concat('thisisalongstring', i) str FROM range(1000) t(i);
statement ok
checkpoint;
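# The 1000 short strings should land in a single VARCHAR segment, so exactly one segment is expected to report ZSTD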
query I
select count(*) from pragma_storage_info('zstd_data') where compression='ZSTD';
----
1


@@ -0,0 +1,24 @@
# name: test/sql/storage/compression/zstd/zstd_giant_string.test_slow
# group: [zstd]
require ram 16gb
statement ok
ATTACH '__TEST_DIR__/zstd_giant_string.db' (STORAGE_VERSION 'v1.3.2');
statement ok
CREATE TABLE zstd_giant_string.foo (bar VARCHAR USING COMPRESSION zstd);
statement ok
INSERT INTO zstd_giant_string.foo SELECT repeat('a', 10000000) FROM range(0, 500);
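# 500 values of 10,000,000 'a' characters each is roughly 5 GB of raw string data, hence the 'require ram 16gb' above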
statement ok
DETACH zstd_giant_string
statement ok
ATTACH '__TEST_DIR__/zstd_giant_string.db'
query I
select max(length(bar)) from zstd_giant_string.foo;
----
10000000