should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

View File

@@ -0,0 +1,93 @@
# name: test/sql/storage/compression/roaring/fetch_row.test
# description: Check that roaring-compressed validity data reads back correctly after a restart, for each container type
# group: [roaring]

require block_size 262144

# load the DB from disk
load __TEST_DIR__/test_roaring_compression_fetch_row.db readwrite v1.2.0

statement ok
CREATE TABLE test (
	a INT
);

statement ok
pragma force_compression='roaring'

# Array Container
# Every 25th row is non-null: 400 of the 10000 rows
statement ok
INSERT INTO test SELECT case when i%25=0 then 1337 else null end FROM range(0,10000) tbl(i);

statement ok
checkpoint

restart

# Lists VALIDITY segments that are NOT Roaring-compressed; the expected result is empty
query I
SELECT compression FROM pragma_storage_info('test') WHERE segment_type ILIKE 'VALIDITY' and compression != 'Roaring';
----

query I
select count(*) from test WHERE a IS NOT NULL;
----
400

# 400 * 1337 = 534800
query III
select sum(a), min(a), max(a) from test;
----
534800	1337	1337

statement ok
delete from test;

# Run Container
# AND binds tighter than OR, so a row is NULL when i = 0, when 1 <= i % 512 <= 349, or when i % 512 >= 451.
# Non-null rows are therefore i % 512 = 0 (except i = 0) and 350 <= i % 512 <= 450.
statement ok
INSERT INTO test SELECT case when i = 0 or (i % 512 != 0 and (i % 512) < 350 or (i % 512) > 450) then null else 1337 end FROM range(0,10000) tbl(i);

statement ok
checkpoint

restart

# Lists VALIDITY segments that are NOT Roaring-compressed; the expected result is empty
query I
SELECT compression FROM pragma_storage_info('test') WHERE segment_type ILIKE 'VALIDITY' and compression != 'Roaring';
----

query I
select count(*) from test WHERE a IS NOT NULL;
----
1938

# 1938 * 1337 = 2591106
query III
select sum(a), min(a), max(a) from test;
----
2591106	1337	1337

statement ok
delete from test;

# Bitset Container
# Every 3rd row is non-null: 3334 of the 10000 rows
statement ok
INSERT INTO test SELECT case when i%3=0 then 1337 else null end FROM range(0,10000) tbl(i);

statement ok
checkpoint

restart

# Lists VALIDITY segments that are NOT Roaring-compressed; the expected result is empty
query I
SELECT compression FROM pragma_storage_info('test') WHERE segment_type ILIKE 'VALIDITY' and compression != 'Roaring';
----

query I
select count(*) from test WHERE a IS NOT NULL;
----
3334

# 3334 * 1337 = 4457558
query III
select sum(a), min(a), max(a) from test;
----
4457558	1337	1337

statement ok
delete from test;

View File

@@ -0,0 +1,83 @@
# name: test/sql/storage/compression/roaring/roaring_analyze_array.test
# description: Check the produced (final_)analyze result
# group: [roaring]

require block_size 262144

require noforcestorage

load __TEST_DIR__/test_roaring.db readwrite v1.2.0

statement ok
set logging_level='info';

# 1 rowgroup
statement ok
set variable dataset_size = 122880;

statement ok
PRAGMA force_compression='uncompressed'

# Logging is only enabled around the CREATE TABLE + checkpoint whose analyze result we want to inspect
statement ok
set enable_logging=true;

statement ok
CREATE TABLE test_uncompressed AS SELECT
	case
		when i%25=0
			then 1337
		else null
	end
	FROM range(getvariable('dataset_size')) tbl(i);

statement ok
checkpoint

statement ok
set enable_logging=false;

# Extract the number that follows ': ' in the FinalAnalyze log message for the VALIDITY column
# 122880 rows / 8 bits per byte = 15360 bytes
query I
SELECT message.split(': ')[2]::INTEGER FROM duckdb_logs
where
	message.starts_with('ColumnDataCheckpointer FinalAnalyze') and
	message.contains('test_uncompressed') and
	message.contains('VALIDITY') and
	message.contains('COMPRESSION_UNCOMPRESSED');
----
15360

statement ok
PRAGMA force_compression='roaring'

statement ok
set enable_logging=true;

statement ok
CREATE TABLE test_roaring AS select * from test_uncompressed;

statement ok
checkpoint

statement ok
set enable_logging=false;

# For single row group
# 60 vectors with 82 non-null values per vector
# Total compressed bytes:
# 2 bits (is_inverted + is_run) + 8 bits (cardinality) = 10 bits per Vector
# 10 * 60 = 600 bits == 75 bytes of metadata per RowGroup
#
# 8 (compressed overhead) + (82 * sizeof(uint8_t)) = 90 bytes per Vector
# 90 * 60 = 5400 bytes of data per RowGroup
# 5475 bytes
# We 2x the actual result, to pay for the slower decompression speed
# NOTE(review): 2 * 5475 = 10950, while the asserted value below is 10944 (= 2 * 5472),
# so the estimate above is approximate - confirm against the analyze implementation
query I
SELECT message.split(': ')[2]::INTEGER FROM duckdb_logs
where
	message.starts_with('ColumnDataCheckpointer FinalAnalyze') and
	message.contains('test_roaring') and
	message.contains('VALIDITY') and
	message.contains('COMPRESSION_ROARING');
----
10944

View File

@@ -0,0 +1,83 @@
# name: test/sql/storage/compression/roaring/roaring_analyze_bitset.test
# description: Check the produced (final_)analyze result
# group: [roaring]

require block_size 262144

require noforcestorage

load __TEST_DIR__/test_roaring.db readwrite v1.2.0

statement ok
set logging_level='info';

# 1 rowgroup
statement ok
set variable dataset_size = 122880;

statement ok
PRAGMA force_compression='uncompressed'

# Logging is only enabled around the CREATE TABLE + checkpoint whose analyze result we want to inspect
statement ok
set enable_logging=true;

statement ok
CREATE TABLE test_uncompressed AS SELECT
	case
		when i%3=0
			then 1337
		else null
	end
	FROM range(getvariable('dataset_size')) tbl(i);

statement ok
checkpoint

statement ok
set enable_logging=false;

# Extract the number that follows ': ' in the FinalAnalyze log message for the VALIDITY column
# 122880 rows / 8 bits per byte = 15360 bytes
query I
SELECT message.split(': ')[2]::INTEGER FROM duckdb_logs
where
	message.starts_with('ColumnDataCheckpointer FinalAnalyze') and
	message.contains('test_uncompressed') and
	message.contains('VALIDITY') and
	message.contains('COMPRESSION_UNCOMPRESSED');
----
15360

statement ok
PRAGMA force_compression='roaring'

statement ok
set enable_logging=true;

statement ok
CREATE TABLE test_roaring AS select * from test_uncompressed;

statement ok
checkpoint

statement ok
set enable_logging=false;

# For single row group
# 60 vectors in which every third value is non-null, each stored as an uncompressed bitset
# Total compressed bytes:
# 2 bits (is_inverted + is_run) = 2 bits per Vector
# 2 * 60 = 120 bits == 15 bytes of metadata per RowGroup
#
# 256 bytes per Vector
# 256 * 60 = 15360 bytes of data per RowGroup
# 15375 bytes
# We 2x the actual result, to pay for the slower decompression speed
# NOTE(review): 2 * 15375 = 30750, while the asserted value below is 30872 (= 2 * 15436),
# so the estimate above is approximate - confirm against the analyze implementation
query I
SELECT message.split(': ')[2]::INTEGER FROM duckdb_logs
where
	message.starts_with('ColumnDataCheckpointer FinalAnalyze') and
	message.contains('test_roaring') and
	message.contains('VALIDITY') and
	message.contains('COMPRESSION_ROARING');
----
30872

View File

@@ -0,0 +1,83 @@
# name: test/sql/storage/compression/roaring/roaring_analyze_run.test
# description: Check the produced (final_)analyze result
# group: [roaring]

require block_size 262144

require noforcestorage

load __TEST_DIR__/test_roaring.db readwrite v1.2.0

statement ok
set logging_level='info';

# 1 rowgroup
statement ok
set variable dataset_size = 122880;

statement ok
PRAGMA force_compression='uncompressed'

# Logging is only enabled around the CREATE TABLE + checkpoint whose analyze result we want to inspect
statement ok
set enable_logging=true;

# AND binds tighter than OR, so a row is NULL when i = 0, when 1 <= i % 512 <= 349, or when i % 512 >= 451
statement ok
CREATE TABLE test_uncompressed AS SELECT
	case
		when i = 0 or (i % 512 != 0 and (i % 512) < 350 or (i % 512) > 450)
			then null
		else 1337
	end
	FROM range(getvariable('dataset_size')) tbl(i);

statement ok
checkpoint

statement ok
set enable_logging=false;

# Extract the number that follows ': ' in the FinalAnalyze log message for the VALIDITY column
# 122880 rows / 8 bits per byte = 15360 bytes
query I
SELECT message.split(': ')[2]::INTEGER FROM duckdb_logs
where
	message.starts_with('ColumnDataCheckpointer FinalAnalyze') and
	message.contains('test_uncompressed') and
	message.contains('VALIDITY') and
	message.contains('COMPRESSION_UNCOMPRESSED');
----
15360

statement ok
PRAGMA force_compression='roaring'

statement ok
set enable_logging=true;

statement ok
CREATE TABLE test_roaring AS select * from test_uncompressed;

statement ok
checkpoint

statement ok
set enable_logging=false;

# For single row group
# 60 vectors with 7 or 8 runs of nulls per vector
# Total compressed bytes:
# 2 bits (is_inverted + is_run) + 7 bits (run_size) = 9 bits per Vector
# 9 * 60 = 540 bits == 67 bytes (67.5, rounded down) of metadata per RowGroup
#
# 8 (compressed overhead) + (8 * sizeof(uint16_t)) = 24 bytes per Vector
# 24 * 60 = 1440 bytes of data per RowGroup
# 1507 bytes
# We 2x the actual result, to pay for the slower decompression speed
# NOTE(review): 2 * 1507 = 3014, while the asserted value below is 3024 (= 2 * 1512),
# so the estimate above is approximate - confirm against the analyze implementation
query I
SELECT message.split(': ')[2]::INTEGER FROM duckdb_logs
where
	message.starts_with('ColumnDataCheckpointer FinalAnalyze') and
	message.contains('test_roaring') and
	message.contains('VALIDITY') and
	message.contains('COMPRESSION_ROARING');
----
3024

View File

@@ -0,0 +1,39 @@
# name: test/sql/storage/compression/roaring/roaring_appends.test_slow
# description: Repeatedly append to and checkpoint a roaring-compressed column, for several batch sizes
# group: [roaring]

require block_size 262144

load __TEST_DIR__/test_roaring_appends.db readwrite v1.2.0

statement ok
PRAGMA force_compression='roaring';

statement ok
set checkpoint_threshold = '100mb';

statement ok
CREATE TABLE test (a BIGINT);

foreach size 50 100 250 1025 1500

# Start every batch size from an empty table
statement ok
delete from test;

loop i 1 30

statement ok
INSERT INTO test SELECT case when i%25=0 then 1337 else null end FROM range(0,${size}) tbl(i);

statement ok
checkpoint

# Every size is a multiple of 25, so each insert adds exactly size / 25 non-null rows;
# after i inserts the table must hold (size / 25) * i of them
query I
select count(*) = (${size} / 25) * ${i} from test WHERE a IS NOT NULL;
----
true

# end of loop i
endloop

# end of foreach size
endloop

View File

@@ -0,0 +1,67 @@
# name: test/sql/storage/compression/roaring/roaring_array_simple.test
# description: Test roaring compression of a validity mask with few non-null values (array containers)
# group: [roaring]

load __TEST_DIR__/test_roaring.db readwrite v1.2.0

statement ok
PRAGMA force_compression='roaring'

# simple compression with few values
statement ok
CREATE TABLE test (a BIGINT);

# 82 values stored in the Array Container
statement ok
INSERT INTO test SELECT case when i%25=0 then 1337 else null end FROM range(0,10000) tbl(i);

statement ok
checkpoint

# Lists VALIDITY segments that are NOT Roaring-compressed; the expected result is empty
query I
SELECT compression FROM pragma_storage_info('test') WHERE segment_type ILIKE 'VALIDITY' and compression != 'Roaring';
----

query I
select count(*) from test WHERE a IS NOT NULL;
----
400

# 400 * 1337 = 534800
query III
select sum(a), min(a), max(a) from test;
----
534800	1337	1337

statement ok
delete from test;

# 5 non-null values per Vector, uses uncompressed arrays
# range(0, 10000) covers 4 full 2048-row vectors (5 non-null values each) plus a partial
# vector of 1808 rows (positions 0, 6, 1000 and 1500 only): 4 * 5 + 4 = 24 non-null rows
statement ok
with intermediates as (
	select i % 2048 as i
	from range(0, 10_000) t(i)
)
insert into test select case when
	i = 0 or
	i = 6 or
	i = 1000 or
	i = 1500 or
	i = 2000
	then 1337
	else null end from intermediates;

statement ok
checkpoint;

query I
select count(*) from test WHERE a IS NOT NULL;
----
24

# 24 * 1337 = 32088
query III
select sum(a), min(a), max(a) from test;
----
32088	1337	1337

statement ok
DROP TABLE test;

View File

@@ -0,0 +1,35 @@
# name: test/sql/storage/compression/roaring/roaring_bitset_simple.test
# description: Test roaring compression of a validity mask where every third value is non-null (bitset containers).
# group: [roaring]

load __TEST_DIR__/test_roaring.db readwrite v1.2.0

statement ok
PRAGMA force_compression='roaring'

statement ok
CREATE TABLE test (a BIGINT);

# Every 3rd row is non-null: 3334 of the 10000 rows.
statement ok
INSERT INTO test SELECT CASE WHEN i % 3 = 0 THEN 1337 ELSE NULL END FROM range(0, 10000) tbl(i);

statement ok
CHECKPOINT;

# Lists VALIDITY segments that are NOT Roaring-compressed; the expected result is empty.
query I
SELECT compression FROM pragma_storage_info('test') WHERE segment_type ILIKE 'VALIDITY' AND compression != 'Roaring';
----

query I
SELECT COUNT(*) FROM test WHERE a IS NOT NULL;
----
3334

# 3334 * 1337 = 4457558
query III
SELECT SUM(a), MIN(a), MAX(a) FROM test;
----
4457558	1337	1337

statement ok
DROP TABLE test;

View File

@@ -0,0 +1,220 @@
# name: test/sql/storage/compression/roaring/roaring_compression_ratio.test_slow
# description: Assert roaring compression ratio is within reasonable margins for each container type
# group: [roaring]

require block_size 262144

load __TEST_DIR__/test_roaring.db readwrite v1.2.0

statement ok
set variable dataset_size = 120_000_000;

#### Array container Roaring Compression ratio calculation:
# For single row group
# 60 vectors with 82 non-null values per vector
# Total compressed bytes:
# metadata: (64 / (8 / 2 (bitwidth))) + 60 = 76
# data: (8 + (82 * 1)) * 60 = 5400
# 5476 bytes
# Total uncompressed bytes = (60 * 256) = 15360 bytes
# Expected Ratio ~= 2.8x

statement ok
PRAGMA force_compression='roaring'

statement ok
CREATE TABLE test_roaring AS SELECT case when i%25=0 then 1337 else null end FROM range(getvariable('dataset_size')) tbl(i);

statement ok
checkpoint

statement ok
PRAGMA force_compression='uncompressed'

statement ok
CREATE TABLE test_uncompressed AS SELECT case when i%25=0 then 1337 else null end FROM range(getvariable('dataset_size')) tbl(i);

statement ok
checkpoint

# Both queries list VALIDITY segments with an unexpected compression; the expected results are empty
query I
SELECT compression FROM pragma_storage_info('test_roaring') WHERE segment_type = 'VALIDITY' AND compression != 'Roaring';
----

query I
SELECT compression FROM pragma_storage_info('test_uncompressed') WHERE segment_type = 'VALIDITY' AND compression != 'Uncompressed';
----

# Result type of the ratio checks in this file: 'ok' when the ratio is within bounds,
# otherwise 'err' carrying the measured numbers so that a failure is self-explanatory
statement ok
CREATE TYPE test_result AS UNION (
	ok BOOL,
	err STRUCT(
		uncompressed HUGEINT,
		compressed HUGEINT,
		allowed_minimum_ratio DECIMAL(2,1),
		allowed_maximum_ratio DECIMAL(2,1),
		actual_ratio FLOAT
	)
);

statement ok
set variable min_ratio = 2.6;
set variable max_ratio = 2.8;

# The ratio is measured in distinct blocks holding VALIDITY segments (uncompressed / compressed)
# and must lie in the half-open interval (min_ratio, max_ratio]
query I
SELECT
	CASE
		WHEN (uncompressed::FLOAT / compressed::FLOAT) > getvariable('min_ratio') AND (uncompressed::FLOAT / compressed::FLOAT) <= getvariable('max_ratio')
			THEN True::test_result
		ELSE {
			'uncompressed': uncompressed,
			'compressed': compressed,
			'allowed_minimum_ratio': getvariable('min_ratio'),
			'allowed_maximum_ratio': getvariable('max_ratio'),
			'actual_ratio': uncompressed::FLOAT / compressed::FLOAT
		}::test_result
	END
FROM (
	select
		(select count(distinct block_id) from pragma_storage_info('test_roaring') where segment_type in ('VALIDITY')) as compressed,
		(select count(distinct block_id) from pragma_storage_info('test_uncompressed') where segment_type in ('VALIDITY')) as uncompressed
) AS blocks_tbl;
----
true

statement ok
drop table test_roaring;
drop table test_uncompressed;
#### Run container Roaring Compression ratio calculation:
# For single row group
# 60 vectors with 7/8 runs of nulls per vector
# Total compressed bytes:
# metadata: (64 / (8 / 2 (bitwidth))) + ((64 * 7) / 8) = 72
# data: (8 + (8 * 2)) * 60 = 1440
# 1512 bytes
# Total uncompressed bytes = (60 * 256) = 15360 bytes
# Expected Ratio ~= 10.16x (15360 / 1512)

statement ok
PRAGMA force_compression='roaring'

# AND binds tighter than OR, so a row is NULL when i = 0, when 1 <= i % 512 <= 349, or when i % 512 >= 451
statement ok
CREATE TABLE test_roaring AS SELECT case when i = 0 or (i % 512 != 0 and (i % 512) < 350 or (i % 512) > 450) then null else 1337 end FROM range(0, getvariable('dataset_size')) tbl(i);

statement ok
checkpoint

statement ok
PRAGMA force_compression='uncompressed'

statement ok
CREATE TABLE test_uncompressed AS SELECT case when i = 0 or (i % 512 != 0 and (i % 512) < 350 or (i % 512) > 450) then null else 1337 end FROM range(0, getvariable('dataset_size')) tbl(i);

statement ok
checkpoint

# Both queries list VALIDITY segments with an unexpected compression; the expected results are empty
query I
SELECT compression FROM pragma_storage_info('test_roaring') WHERE segment_type = 'VALIDITY' AND compression != 'Roaring';
----

query I
SELECT compression FROM pragma_storage_info('test_uncompressed') WHERE segment_type = 'VALIDITY' AND compression != 'Uncompressed';
----

statement ok
checkpoint

# Hmm, this doesn't actually match the result from the back-of-the-napkin calculation:
# the asserted range below is (8.6, 8.8], not ~10.16
statement ok
set variable min_ratio = 8.6;
set variable max_ratio = 8.8;

# The ratio is measured in distinct blocks holding VALIDITY segments (uncompressed / compressed)
# and must lie in the half-open interval (min_ratio, max_ratio]
query I
SELECT
	CASE
		WHEN (uncompressed::FLOAT / compressed::FLOAT) > getvariable('min_ratio') AND (uncompressed::FLOAT / compressed::FLOAT) <= getvariable('max_ratio')
			THEN True::test_result
		ELSE {
			'uncompressed': uncompressed,
			'compressed': compressed,
			'allowed_minimum_ratio': getvariable('min_ratio'),
			'allowed_maximum_ratio': getvariable('max_ratio'),
			'actual_ratio': uncompressed::FLOAT / compressed::FLOAT
		}::test_result
	END
FROM (
	select
		(select count(distinct block_id) from pragma_storage_info('test_roaring') where segment_type in ('VALIDITY')) as compressed,
		(select count(distinct block_id) from pragma_storage_info('test_uncompressed') where segment_type in ('VALIDITY')) as uncompressed
) AS blocks_tbl;
----
true

statement ok
drop table test_roaring;
drop table test_uncompressed;
#### Bitset container Roaring Compression ratio calculation:
# For single row group
# 60 vectors stored uncompressed, + metadata
# Total compressed bytes = (60 * 2 (metadata)) + (60 * 256) = 15480 bytes
# Total uncompressed bytes = (60 * 256) = 15360 bytes
# Expected Ratio ~= 0.99x (15360 / 15480): bitset containers do not save any space

statement ok
PRAGMA force_compression='roaring'

statement ok
CREATE TABLE test_roaring AS SELECT case when i%3=0 then 1337 else null end FROM range(getvariable('dataset_size')) tbl(i);

statement ok
checkpoint

statement ok
PRAGMA force_compression='uncompressed'

statement ok
CREATE TABLE test_uncompressed AS SELECT case when i%3=0 then 1337 else null end FROM range(getvariable('dataset_size')) tbl(i);

statement ok
checkpoint

# Both queries list VALIDITY segments with an unexpected compression; the expected results are empty
query I
SELECT compression FROM pragma_storage_info('test_roaring') WHERE segment_type = 'VALIDITY' AND compression != 'Roaring';
----

query I
SELECT compression FROM pragma_storage_info('test_uncompressed') WHERE segment_type = 'VALIDITY' AND compression != 'Uncompressed';
----

statement ok
checkpoint

statement ok
set variable min_ratio = 0.9;
set variable max_ratio = 1;

# The ratio is measured in distinct blocks holding VALIDITY segments (uncompressed / compressed)
# and must lie in the half-open interval (min_ratio, max_ratio]
query I
SELECT
	CASE
		WHEN (uncompressed::FLOAT / compressed::FLOAT) > getvariable('min_ratio') AND (uncompressed::FLOAT / compressed::FLOAT) <= getvariable('max_ratio')
			THEN True::test_result
		ELSE {
			'uncompressed': uncompressed,
			'compressed': compressed,
			'allowed_minimum_ratio': getvariable('min_ratio'),
			'allowed_maximum_ratio': getvariable('max_ratio'),
			'actual_ratio': uncompressed::FLOAT / compressed::FLOAT
		}::test_result
	END
FROM (
	select
		(select count(distinct block_id) from pragma_storage_info('test_roaring') where segment_type in ('VALIDITY')) as compressed,
		(select count(distinct block_id) from pragma_storage_info('test_uncompressed') where segment_type in ('VALIDITY')) as uncompressed
) AS blocks_tbl;
----
true

statement ok
drop table test_roaring;
drop table test_uncompressed;

View File

@@ -0,0 +1,61 @@
# name: test/sql/storage/compression/roaring/roaring_inverted_array_simple.test
# description: Test roaring compression of a validity mask with few NULL values (inverted array containers)
# group: [roaring]

load __TEST_DIR__/test_roaring.db readwrite v1.2.0

statement ok
PRAGMA force_compression='roaring'

statement ok
CREATE TABLE test (a BIGINT);

# simple compression with few NULLs: every 25th row is NULL, 9600 of the 10000 rows are non-null
statement ok
INSERT INTO test SELECT case when i%25=0 then null else 1337 end FROM range(0,10000) tbl(i);

statement ok
checkpoint

# Lists VALIDITY segments that are NOT Roaring-compressed; the expected result is empty
query I
SELECT compression FROM pragma_storage_info('test') WHERE segment_type ILIKE 'VALIDITY' and compression != 'Roaring';
----

# 9600 * 1337 = 12835200
query III
select sum(a), min(a), max(a) from test;
----
12835200	1337	1337

statement ok
delete from test;

# 5 null values per Vector, uses inverted uncompressed arrays
# range(0, 10000) covers 4 full 2048-row vectors (5 NULLs each) plus a partial vector of
# 1808 rows (positions 0, 6, 1000 and 1500 only): 4 * 5 + 4 = 24 NULLs, 9976 non-null rows
statement ok
with intermediates as (
	select i % 2048 as i
	from range(0, 10_000) t(i)
)
insert into test select case when
	i = 0 or
	i = 6 or
	i = 1000 or
	i = 1500 or
	i = 2000
	then null
	else 1337 end from intermediates;

statement ok
checkpoint;

query I
select count(*) from test WHERE a IS NOT NULL;
----
9976

# 9976 * 1337 = 13337912
query III
select sum(a), min(a), max(a) from test;
----
13337912	1337	1337

statement ok
DROP TABLE test;

View File

@@ -0,0 +1,36 @@
# name: test/sql/storage/compression/roaring/roaring_inverted_run_simple.test
# description: Test roaring compression of a validity mask made of runs of non-null values
# group: [roaring]

load __TEST_DIR__/test_roaring.db readwrite v1.2.0

statement ok
PRAGMA force_compression='roaring'

statement ok
CREATE TABLE test (a BIGINT);

# Runs can't be inverted, this would be better off being inverted but the space saving is only 1 run at most.
# AND binds tighter than OR, so a row is non-null when i = 0, when 1 <= i % 512 <= 349, or when i % 512 >= 451
statement ok
INSERT INTO test SELECT case when i = 0 or (i % 512 != 0 and (i % 512) < 350 or (i % 512) > 450) then 1337 else null end FROM range(0,10000) tbl(i);

statement ok
checkpoint

# Lists VALIDITY segments that are NOT Roaring-compressed; the expected result is empty
query I
SELECT compression FROM pragma_storage_info('test') WHERE segment_type ILIKE 'VALIDITY' and compression != 'Roaring';
----

query I
select count(*) from test WHERE a IS NOT NULL;
----
8062

# 8062 * 1337 = 10778894
query III
select sum(a), min(a), max(a) from test;
----
10778894	1337	1337

statement ok
DROP TABLE test;

View File

@@ -0,0 +1,72 @@
# name: test/sql/storage/compression/roaring/roaring_run_simple.test
# description: Test roaring compression of a validity mask made of runs of NULLs (run containers)
# group: [roaring]

load __TEST_DIR__/test_roaring.db readwrite v1.2.0

statement ok
PRAGMA force_compression='roaring'

statement ok
CREATE TABLE test (a BIGINT);

# 8 runs per Vector
# AND binds tighter than OR, so a row is NULL when i = 0, when 1 <= i % 512 <= 349, or when i % 512 >= 451
# runs:
# (0,350)
# (451,512)
# (513,862)
# (963,1024)
# (1025,1374)
# (1475,1536)
# (1537,1886)
# (1987,2048)
statement ok
INSERT INTO test SELECT case when i = 0 or (i % 512 != 0 and (i % 512) < 350 or (i % 512) > 450) then null else 1337 end FROM range(0,10000) tbl(i);

statement ok
checkpoint

# Lists VALIDITY segments that are NOT Roaring-compressed; the expected result is empty
query I
SELECT compression FROM pragma_storage_info('test') WHERE segment_type ILIKE 'VALIDITY' and compression != 'Roaring';
----

query I
select count(*) from test WHERE a IS NOT NULL;
----
1938

# 1938 * 1337 = 2591106
query III
select sum(a), min(a), max(a) from test;
----
2591106	1337	1337

statement ok
delete from test;

# 3 runs per Vector (uses uncompressed runs)
# (0,110)
# (1500,1800)
# (2000,2048)
statement ok
with intermediates as (
	select i % 2048 as i
	from range(0, 10_000) t(i)
)
INSERT INTO test SELECT case when (i >= 0 and i < 110) or (i >= 1500 and i < 1800) or (i >= 2000) then null else 1337 end FROM intermediates;

statement ok
checkpoint;

# 4 full 2048-row vectors with 1590 non-null rows each, plus a partial vector of 1808 rows
# with 1398 non-null rows: 4 * 1590 + 1398 = 7758
query I
select count(*) from test WHERE a IS NOT NULL;
----
7758

# 7758 * 1337 = 10372446
query III
select sum(a), min(a), max(a) from test;
----
10372446	1337	1337

statement ok
DROP TABLE test;

View File

@@ -0,0 +1,39 @@
# name: test/sql/storage/compression/roaring/roaring_smaller_than_vector.test
# description: Checkpoint a roaring-compressed column after appends of 1025 rows each
# group: [roaring]

load __TEST_DIR__/test_roaring2.db readwrite v1.2.0

statement ok
PRAGMA force_compression='roaring';

statement ok
set checkpoint_threshold = '10mb';

# simple compression with few values
statement ok
CREATE TABLE test (a BIGINT);

# 1025 rows with every 25th row non-null: 41 non-null rows (i = 0, 25, ..., 1000)
statement ok
INSERT INTO test SELECT case when i%25=0 then 1337 else null end FROM range(0,1025) tbl(i);

statement ok
checkpoint

query I
select count(*) from test WHERE a IS NOT NULL;
----
41

# Append the same 1025 rows again and checkpoint a second time: 2 * 41 = 82 non-null rows
statement ok
INSERT INTO test SELECT case when i%25=0 then 1337 else null end FROM range(0,1025) tbl(i);

statement ok
checkpoint;

query I
select count(*) from test WHERE a IS NOT NULL;
----
82

statement ok
DROP TABLE test;