# email-tracker/external/duckdb/test/optimizer/compressed_materialization.test_slow

# name: test/optimizer/compressed_materialization.test_slow
# description: Compressed materialization test
# group: [optimizer]

# turn on query verification for the statements that follow (it is switched off again further down)
statement ok
pragma enable_verification

# restrict EXPLAIN to the optimized logical plan; the plan checks below match on the 'logical_opt' row
statement ok
PRAGMA explain_output = OPTIMIZED_ONLY

# these functions live in the catalog, but cannot be called directly:
# calling one from SQL has to be rejected with the binder error below
statement error
select __internal_compress_string_utinyint('L')
----
Binder Error: Compressed materialization functions are for internal use only!

# internal issue 1576
# t0 holds 500000 rows whose values wrap around at 400000, so the values 0..99999 occur twice in both columns
statement ok
create table t0 as select range%400000 a, range%400000 b from range(500000);

# sorted on a, the rows read 0,0,1,1,2,2,3,3,... so the first rows with b > 2 are the two rows with a = b = 3;
# the expected output places them at row numbers 7 and 8
# rowsort keeps the comparison independent of the order in which these two tied rows are returned
query III rowsort
select * from (
      select *, row_number() OVER () as row_number from (
          SELECT * FROM t0 ORDER BY 1) ta
      ) tb where b > 2
  order by a limit 2;
----
3	3	7
3	3	8

# tricky tests taken from test/sql/subquery/scalar/test_issue_6136.test
# we run these with one thread since they are order dependent
# r is the table read inside the correlated subqueries; rb is NULL in two of its rows,
# so the "rb is null" branch of the filter is exercised
statement ok
create table r as select * from values (1, 1, 'a', 'A'), (1, null, 'b', 'B'), (1, 2, 'c', 'C'), (2, null, 'd', 'D') t(ra, rb, x, y);

# b drives the outer query; ids 4 and 6 have a NULL bb, and (7, 99, 99) has no matching ra in r
statement ok
create table b as select * from values (1, 1, 1), (2, 1, 2), (3, 1, 3), (4, 1, null), (5, 2, 1), (6, 2, null), (7, 99, 99) t(id, ba, bb);

statement ok
set threads=1

# for every row of b: aggregate the matching r rows per (ra, rb) and keep the first aggregate under "order by all"
# the b row without any match produces NULL, which the expected output lists last
query T
select (
    select {'x': first(x order by x), 'y': first(y order by y), '__matches': count(*)}
    from (
        select *
        from r
        where ba = ra
          and (bb = rb or rb is null)
        order by all
    )
    group by ra, rb
    order by all
    limit 1)
from b
order by all
----
{'x': a, 'y': A, '__matches': 1}
{'x': b, 'y': B, '__matches': 1}
{'x': b, 'y': B, '__matches': 1}
{'x': b, 'y': B, '__matches': 1}
{'x': d, 'y': D, '__matches': 1}
{'x': d, 'y': D, '__matches': 1}
NULL

# same correlation, but the candidate groups are ranked by "bb = rb" and a missing match is replaced by a default struct
# there is no outer ORDER BY: the expected rows are listed in the insertion order of b (id 1 through 7)
query T
select
  coalesce((select {'x': first(x), 'y': first(y), '__matches': count(*)} from r where ba = ra and (bb = rb or rb is null) group by ra, rb order by bb = rb limit 1), {'x': null, 'y': null, '__matches': 0}) as ref2
from b
----
{'x': a, 'y': A, '__matches': 1}
{'x': c, 'y': C, '__matches': 1}
{'x': b, 'y': B, '__matches': 1}
{'x': b, 'y': B, '__matches': 1}
{'x': d, 'y': D, '__matches': 1}
{'x': d, 'y': D, '__matches': 1}
{'x': NULL, 'y': NULL, '__matches': 0}

# the order-dependent queries are done: go back to four threads for the remaining tests
statement ok
set threads=4

# t1 holds the integers 0..9
statement ok
create table t1 as select range i from range(10)

# we should see compress twice (in the ORDER BY expression and payload) and decompress once (just the payload)
# the pattern expects the decompress to appear before the compresses in the rendered plan text
# NOTE(review): "(.*x.*){n}" is also satisfied by more than n occurrences of x, because ".*" can absorb the extras,
# so the patterns in this section only enforce a lower bound on the number of compress calls - confirm this is intended
query II
explain select i from t1 order by 10-i
----
logical_opt	<REGEX>:(.*__internal_decompress.*){1}(.*__internal_compress.*){2}

# i takes values in 0..3 and j takes values in 0..10
statement ok
create table test as
select (range + 7) % 4 i,
       (range + 7) % 11 j
from range(10)

# should see compress exactly twice (for columns i and j)
# if we see less than twice we're not compressing,
# and if we see it more than twice we're likely compressing and decompressing twice (once for each ORDER BY)
# but we can compress once, then do both ORDER BYs, then decompress
query II
explain select count(i), count(j) from (select i, j from (select i, j from test order by j offset 1) order by j offset 1)
----
logical_opt	<REGEX>:(.*__internal_compress.*){2}

# should see it exactly once here, as we can only compress the group (i), not the value being summed (j)
# after the GROUP BY we do the ORDER BY, and finally decompress
query II
explain select i, sum(j) from test group by i order by i
----
logical_opt	<REGEX>:(.*__internal_compress.*){1}

# We can't deal with duplicate projections (yet) so this should see 3 compresses instead of 1
# (j is projected twice, as j1 and j2, between the two ORDER BYs)
query II
explain select count(j1), count(j2) from (select j j1, j j2 from (select j from test order by j offset 1) order by j1, j2 offset 1)
----
logical_opt	<REGEX>:(.*__internal_compress.*){3}

# DISTINCT followed by an ORDER BY on the same two columns: the pattern asks for two compresses
query II
explain select distinct i, j from test order by i, j
----
logical_opt	<REGEX>:(.*__internal_compress.*){2}

# taken from third_party/sqllogictest/test/index/orderby_nosort/10/slt_good_27.test
# the problem was that statistics propagation created an index join after "filter_prune" happened
statement ok
CREATE TABLE tab3(pk INTEGER PRIMARY KEY, col0 INTEGER, col1 FLOAT, col2 TEXT, col3 INTEGER, col4 FLOAT, col5 TEXT)

# ten rows; values are listed in the column order of the CREATE TABLE above
statement ok
INSERT INTO tab3 VALUES
    (0,461,479.93,'idmdh',456,464.90,'nczyk'),
    (1,473,482.60,'bguxh',460,466.25,'oseln'),
    (2,474,484.45,'bnzmd',461,467.13,'kvwna'),
    (3,475,485.1,'obtlj',462,468.73,'jkjbo'),
    (4,477,486.62,'gjtbr',463,469.9,'bhers'),
    (5,479,489.59,'bkxfm',464,470.29,'aklru'),
    (6,481,495.30,'owirt',466,471.55,'lysig'),
    (7,482,496.31,'yergm',467,473.31,'rkpxn'),
    (8,484,497.51,'fszui',468,474.44,'ztexm'),
    (9,486,498.24,'eueji',469,477.28,'amvcc')

# unique index on col3, the column produced by the IN subquery below
statement ok
CREATE UNIQUE INDEX idx_tab3_4 ON tab3 (col3)

# every row passes col1 > 93.79, and 461 is the only col0 value that also occurs in col3, so only pk 0 qualifies
query I
SELECT pk FROM tab3 WHERE col0 IN (SELECT col3 FROM tab3 WHERE (col1 > 93.79)) ORDER BY 1 DESC
----
0

# test that we compress all-NULL (from multiple Parquet files) to utinyint too (if union_by_name is true)
require parquet

# switch query verification off again for the statements that follow
# NOTE(review): the reason is not stated here - presumably verification interferes with the stats/plan checks below; confirm
statement ok
pragma disable_verification

# one column without NULL, and two columns (varchar and bigint) that are all NULL
# the two files cover the ranges 0..99 and 100..199 and share the same three columns
statement ok
copy (select hash(range + 1) i, null::varchar j, null::bigint k from range(100)) to '__TEST_DIR__/cm1.parquet'

statement ok
copy (select hash(range + 1) i, null::varchar j, null::bigint k from range(100,200)) to '__TEST_DIR__/cm2.parquet'

# has NULL, and does not have non-NULL
# (checked for both j and k when the two files are read together through the glob)
query II
select
    stats(j) LIKE '%[Has Null: true, Has No Null: false]%',
    stats(k) LIKE '%[Has Null: true, Has No Null: false]%'
from read_parquet('__TEST_DIR__/cm*.parquet', union_by_name=true) limit 1
----
true	true

# this should lead to a plan where both all-NULL columns (varchar j and bigint k) are compressed
# EXPLAIN now reports the physical plan, so the pattern below matches on the 'physical_plan' row
statement ok
PRAGMA explain_output = PHYSICAL_ONLY

# the pattern asks for two decompress calls followed, later in the plan text, by two compress calls
query II
explain select * from read_parquet('__TEST_DIR__/cm*.parquet', union_by_name=true) order by i
----
physical_plan	<REGEX>:.*__internal_decompress.*__internal_decompress.*__internal_compress.*__internal_compress.*

# and of course some tpch stuff

require tpch

# generate the TPC-H tables at scale factor 0.01
statement ok
call dbgen(sf=0.01)

# the Q1 check below matches on the 'physical_plan' row
statement ok
PRAGMA explain_output = PHYSICAL_ONLY

# tpch q1 should use perfect hash aggregate
query II
EXPLAIN
SELECT
    l_returnflag,
    l_linestatus,
    sum(l_quantity) AS sum_qty,
    sum(l_extendedprice) AS sum_base_price,
    sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    avg(l_quantity) AS avg_qty,
    avg(l_extendedprice) AS avg_price,
    avg(l_discount) AS avg_disc,
    count(*) AS count_order
FROM
    lineitem
WHERE
    l_shipdate <= CAST('1998-09-02' AS date)
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
----
physical_plan	<REGEX>:.*PERFECT_HASH_GROUP_BY.*

# back to the optimized logical plan for the remaining plan check
statement ok
PRAGMA explain_output = OPTIMIZED_ONLY

# test that we're compressing lineitem
query II
explain select * from lineitem order by l_shipdate
----
logical_opt	<REGEX>:.*__internal_decompress.*__internal_compress.*

# test that we get the same result with and without compressed materialization
# both runs are stored/compared under the label q0 without re-sorting, so the row order has to be fully determined:
# l_shipdate alone is not unique, hence the lineitem key (l_orderkey, l_linenumber) is added as a tie-breaker
query IIIIIIIIIIIIIII nosort q0
select * from lineitem order by l_shipdate, l_orderkey, l_linenumber
----

# second run: identical query with the optimizer under test switched off
statement ok
set disabled_optimizers to 'compressed_materialization'

query IIIIIIIIIIIIIII nosort q0
select * from lineitem order by l_shipdate, l_orderkey, l_linenumber
----