245 lines
11 KiB
C++
245 lines
11 KiB
C++
#include "catch.hpp"
|
|
|
|
#include "arrow/arrow_test_helper.hpp"
|
|
|
|
using namespace duckdb;
|
|
|
|
//! Runs `query` on a default-constructed DuckDB instance and requires that
//! ArrowTestHelper::RunArrowComparison succeeds for it twice: first with its third argument set to
//! true, then with it set to false.
//! \param query SQL text handed unchanged to the comparison helper
//! \param export_large_buffer when true, "SET arrow_large_buffer_size=True" is issued first
//! \param loseless_conversion when true, "SET arrow_lossless_conversion = true" is issued first
static void TestArrowRoundtrip(const string &query, bool export_large_buffer = false,
                               bool loseless_conversion = false) {
	DuckDB db;
	Connection con(db);

	// Optional settings are applied in a fixed order; each SET has to succeed before moving on
	if (export_large_buffer) {
		REQUIRE(!con.Query("SET arrow_large_buffer_size=True")->HasError());
	}
	if (loseless_conversion) {
		REQUIRE(!con.Query("SET arrow_lossless_conversion = true")->HasError());
	}

	// Third argument of RunArrowComparison: true on the first pass, false on the second
	const bool comparison_flags[] = {true, false};
	for (const bool flag : comparison_flags) {
		REQUIRE(ArrowTestHelper::RunArrowComparison(con, query, flag));
	}
}
|
|
|
|
//! Runs `query` on a default-constructed DuckDB instance with "produce_arrow_string_view" switched
//! on, and requires that ArrowTestHelper::RunArrowComparison succeeds (third argument false).
//! \param query SQL text handed unchanged to the comparison helper
static void TestArrowRoundtripStringView(const string &query) {
	DuckDB db;
	Connection con(db);

	// The setting must be accepted before the comparison is meaningful
	REQUIRE(!con.Query("SET produce_arrow_string_view=True")->HasError());

	REQUIRE(ArrowTestHelper::RunArrowComparison(con, query, false));
}
|
|
|
|
static void TestParquetRoundtrip(const string &path) {
|
|
DBConfig config;
|
|
// This needs to be set since this test will be triggered when testing autoloading
|
|
config.options.allow_unsigned_extensions = true;
|
|
|
|
DuckDB db(nullptr, &config);
|
|
Connection con(db);
|
|
|
|
// run the query
|
|
auto query = "SELECT * FROM parquet_scan('" + path + "')";
|
|
REQUIRE(ArrowTestHelper::RunArrowComparison(con, query, true));
|
|
REQUIRE(ArrowTestHelper::RunArrowComparison(con, query));
|
|
}
|
|
|
|
//! Round-trips 10000-row VARCHAR, BLOB and UUID results, first with the regular buffer size and
//! then with the large buffer size. The UUID query additionally enables lossless conversion.
TEST_CASE("Test Export Large", "[arrow]") {
	// false = regular buffer size (first pass), true = large buffer size (second pass)
	const bool buffer_modes[] = {false, true};
	for (const bool use_large_buffer : buffer_modes) {
		TestArrowRoundtrip("SELECT 'bla' FROM range(10000)", use_large_buffer);

		TestArrowRoundtrip("SELECT 'bla'::BLOB FROM range(10000)", use_large_buffer);

		TestArrowRoundtrip("SELECT '3d038406-6275-4aae-bec1-1235ccdeaade'::UUID FROM range(10000) tbl(i)",
		                   use_large_buffer, true);
	}
}
|
|
|
|
//! Round-trips a series of queries through TestArrowRoundtrip with default settings, then (unless
//! the build uses a vector size below 64) a test_all_types() projection with lossless conversion.
TEST_CASE("Test arrow roundtrip", "[arrow]") {
	// Executed one by one, in this order, with the default TestArrowRoundtrip arguments
	const char *const default_setting_queries[] = {
	    "SELECT * FROM range(10000) tbl(i) UNION ALL SELECT NULL",
	    "SELECT m from (select MAP(list_value(1), list_value(2)) from range(5) tbl(i)) tbl(m)",
	    "SELECT * FROM range(10) tbl(i)",
	    "SELECT case when i%2=0 then null else i end i FROM range(10) tbl(i)",
	    "SELECT case when i%2=0 then true else false end b FROM range(10) tbl(i)",
	    "SELECT case when i%2=0 then i%4=0 else null end b FROM range(10) tbl(i)",
	    "SELECT 'thisisalongstring'||i::varchar str FROM range(10) tbl(i)",
	    "SELECT case when i%2=0 then null else 'thisisalongstring'||i::varchar end str FROM range(10) tbl(i)",
	    "SELECT {'i': i, 'b': 10-i} str FROM range(10) tbl(i)",
	    "SELECT case when i%2=0 then {'i': case when i%4=0 then null else i end, 'b': 10-i} else null "
	    "end str FROM range(10) tbl(i)",
	    "SELECT [i, i+1, i+2] FROM range(10) tbl(i)",
	    "SELECT MAP(LIST_VALUE({'i':1,'j':2},{'i':3,'j':4}),LIST_VALUE({'i':1,'j':2},{'i':3,'j':4})) as a",
	    "SELECT MAP(LIST_VALUE({'i':i,'j':i+2},{'i':3,'j':NULL}),LIST_VALUE({'i':i+10,'j':2},{'i':i+4,'j':4})) as a "
	    "FROM range(10) tbl(i)",
	    "SELECT MAP(['hello', 'world'||i::VARCHAR],[i + 1, NULL]) as a FROM range(10) tbl(i)",
	    "SELECT (1.5 + i)::DECIMAL(4,2) dec4, (1.5 + i)::DECIMAL(9,3) dec9, (1.5 + i)::DECIMAL(18,3) "
	    "dec18, (1.5 + i)::DECIMAL(38,3) dec38 FROM range(10) tbl(i)",
	    "SELECT case when i%2=0 then null else INTERVAL (i) seconds end AS interval FROM range(10) tbl(i)",
	};
	for (const char *sql : default_setting_queries) {
		TestArrowRoundtrip(sql);
	}

#if STANDARD_VECTOR_SIZE < 64
	// FIXME: there seems to be a bug in the enum arrow reader in this test when run with vsize=2
	return;
#endif

	// All test types except bit, time_tz and bignum; interval is replaced by a constant and the
	// (u)hugeint columns are cast to DOUBLE. Runs with regular buffers and lossless conversion.
	const char *const all_types_query =
	    "SELECT * EXCLUDE(bit,time_tz, bignum) REPLACE "
	    "(interval (1) seconds AS interval, hugeint::DOUBLE as hugeint, uhugeint::DOUBLE as uhugeint) "
	    "FROM test_all_types()";
	TestArrowRoundtrip(all_types_query, false, true);
}
|
|
|
|
//! Round-trips one value of each listed type with lossless conversion enabled; BIGNUM is
//! additionally exercised with the large buffer size.
TEST_CASE("Test Arrow Extension Types", "[arrow][.]") {
	// Each entry runs with export_large_buffer = false and loseless_conversion = true
	const char *const lossless_queries[] = {
	    // UUID
	    "SELECT '2d89ebe6-1e13-47e5-803a-b81c87660b66'::UUID str FROM range(5) tbl(i)",
	    // HUGEINT
	    "SELECT '170141183460469231731687303715884105727'::HUGEINT str FROM range(5) tbl(i)",
	    // UHUGEINT
	    "SELECT '170141183460469231731687303715884105727'::UHUGEINT str FROM range(5) tbl(i)",
	    // BIT
	    "SELECT '0101011'::BIT str FROM range(5) tbl(i)",
	    // TIME_TZ
	    "SELECT '02:30:00+04'::TIMETZ str FROM range(5) tbl(i)",
	    // BIGNUM
	    "SELECT 85070591730234614260976917445211069672::BIGNUM str FROM range(5) tbl(i)",
	};
	for (const char *sql : lossless_queries) {
		TestArrowRoundtrip(sql, false, true);
	}

	// BIGNUM once more, this time with the large buffer size as well
	TestArrowRoundtrip("SELECT 85070591730234614260976917445211069672::BIGNUM str FROM range(5) tbl(i)", true, true);
}
|
|
|
|
//! Round-trips a JSON value with lossless conversion enabled, provided the json extension is
//! reported as loaded by a locally created database instance.
TEST_CASE("Test Arrow Extension Types - JSON", "[arrow][.]") {
	DBConfig config;
	DuckDB db(nullptr, &config);
	Connection con(db);

	// Without the json extension there is nothing to check, so the test body is skipped
	if (db.ExtensionIsLoaded("json")) {
		// JSON
		TestArrowRoundtrip(R"(SELECT '{"name":"Pedro", "age":28, "car":"VW Fox"}'::JSON str FROM range(5) tbl(i))",
		                   false, true);
	}
}
|
|
|
|
//! Round-trips small, big, NULL and mixed string columns through TestArrowRoundtripStringView.
TEST_CASE("Test Arrow String View", "[arrow][.]") {
	// Executed one by one, in this order
	const char *const string_view_queries[] = {
	    // Test Small Strings
	    "SELECT (i*10^i)::varchar str FROM range(5) tbl(i)",
	    // Test Small Strings + Nulls
	    "SELECT (i*10^i)::varchar str FROM range(5) tbl(i) UNION SELECT NULL",
	    // Test Big Strings
	    "SELECT 'Imaverybigstringmuchbiggerthanfourbytes' str FROM range(5) tbl(i)",
	    // Test Big Strings + Nulls
	    "SELECT 'Imaverybigstringmuchbiggerthanfourbytes'||i::varchar str FROM range(5) "
	    "tbl(i) UNION SELECT NULL order by str",
	    // Test Mix of Small/Big/NULL Strings
	    "SELECT 'Imaverybigstringmuchbiggerthanfourbytes'||i::varchar str FROM range(10000) tbl(i) UNION "
	    "SELECT NULL UNION SELECT (i*10^i)::varchar str FROM range(10000) tbl(i)",
	};
	for (const char *sql : string_view_queries) {
		TestArrowRoundtripStringView(sql);
	}
}
|
|
|
|
//! Generates TPC-H data (sf=0.5), copies lineitem into a table whose l_comment is set to NULL for
//! even l_orderkey values, and requires the Arrow comparison to succeed on three queries over
//! that copy. Returns early when the tpch extension is not loaded.
TEST_CASE("Test TPCH arrow roundtrip", "[arrow][.]") {
	DBConfig config;
	DuckDB db(nullptr, &config);
	Connection con(db);

	if (!db.ExtensionIsLoaded("tpch")) {
		return;
	}
	// NOTE: the results of the SendQuery calls in this test are not inspected
	con.SendQuery("CALL dbgen(sf=0.5)");

	// Currently disabled checks against lineitem itself:
	// REQUIRE(ArrowTestHelper::RunArrowComparison(con, "SELECT * FROM lineitem;", false));
	// REQUIRE(ArrowTestHelper::RunArrowComparison(
	//     con, "SELECT l_orderkey, l_shipdate, l_comment FROM lineitem ORDER BY l_orderkey DESC;", false));
	// REQUIRE(ArrowTestHelper::RunArrowComparison(con, "SELECT lineitem FROM lineitem;", false));
	// REQUIRE(ArrowTestHelper::RunArrowComparison(con, "SELECT [lineitem] FROM lineitem;", false));

	con.SendQuery("create table lineitem_no_constraint as from lineitem;");
	con.SendQuery("update lineitem_no_constraint set l_comment=null where l_orderkey%2=0;");

	// Also disabled:
	// REQUIRE(ArrowTestHelper::RunArrowComparison(con, "SELECT * FROM lineitem_no_constraint;", false));

	// Active checks: plain columns, the whole row as a struct, and the row wrapped in a list
	const char *const nullable_copy_queries[] = {
	    "SELECT l_orderkey, l_shipdate, l_comment FROM lineitem_no_constraint ORDER BY l_orderkey DESC;",
	    "SELECT lineitem_no_constraint FROM lineitem_no_constraint;",
	    "SELECT [lineitem_no_constraint] FROM lineitem_no_constraint;",
	};
	for (const char *sql : nullable_copy_queries) {
		REQUIRE(ArrowTestHelper::RunArrowComparison(con, sql, false));
	}
}
|
|
|
|
//! Runs TestParquetRoundtrip on every enabled file of the parquet-testing data set, in the order
//! listed. Disabled files are kept as comments together with the reason where one is recorded.
TEST_CASE("Test Parquet Files round-trip", "[arrow][.]") {
	const char *const parquet_files[] = {
	    // "data/parquet-testing/7-set.snappy.arrow2.parquet",
	    // "data/parquet-testing/adam_genotypes.parquet",
	    "data/parquet-testing/apkwan.parquet",
	    "data/parquet-testing/aws1.snappy.parquet",
	    // not supported by arrow
	    // "data/parquet-testing/aws2.parquet",
	    "data/parquet-testing/binary_string.parquet",
	    "data/parquet-testing/blob.parquet",
	    "data/parquet-testing/boolean_stats.parquet",
	    // arrow can't read this
	    // "data/parquet-testing/broken-arrow.parquet",
	    "data/parquet-testing/bug1554.parquet",
	    "data/parquet-testing/bug1588.parquet",
	    "data/parquet-testing/bug1589.parquet",
	    "data/parquet-testing/bug1618_struct_strings.parquet",
	    "data/parquet-testing/bug2267.parquet",
	    "data/parquet-testing/bug2557.parquet",
	    // slow
	    // "data/parquet-testing/bug687_nulls.parquet",
	    // "data/parquet-testing/complex.parquet",
	    "data/parquet-testing/data-types.parquet",
	    "data/parquet-testing/date.parquet",
	    // arrow can't read this because it's a time with a timezone and it's not supported by arrow
	    // "data/parquet-testing/date_stats.parquet",
	    "data/parquet-testing/decimal_stats.parquet",
	    "data/parquet-testing/decimals.parquet",
	    "data/parquet-testing/enum.parquet",
	    "data/parquet-testing/filter_bug1391.parquet",
	    // "data/parquet-testing/fixed.parquet",
	    // slow
	    // "data/parquet-testing/leftdate3_192_loop_1.parquet",
	    "data/parquet-testing/lineitem-top10000.gzip.parquet",
	    "data/parquet-testing/manyrowgroups.parquet",
	    "data/parquet-testing/manyrowgroups2.parquet",
	    // "data/parquet-testing/map.parquet",
	    // Can't roundtrip NaNs
	    // NOTE(review): the file below is nevertheless enabled - confirm whether the remark above is stale
	    "data/parquet-testing/nan-float.parquet",
	    // null byte in file
	    // "data/parquet-testing/nullbyte.parquet",
	    // "data/parquet-testing/nullbyte_multiple.parquet",
	    // borked
	    // "data/parquet-testing/p2.parquet",
	    // "data/parquet-testing/p2strings.parquet",
	    "data/parquet-testing/pandas-date.parquet",
	    "data/parquet-testing/signed_stats.parquet",
	    "data/parquet-testing/silly-names.parquet",
	    // borked
	    // "data/parquet-testing/simple.parquet",
	    // "data/parquet-testing/sorted.zstd_18_131072_small.parquet",
	    "data/parquet-testing/struct.parquet",
	    "data/parquet-testing/struct_skip_test.parquet",
	    "data/parquet-testing/timestamp-ms.parquet",
	    "data/parquet-testing/timestamp.parquet",
	    "data/parquet-testing/unsigned.parquet",
	    "data/parquet-testing/unsigned_stats.parquet",
	    "data/parquet-testing/userdata1.parquet",
	    "data/parquet-testing/varchar_stats.parquet",
	    "data/parquet-testing/zstd.parquet",
	};

	for (const char *parquet_path : parquet_files) {
		TestParquetRoundtrip(parquet_path);
	}
}
|