should be it
This commit is contained in:
244
external/duckdb/test/arrow/arrow_roundtrip.cpp
vendored
Normal file
244
external/duckdb/test/arrow/arrow_roundtrip.cpp
vendored
Normal file
@@ -0,0 +1,244 @@
|
||||
#include "catch.hpp"
|
||||
|
||||
#include "arrow/arrow_test_helper.hpp"
|
||||
|
||||
using namespace duckdb;
|
||||
|
||||
static void TestArrowRoundtrip(const string &query, bool export_large_buffer = false,
|
||||
bool loseless_conversion = false) {
|
||||
DuckDB db;
|
||||
Connection con(db);
|
||||
if (export_large_buffer) {
|
||||
auto res = con.Query("SET arrow_large_buffer_size=True");
|
||||
REQUIRE(!res->HasError());
|
||||
}
|
||||
if (loseless_conversion) {
|
||||
auto res = con.Query("SET arrow_lossless_conversion = true");
|
||||
REQUIRE(!res->HasError());
|
||||
}
|
||||
REQUIRE(ArrowTestHelper::RunArrowComparison(con, query, true));
|
||||
REQUIRE(ArrowTestHelper::RunArrowComparison(con, query, false));
|
||||
}
|
||||
|
||||
static void TestArrowRoundtripStringView(const string &query) {
|
||||
DuckDB db;
|
||||
Connection con(db);
|
||||
auto res = con.Query("SET produce_arrow_string_view=True");
|
||||
REQUIRE(!res->HasError());
|
||||
REQUIRE(ArrowTestHelper::RunArrowComparison(con, query, false));
|
||||
}
|
||||
|
||||
static void TestParquetRoundtrip(const string &path) {
|
||||
DBConfig config;
|
||||
// This needs to be set since this test will be triggered when testing autoloading
|
||||
config.options.allow_unsigned_extensions = true;
|
||||
|
||||
DuckDB db(nullptr, &config);
|
||||
Connection con(db);
|
||||
|
||||
// run the query
|
||||
auto query = "SELECT * FROM parquet_scan('" + path + "')";
|
||||
REQUIRE(ArrowTestHelper::RunArrowComparison(con, query, true));
|
||||
REQUIRE(ArrowTestHelper::RunArrowComparison(con, query));
|
||||
}
|
||||
|
||||
TEST_CASE("Test Export Large", "[arrow]") {
|
||||
// Test with Regular Buffer Size
|
||||
TestArrowRoundtrip("SELECT 'bla' FROM range(10000)");
|
||||
|
||||
TestArrowRoundtrip("SELECT 'bla'::BLOB FROM range(10000)");
|
||||
|
||||
TestArrowRoundtrip("SELECT '3d038406-6275-4aae-bec1-1235ccdeaade'::UUID FROM range(10000) tbl(i)", false, true);
|
||||
|
||||
// Test with Large Buffer Size
|
||||
TestArrowRoundtrip("SELECT 'bla' FROM range(10000)", true);
|
||||
|
||||
TestArrowRoundtrip("SELECT 'bla'::BLOB FROM range(10000)", true);
|
||||
|
||||
TestArrowRoundtrip("SELECT '3d038406-6275-4aae-bec1-1235ccdeaade'::UUID FROM range(10000) tbl(i)", true, true);
|
||||
}
|
||||
|
||||
TEST_CASE("Test arrow roundtrip", "[arrow]") {
|
||||
TestArrowRoundtrip("SELECT * FROM range(10000) tbl(i) UNION ALL SELECT NULL");
|
||||
TestArrowRoundtrip("SELECT m from (select MAP(list_value(1), list_value(2)) from range(5) tbl(i)) tbl(m)");
|
||||
TestArrowRoundtrip("SELECT * FROM range(10) tbl(i)");
|
||||
TestArrowRoundtrip("SELECT case when i%2=0 then null else i end i FROM range(10) tbl(i)");
|
||||
TestArrowRoundtrip("SELECT case when i%2=0 then true else false end b FROM range(10) tbl(i)");
|
||||
TestArrowRoundtrip("SELECT case when i%2=0 then i%4=0 else null end b FROM range(10) tbl(i)");
|
||||
TestArrowRoundtrip("SELECT 'thisisalongstring'||i::varchar str FROM range(10) tbl(i)");
|
||||
TestArrowRoundtrip(
|
||||
"SELECT case when i%2=0 then null else 'thisisalongstring'||i::varchar end str FROM range(10) tbl(i)");
|
||||
TestArrowRoundtrip("SELECT {'i': i, 'b': 10-i} str FROM range(10) tbl(i)");
|
||||
TestArrowRoundtrip("SELECT case when i%2=0 then {'i': case when i%4=0 then null else i end, 'b': 10-i} else null "
|
||||
"end str FROM range(10) tbl(i)");
|
||||
TestArrowRoundtrip("SELECT [i, i+1, i+2] FROM range(10) tbl(i)");
|
||||
TestArrowRoundtrip(
|
||||
"SELECT MAP(LIST_VALUE({'i':1,'j':2},{'i':3,'j':4}),LIST_VALUE({'i':1,'j':2},{'i':3,'j':4})) as a");
|
||||
TestArrowRoundtrip(
|
||||
"SELECT MAP(LIST_VALUE({'i':i,'j':i+2},{'i':3,'j':NULL}),LIST_VALUE({'i':i+10,'j':2},{'i':i+4,'j':4})) as a "
|
||||
"FROM range(10) tbl(i)");
|
||||
TestArrowRoundtrip("SELECT MAP(['hello', 'world'||i::VARCHAR],[i + 1, NULL]) as a FROM range(10) tbl(i)");
|
||||
TestArrowRoundtrip("SELECT (1.5 + i)::DECIMAL(4,2) dec4, (1.5 + i)::DECIMAL(9,3) dec9, (1.5 + i)::DECIMAL(18,3) "
|
||||
"dec18, (1.5 + i)::DECIMAL(38,3) dec38 FROM range(10) tbl(i)");
|
||||
TestArrowRoundtrip(
|
||||
"SELECT case when i%2=0 then null else INTERVAL (i) seconds end AS interval FROM range(10) tbl(i)");
|
||||
#if STANDARD_VECTOR_SIZE < 64
|
||||
// FIXME: there seems to be a bug in the enum arrow reader in this test when run with vsize=2
|
||||
return;
|
||||
#endif
|
||||
TestArrowRoundtrip("SELECT * EXCLUDE(bit,time_tz, bignum) REPLACE "
|
||||
"(interval (1) seconds AS interval, hugeint::DOUBLE as hugeint, uhugeint::DOUBLE as uhugeint) "
|
||||
"FROM test_all_types()",
|
||||
false, true);
|
||||
}
|
||||
|
||||
TEST_CASE("Test Arrow Extension Types", "[arrow][.]") {
|
||||
// UUID
|
||||
TestArrowRoundtrip("SELECT '2d89ebe6-1e13-47e5-803a-b81c87660b66'::UUID str FROM range(5) tbl(i)", false, true);
|
||||
|
||||
// HUGEINT
|
||||
TestArrowRoundtrip("SELECT '170141183460469231731687303715884105727'::HUGEINT str FROM range(5) tbl(i)", false,
|
||||
true);
|
||||
|
||||
// UHUGEINT
|
||||
TestArrowRoundtrip("SELECT '170141183460469231731687303715884105727'::UHUGEINT str FROM range(5) tbl(i)", false,
|
||||
true);
|
||||
|
||||
// BIT
|
||||
TestArrowRoundtrip("SELECT '0101011'::BIT str FROM range(5) tbl(i)", false, true);
|
||||
|
||||
// TIME_TZ
|
||||
TestArrowRoundtrip("SELECT '02:30:00+04'::TIMETZ str FROM range(5) tbl(i)", false, true);
|
||||
|
||||
// BIGNUM
|
||||
TestArrowRoundtrip("SELECT 85070591730234614260976917445211069672::BIGNUM str FROM range(5) tbl(i)", false, true);
|
||||
|
||||
TestArrowRoundtrip("SELECT 85070591730234614260976917445211069672::BIGNUM str FROM range(5) tbl(i)", true, true);
|
||||
}
|
||||
|
||||
TEST_CASE("Test Arrow Extension Types - JSON", "[arrow][.]") {
|
||||
DBConfig config;
|
||||
DuckDB db(nullptr, &config);
|
||||
Connection con(db);
|
||||
|
||||
if (!db.ExtensionIsLoaded("json")) {
|
||||
return;
|
||||
}
|
||||
|
||||
// JSON
|
||||
TestArrowRoundtrip("SELECT '{\"name\":\"Pedro\", \"age\":28, \"car\":\"VW Fox\"}'::JSON str FROM range(5) tbl(i)",
|
||||
false, true);
|
||||
}
|
||||
|
||||
TEST_CASE("Test Arrow String View", "[arrow][.]") {
|
||||
// Test Small Strings
|
||||
TestArrowRoundtripStringView("SELECT (i*10^i)::varchar str FROM range(5) tbl(i)");
|
||||
|
||||
// Test Small Strings + Nulls
|
||||
TestArrowRoundtripStringView("SELECT (i*10^i)::varchar str FROM range(5) tbl(i) UNION SELECT NULL");
|
||||
|
||||
// Test Big Strings
|
||||
TestArrowRoundtripStringView("SELECT 'Imaverybigstringmuchbiggerthanfourbytes' str FROM range(5) tbl(i)");
|
||||
|
||||
// Test Big Strings + Nulls
|
||||
TestArrowRoundtripStringView("SELECT 'Imaverybigstringmuchbiggerthanfourbytes'||i::varchar str FROM range(5) "
|
||||
"tbl(i) UNION SELECT NULL order by str");
|
||||
|
||||
// Test Mix of Small/Big/NULL Strings
|
||||
TestArrowRoundtripStringView(
|
||||
"SELECT 'Imaverybigstringmuchbiggerthanfourbytes'||i::varchar str FROM range(10000) tbl(i) UNION "
|
||||
"SELECT NULL UNION SELECT (i*10^i)::varchar str FROM range(10000) tbl(i)");
|
||||
}
|
||||
|
||||
TEST_CASE("Test TPCH arrow roundtrip", "[arrow][.]") {
|
||||
DBConfig config;
|
||||
DuckDB db(nullptr, &config);
|
||||
Connection con(db);
|
||||
|
||||
if (!db.ExtensionIsLoaded("tpch")) {
|
||||
return;
|
||||
}
|
||||
con.SendQuery("CALL dbgen(sf=0.5)");
|
||||
|
||||
// REQUIRE(ArrowTestHelper::RunArrowComparison(con, "SELECT * FROM lineitem;", false));
|
||||
// REQUIRE(ArrowTestHelper::RunArrowComparison(con, "SELECT l_orderkey, l_shipdate, l_comment FROM lineitem ORDER BY
|
||||
// l_orderkey DESC;", false)); REQUIRE(ArrowTestHelper::RunArrowComparison(con, "SELECT lineitem FROM lineitem;",
|
||||
// false)); REQUIRE(ArrowTestHelper::RunArrowComparison(con, "SELECT [lineitem] FROM lineitem;", false));
|
||||
|
||||
con.SendQuery("create table lineitem_no_constraint as from lineitem;");
|
||||
con.SendQuery("update lineitem_no_constraint set l_comment=null where l_orderkey%2=0;");
|
||||
|
||||
// REQUIRE(ArrowTestHelper::RunArrowComparison(con, "SELECT * FROM lineitem_no_constraint;", false));
|
||||
REQUIRE(ArrowTestHelper::RunArrowComparison(
|
||||
con, "SELECT l_orderkey, l_shipdate, l_comment FROM lineitem_no_constraint ORDER BY l_orderkey DESC;", false));
|
||||
REQUIRE(
|
||||
ArrowTestHelper::RunArrowComparison(con, "SELECT lineitem_no_constraint FROM lineitem_no_constraint;", false));
|
||||
REQUIRE(ArrowTestHelper::RunArrowComparison(con, "SELECT [lineitem_no_constraint] FROM lineitem_no_constraint;",
|
||||
false));
|
||||
}
|
||||
|
||||
TEST_CASE("Test Parquet Files round-trip", "[arrow][.]") {
|
||||
std::vector<std::string> data;
|
||||
// data.emplace_back("data/parquet-testing/7-set.snappy.arrow2.parquet");
|
||||
// data.emplace_back("data/parquet-testing/adam_genotypes.parquet");
|
||||
data.emplace_back("data/parquet-testing/apkwan.parquet");
|
||||
data.emplace_back("data/parquet-testing/aws1.snappy.parquet");
|
||||
// not supported by arrow
|
||||
// data.emplace_back("data/parquet-testing/aws2.parquet");
|
||||
data.emplace_back("data/parquet-testing/binary_string.parquet");
|
||||
data.emplace_back("data/parquet-testing/blob.parquet");
|
||||
data.emplace_back("data/parquet-testing/boolean_stats.parquet");
|
||||
// arrow can't read this
|
||||
// data.emplace_back("data/parquet-testing/broken-arrow.parquet");
|
||||
data.emplace_back("data/parquet-testing/bug1554.parquet");
|
||||
data.emplace_back("data/parquet-testing/bug1588.parquet");
|
||||
data.emplace_back("data/parquet-testing/bug1589.parquet");
|
||||
data.emplace_back("data/parquet-testing/bug1618_struct_strings.parquet");
|
||||
data.emplace_back("data/parquet-testing/bug2267.parquet");
|
||||
data.emplace_back("data/parquet-testing/bug2557.parquet");
|
||||
// slow
|
||||
// data.emplace_back("data/parquet-testing/bug687_nulls.parquet");
|
||||
// data.emplace_back("data/parquet-testing/complex.parquet");
|
||||
data.emplace_back("data/parquet-testing/data-types.parquet");
|
||||
data.emplace_back("data/parquet-testing/date.parquet");
|
||||
// arrow can't read this because it's a time with a timezone and it's not supported by arrow
|
||||
// data.emplace_back("data/parquet-testing/date_stats.parquet");
|
||||
data.emplace_back("data/parquet-testing/decimal_stats.parquet");
|
||||
data.emplace_back("data/parquet-testing/decimals.parquet");
|
||||
data.emplace_back("data/parquet-testing/enum.parquet");
|
||||
data.emplace_back("data/parquet-testing/filter_bug1391.parquet");
|
||||
// data.emplace_back("data/parquet-testing/fixed.parquet");
|
||||
// slow
|
||||
// data.emplace_back("data/parquet-testing/leftdate3_192_loop_1.parquet");
|
||||
data.emplace_back("data/parquet-testing/lineitem-top10000.gzip.parquet");
|
||||
data.emplace_back("data/parquet-testing/manyrowgroups.parquet");
|
||||
data.emplace_back("data/parquet-testing/manyrowgroups2.parquet");
|
||||
// data.emplace_back("data/parquet-testing/map.parquet");
|
||||
// Can't roundtrip NaNs
|
||||
data.emplace_back("data/parquet-testing/nan-float.parquet");
|
||||
// null byte in file
|
||||
// data.emplace_back("data/parquet-testing/nullbyte.parquet");
|
||||
// data.emplace_back("data/parquet-testing/nullbyte_multiple.parquet");
|
||||
// borked
|
||||
// data.emplace_back("data/parquet-testing/p2.parquet");
|
||||
// data.emplace_back("data/parquet-testing/p2strings.parquet");
|
||||
data.emplace_back("data/parquet-testing/pandas-date.parquet");
|
||||
data.emplace_back("data/parquet-testing/signed_stats.parquet");
|
||||
data.emplace_back("data/parquet-testing/silly-names.parquet");
|
||||
// borked
|
||||
// data.emplace_back("data/parquet-testing/simple.parquet");
|
||||
// data.emplace_back("data/parquet-testing/sorted.zstd_18_131072_small.parquet");
|
||||
data.emplace_back("data/parquet-testing/struct.parquet");
|
||||
data.emplace_back("data/parquet-testing/struct_skip_test.parquet");
|
||||
data.emplace_back("data/parquet-testing/timestamp-ms.parquet");
|
||||
data.emplace_back("data/parquet-testing/timestamp.parquet");
|
||||
data.emplace_back("data/parquet-testing/unsigned.parquet");
|
||||
data.emplace_back("data/parquet-testing/unsigned_stats.parquet");
|
||||
data.emplace_back("data/parquet-testing/userdata1.parquet");
|
||||
data.emplace_back("data/parquet-testing/varchar_stats.parquet");
|
||||
data.emplace_back("data/parquet-testing/zstd.parquet");
|
||||
|
||||
for (auto &parquet_path : data) {
|
||||
TestParquetRoundtrip(parquet_path);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user