# test_scalar_udf.jl
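
# NOTE (assumed context): this file is expected to be include()d from the
# package test runner, with `DuckDB`, `DataFrames`, `Dates`, and `Test` already
# brought into scope there; no `using` statements appear in this file itself.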

# Define a simple scalar UDF that doubles the input value
function my_double_function(
    info::DuckDB.duckdb_function_info,
    input::DuckDB.duckdb_data_chunk,
    output::DuckDB.duckdb_vector
)
    # Convert input data chunk to DataChunk object
    input_chunk = DuckDB.DataChunk(input, false)
    n = DuckDB.get_size(input_chunk)

    # Get input vector (assuming one input parameter)
    input_vector = DuckDB.get_vector(input_chunk, 1)
    input_array = DuckDB.get_array(input_vector, Int64, n)

    # Get output vector
    output_array = DuckDB.get_array(DuckDB.Vec(output), Int64, n)

    # Perform the operation: double each input value
    for i in 1:n
        output_array[i] = input_array[i] * 2
    end
end
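
# Note: `DuckDB.get_array` wraps the DuckDB vector's data buffer rather than
# copying it, which is why writing into `output_array` above is enough to fill
# the query result. The remaining low-level UDFs in this file use the same
# callback signature (duckdb_function_info, duckdb_data_chunk, duckdb_vector).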

# Define a scalar UDF that returns NULL for odd numbers and the number itself for even numbers
function my_null_function(
    info::DuckDB.duckdb_function_info,
    input::DuckDB.duckdb_data_chunk,
    output::DuckDB.duckdb_vector
)
    # Convert input data chunk to DataChunk object
    input_chunk = DuckDB.DataChunk(input, false)
    n = DuckDB.get_size(input_chunk)

    # Get input vector
    input_vector = DuckDB.get_vector(input_chunk, 1)
    input_array = DuckDB.get_array(input_vector, Int64, n)
    validity_input = DuckDB.get_validity(input_vector)

    # Get output vector
    output_vector = DuckDB.Vec(output)
    output_array = DuckDB.get_array(output_vector, Int64, n)
    validity_output = DuckDB.get_validity(output_vector)

    # Perform the operation
    for i in 1:n
        if DuckDB.isvalid(validity_input, i)
            if input_array[i] % 2 == 0
                output_array[i] = input_array[i]
                # Validity is true by default, no need to set
            else
                # Set output as NULL
                DuckDB.setinvalid(validity_output, i)
            end
        else
            # Input is NULL, set output as NULL
            DuckDB.setinvalid(validity_output, i)
        end
    end
end

# Define a scalar UDF that always throws an error
function my_error_function(
    info::DuckDB.duckdb_function_info,
    input::DuckDB.duckdb_data_chunk,
    output::DuckDB.duckdb_vector
)
    throw(ErrorException("Runtime error in scalar function"))
end

# Define a scalar UDF that counts the occurrences of 'a' in a VARCHAR input
function my_string_function_count_a(
    info::DuckDB.duckdb_function_info,
    input::DuckDB.duckdb_data_chunk,
    output::DuckDB.duckdb_vector
)
    input_chunk = DuckDB.DataChunk(input, false)
    output_vec = DuckDB.Vec(output)
    n = DuckDB.get_size(input_chunk)
    chunks = [input_chunk]

    # Recover the registered ScalarFunction object from the extra-info pointer
    # and use its logical parameter types to convert the string column
    extra_info_ptr = DuckDB.duckdb_scalar_function_get_extra_info(info)
    extra_info::DuckDB.ScalarFunction = unsafe_pointer_to_objref(extra_info_ptr)
    conversion_data = DuckDB.ColumnConversionData(chunks, 1, extra_info.logical_parameters[1], nothing)
    a_data_converted = DuckDB.convert_column(conversion_data)
    output_data = DuckDB.get_array(output_vec, Int, n)

    for row in 1:n
        result = count(x -> x == 'a', a_data_converted[row])
        output_data[row] = result
    end
    return nothing
end
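
# Note: this function and `my_string_function_reverse_concat` below recover the
# registered `ScalarFunction` object (and its `logical_parameters`) through the
# extra-info pointer, so they can only run once that pointer has been attached
# at registration time; the bare C-API registration sequence used in the testset
# further down never sets it.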

# Define a scalar UDF that reverses its first string argument and appends the second
function my_string_function_reverse_concat(
    info::DuckDB.duckdb_function_info,
    input::DuckDB.duckdb_data_chunk,
    output::DuckDB.duckdb_vector
)
    input_chunk = DuckDB.DataChunk(input, false)
    output_vec = DuckDB.Vec(output)
    n = Int64(DuckDB.get_size(input_chunk))
    chunks = [input_chunk]

    # Recover the registered ScalarFunction and convert both string columns
    extra_info_ptr = DuckDB.duckdb_scalar_function_get_extra_info(info)
    extra_info::DuckDB.ScalarFunction = unsafe_pointer_to_objref(extra_info_ptr)
    conversion_data_a = DuckDB.ColumnConversionData(chunks, 1, extra_info.logical_parameters[1], nothing)
    conversion_data_b = DuckDB.ColumnConversionData(chunks, 2, extra_info.logical_parameters[2], nothing)
    a_data_converted = DuckDB.convert_column(conversion_data_a)
    b_data_converted = DuckDB.convert_column(conversion_data_b)

    # Build the result string per row and write it into the output vector
    for row in 1:n
        result = string(reverse(a_data_converted[row]), b_data_converted[row])
        DuckDB.assign_string_element(output_vec, row, result)
    end
    return nothing
end
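
# For reference, the row-wise logic of the two string UDFs in plain Julia
# (neither function is registered or queried by the testsets in this file):
#   count(x -> x == 'a', "banana")  == 3         # my_string_function_count_a
#   string(reverse("abc"), "xyz")   == "cbaxyz"  # my_string_function_reverse_concat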

@testset "Test custom scalar functions" begin
    # Connect to DuckDB
    db = DuckDB.DB()
    con = DuckDB.connect(db)

    # Create the test table
    DuckDB.query(con, "CREATE TABLE test_table AS SELECT i FROM range(10) t(i)")

    # Define logical type BIGINT
    type_bigint = DuckDB.duckdb_create_logical_type(DuckDB.DUCKDB_TYPE_BIGINT)

    # Test 1: Double Function
    # Create the scalar function
    f_double = DuckDB.duckdb_create_scalar_function()
    DuckDB.duckdb_scalar_function_set_name(f_double, "double_value")

    # Set parameter types
    DuckDB.duckdb_scalar_function_add_parameter(f_double, type_bigint)

    # Set return type
    DuckDB.duckdb_scalar_function_set_return_type(f_double, type_bigint)

    # Set the function
    CMyDoubleFunction = @cfunction(
        my_double_function,
        Cvoid,
        (DuckDB.duckdb_function_info, DuckDB.duckdb_data_chunk, DuckDB.duckdb_vector)
    )
    DuckDB.duckdb_scalar_function_set_function(f_double, CMyDoubleFunction)

    # Register the function
    res = DuckDB.duckdb_register_scalar_function(con.handle, f_double)
    @test res == DuckDB.DuckDBSuccess

    # Execute the function in a query
    results = DuckDB.query(con, "SELECT i, double_value(i) as doubled FROM test_table")
    df = DataFrame(results)
    @test names(df) == ["i", "doubled"]
    @test size(df, 1) == 10
    @test df.doubled == df.i .* 2
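
    # The same raw C-API sequence (create scalar function -> set name -> add
    # parameter types -> set return type -> set the @cfunction callback ->
    # register) is repeated below for the NULL-producing and error-throwing
    # functions.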

    # Test 2: Null Function
    # Create the scalar function
    f_null = DuckDB.duckdb_create_scalar_function()
    DuckDB.duckdb_scalar_function_set_name(f_null, "null_if_odd")

    # Set parameter types
    DuckDB.duckdb_scalar_function_add_parameter(f_null, type_bigint)

    # Set return type
    DuckDB.duckdb_scalar_function_set_return_type(f_null, type_bigint)

    # Set the function
    CMyNullFunction = @cfunction(
        my_null_function,
        Cvoid,
        (DuckDB.duckdb_function_info, DuckDB.duckdb_data_chunk, DuckDB.duckdb_vector)
    )
    DuckDB.duckdb_scalar_function_set_function(f_null, CMyNullFunction)

    # Register the function
    res_null = DuckDB.duckdb_register_scalar_function(con.handle, f_null)
    @test res_null == DuckDB.DuckDBSuccess

    # Execute the function in a query
    results_null = DuckDB.query(con, "SELECT i, null_if_odd(i) as value_or_null FROM test_table")
    df_null = DataFrame(results_null)
    @test names(df_null) == ["i", "value_or_null"]
    @test size(df_null, 1) == 10
    expected_values = Vector{Union{Missing, Int64}}(undef, 10)
    for idx in 1:10
        i = idx - 1 # Since i ranges from 0 to 9
        if i % 2 == 0
            expected_values[idx] = i
        else
            expected_values[idx] = missing
        end
    end
    @test all(df_null.value_or_null .=== expected_values)

    # Test 3: Error Function
    # Create the scalar function
    f_error = DuckDB.duckdb_create_scalar_function()
    DuckDB.duckdb_scalar_function_set_name(f_error, "error_function")

    # Set parameter types
    DuckDB.duckdb_scalar_function_add_parameter(f_error, type_bigint)

    # Set return type
    DuckDB.duckdb_scalar_function_set_return_type(f_error, type_bigint)

    # Set the function
    CMyErrorFunction = @cfunction(
        my_error_function,
        Cvoid,
        (DuckDB.duckdb_function_info, DuckDB.duckdb_data_chunk, DuckDB.duckdb_vector)
    )
    DuckDB.duckdb_scalar_function_set_function(f_error, CMyErrorFunction)

    # Register the function
    res_error = DuckDB.duckdb_register_scalar_function(con.handle, f_error)
    @test res_error == DuckDB.DuckDBSuccess

    # The Julia exception raised inside the UDF surfaces as an ErrorException
    @test_throws ErrorException DuckDB.query(con, "SELECT error_function(i) FROM test_table")

    # Clean up logical type
    DuckDB.duckdb_destroy_logical_type(type_bigint)

    # Disconnect and close
    DuckDB.disconnect(con)
    DuckDB.close(db)
end

mysum(a, b) = a + b # Dummy function
my_reverse(s) = string(reverse(s))

@testset "UDF_Macro" begin

    # Parse Expression
    expr = :(mysum(a::Int, b::String)::Int)
    func_name, func_params, return_value = DuckDB._udf_parse_function_expr(expr)
    @test func_name == :mysum
    @test func_params == [(:a, :Int), (:b, :String)]
    @test return_value == :Int

    # Build expressions
    var_names, expressions =
        DuckDB._udf_generate_conversion_expressions(func_params, :log_types, :convert, :param, :chunk)
    @test var_names == [:param_1, :param_2]
    @test expressions[1] == :(param_1 = convert(Int, log_types[1], chunk, 1))
    @test expressions[2] == :(param_2 = convert(String, log_types[2], chunk, 2))
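
    # The two internal helpers above (`_udf_parse_function_expr` and
    # `_udf_generate_conversion_expressions`) are exercised directly here; the
    # `@create_scalar_function` macro used below builds on the same
    # parse-signature / generate-conversions steps.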

    # Generate UDF
    db = DuckDB.DB()
    con = DuckDB.connect(db)
    fun = DuckDB.@create_scalar_function mysum(a::Int, b::Int)::Int
    #ptr = @cfunction(fun.wrapper, Cvoid, (Ptr{Cvoid},Ptr{Cvoid},Ptr{Cvoid}))
    #ptr = pointer_from_objref(mysum_udf.wrapper)
    #DuckDB.duckdb_scalar_function_set_function(mysum_udf.handle, ptr)
    DuckDB.register_scalar_function(con, fun) # Register UDF

    @test_throws ArgumentError DuckDB.register_scalar_function(con, fun) # Register UDF twice

    DuckDB.execute(con, "CREATE TABLE test1 (a INT, b INT);")
    DuckDB.execute(con, "INSERT INTO test1 VALUES ('1', '2'), ('3','4'), ('5', '6')")
    result = DuckDB.execute(con, "SELECT mysum(a, b) as result FROM test1") |> DataFrame
    @test result.result == [3, 7, 11]
end

@testset "UDF Macro Various Types" begin
    import Dates
    db = DuckDB.DB()
    con = DuckDB.connect(db)

    my_reverse_inner = (s) -> ("Inner:" * string(reverse(s)))
    fun_is_weekend = (d) -> Dates.dayofweek(d) in (6, 7)
    date_2020 = (x) -> Dates.Date(2020, 1, 1) + Dates.Day(x) # Dummy function

    my_and(a, b) = a && b
    my_int_add(a, b) = a + b
    my_mixed_add(a::Int, b::Float64) = a + b

    df_numbers =
        DataFrame(a = rand(1:100, 30), b = rand(1:100, 30), c = rand(30), d = rand(Bool, 30), e = rand(Bool, 30))
    df_strings = DataFrame(a = ["hello", "world", "julia", "duckdb", "🦆DB"])
    t = Date(2020, 1, 1):Day(1):Date(2020, 12, 31)
    df_dates = DataFrame(t = t, k = 1:length(t), is_weekend = fun_is_weekend.(t))

    DuckDB.register_table(con, df_strings, "test_strings")
    DuckDB.register_table(con, df_dates, "test_dates")
    DuckDB.register_table(con, df_numbers, "test_numbers")

    # Register UDFs
    fun_string = DuckDB.@create_scalar_function my_reverse(s::String)::String (s) -> my_reverse_inner(s)
    DuckDB.register_scalar_function(con, fun_string) # Register UDF

    fun_date = DuckDB.@create_scalar_function is_weekend(d::Date)::Bool fun_is_weekend
    fun_date2 = DuckDB.@create_scalar_function date_2020(x::Int)::Date date_2020
    DuckDB.register_scalar_function(con, fun_date) # Register UDF
    DuckDB.register_scalar_function(con, fun_date2) # Register UDF

    fun_and = DuckDB.@create_scalar_function my_and(a::Bool, b::Bool)::Bool my_and
    fun_int_add = DuckDB.@create_scalar_function my_int_add(a::Int, b::Int)::Int my_int_add
    fun_mixed_add = DuckDB.@create_scalar_function my_mixed_add(a::Int, b::Float64)::Float64 my_mixed_add
    DuckDB.register_scalar_function(con, fun_and)
    DuckDB.register_scalar_function(con, fun_int_add)
    DuckDB.register_scalar_function(con, fun_mixed_add)

    result1 = DuckDB.execute(con, "SELECT my_reverse(a) as result FROM test_strings") |> DataFrame
    @test result1.result == my_reverse_inner.(df_strings.a)

    result2_1 = DuckDB.execute(con, "SELECT is_weekend(t) as result FROM test_dates") |> DataFrame
    @test result2_1.result == fun_is_weekend.(df_dates.t)
    result2_2 = DuckDB.execute(con, "SELECT date_2020(k) as result FROM test_dates") |> DataFrame
    @test result2_2.result == date_2020.(df_dates.k)

    result3 = DuckDB.execute(con, "SELECT my_and(d, e) as result FROM test_numbers") |> DataFrame
    @test result3.result == my_and.(df_numbers.d, df_numbers.e)

    result4 = DuckDB.execute(con, "SELECT my_int_add(a, b) as result FROM test_numbers") |> DataFrame
    @test result4.result == my_int_add.(df_numbers.a, df_numbers.b)

    result5 = DuckDB.execute(con, "SELECT my_mixed_add(a, c) as result FROM test_numbers") |> DataFrame
    @test result5.result == my_mixed_add.(df_numbers.a, df_numbers.c)
end

@testset "UDF Macro Exception" begin

    f_error = function (a)
        if iseven(a)
            throw(ArgumentError("Even number"))
        else
            return a + 1
        end
    end

    db = DuckDB.DB()
    con = DuckDB.connect(db)

    fun_error = DuckDB.@create_scalar_function f_error(a::Int)::Int f_error
    DuckDB.register_scalar_function(con, fun_error) # Register UDF

    df = DataFrame(a = 1:10)
    DuckDB.register_table(con, df, "test1")

    @test_throws Exception result = DuckDB.execute(con, "SELECT f_error(a) as result FROM test1") |> DataFrame
end

@testset "UDF Macro Missing Values" begin

    f_add = (a, b) -> a + b
    db = DuckDB.DB()
    con = DuckDB.connect(db)

    fun = DuckDB.@create_scalar_function f_add(a::Int, b::Int)::Int f_add
    DuckDB.register_scalar_function(con, fun)

    df = DataFrame(a = [1, missing, 3], b = [missing, 2, 3])
    DuckDB.register_table(con, df, "test1")

    result = DuckDB.execute(con, "SELECT f_add(a, b) as result FROM test1") |> DataFrame
    @test isequal(result.result, [missing, missing, 6])
end

@testset "UDF Macro Benchmark" begin
    # Check whether the generated UDF is comparable to a pure Julia broadcast or
    # a native DuckDB expression.
    #
    # Currently UDFs take about as much time as the Julia/DuckDB expressions:
    # - The evaluation of the wrapper takes around 20% of the execution time
    # - Slow calls are setindex! and getindex
    # - table_scan_func is the slowest call

    db = DuckDB.DB()
    con = DuckDB.connect(db)
    fun_int = DuckDB.@create_scalar_function mysum(a::Int, b::Int)::Int
    fun_float = DuckDB.@create_scalar_function mysum_f(a::Float64, b::Float64)::Float64 mysum

    DuckDB.register_scalar_function(con, fun_int) # Register UDF
    DuckDB.register_scalar_function(con, fun_float) # Register UDF

    N = 10_000_000
    df = DataFrame(a = 1:N, b = 1:N, c = rand(N), d = rand(N))

    DuckDB.register_table(con, df, "test1")

    # Precompile functions and warm up both UDFs once before timing
    precompile(mysum, (Int, Int))
    precompile(mysum, (Float64, Float64))
    DuckDB.execute(con, "SELECT mysum(a, b) as result FROM test1")
    DuckDB.execute(con, "SELECT mysum_f(c, d) as result FROM test1")

    # INTEGER Benchmark
    t1 = @elapsed result_exp = df.a .+ df.b
    t2 = @elapsed result = DuckDB.execute(con, "SELECT mysum(a, b) as result FROM test1")
    t3 = @elapsed result2 = DuckDB.execute(con, "SELECT a + b as result FROM test1")
    @test DataFrame(result).result == result_exp
    # Example output from a previous run:
    # Benchmark Int: Julia Expression: 0.092947083, UDF: 0.078665125, DDB: 0.065306042
    @info "Benchmark Int: Julia Expression: $t1, UDF: $t2, DDB: $t3"

    # FLOAT Benchmark
    t1 = @elapsed result_exp = df.c .+ df.d
    t2 = @elapsed result = DuckDB.execute(con, "SELECT mysum_f(c, d) as result FROM test1")
    t3 = @elapsed result2 = DuckDB.execute(con, "SELECT c + d as result FROM test1")
    @test DataFrame(result).result ≈ result_exp atol = 1e-6
    # Example output from a previous run:
    # Benchmark Float: Julia Expression: 0.090409625, UDF: 0.080781, DDB: 0.054156167
    @info "Benchmark Float: Julia Expression: $t1, UDF: $t2, DDB: $t3"
end