# test_scalar_udf.jl
# Define a simple scalar UDF that doubles the input value
function my_double_function(
    info::DuckDB.duckdb_function_info,
    input::DuckDB.duckdb_data_chunk,
    output::DuckDB.duckdb_vector
)
    # Convert the input data chunk to a DataChunk object
    input_chunk = DuckDB.DataChunk(input, false)
    n = DuckDB.get_size(input_chunk)
    # Get the input vector (assuming one input parameter)
    input_vector = DuckDB.get_vector(input_chunk, 1)
    input_array = DuckDB.get_array(input_vector, Int64, n)
    # Get the output vector
    output_array = DuckDB.get_array(DuckDB.Vec(output), Int64, n)
    # Perform the operation: double each input value
    for i in 1:n
        output_array[i] = input_array[i] * 2
    end
end
# Define a scalar UDF that returns NULL for odd numbers and the number itself for even numbers
function my_null_function(
    info::DuckDB.duckdb_function_info,
    input::DuckDB.duckdb_data_chunk,
    output::DuckDB.duckdb_vector
)
    # Convert the input data chunk to a DataChunk object
    input_chunk = DuckDB.DataChunk(input, false)
    n = DuckDB.get_size(input_chunk)
    # Get the input vector
    input_vector = DuckDB.get_vector(input_chunk, 1)
    input_array = DuckDB.get_array(input_vector, Int64, n)
    validity_input = DuckDB.get_validity(input_vector)
    # Get the output vector
    output_vector = DuckDB.Vec(output)
    output_array = DuckDB.get_array(output_vector, Int64, n)
    validity_output = DuckDB.get_validity(output_vector)
    # Perform the operation
    for i in 1:n
        if DuckDB.isvalid(validity_input, i)
            if input_array[i] % 2 == 0
                output_array[i] = input_array[i]
                # Validity is true by default, no need to set it
            else
                # Set the output as NULL
                DuckDB.setinvalid(validity_output, i)
            end
        else
            # Input is NULL, set the output as NULL
            DuckDB.setinvalid(validity_output, i)
        end
    end
end
# Define a scalar UDF that always throws an error
function my_error_function(
    info::DuckDB.duckdb_function_info,
    input::DuckDB.duckdb_data_chunk,
    output::DuckDB.duckdb_vector
)
    throw(ErrorException("Runtime error in scalar function"))
end
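# Scalar UDF over a VARCHAR column: counts the occurrences of the character 'a'
# in each input string. The string column is materialized via the registered
# ScalarFunction object (passed through extra_info) and its logical parameter types.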
function my_string_function_count_a(
    info::DuckDB.duckdb_function_info,
    input::DuckDB.duckdb_data_chunk,
    output::DuckDB.duckdb_vector
)
    input_chunk = DuckDB.DataChunk(input, false)
    output_vec = DuckDB.Vec(output)
    n = DuckDB.get_size(input_chunk)
    chunks = [input_chunk]
    # Retrieve the registered ScalarFunction object to access its logical parameter types
    extra_info_ptr = DuckDB.duckdb_scalar_function_get_extra_info(info)
    extra_info::DuckDB.ScalarFunction = unsafe_pointer_to_objref(extra_info_ptr)
    conversion_data = DuckDB.ColumnConversionData(chunks, 1, extra_info.logical_parameters[1], nothing)
    a_data_converted = DuckDB.convert_column(conversion_data)
    output_data = DuckDB.get_array(output_vec, Int, n)
    for row in 1:n
        result = count(x -> x == 'a', a_data_converted[row])
        output_data[row] = result
    end
    return nothing
end
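# Scalar UDF over two VARCHAR columns: reverses the first argument, concatenates
# the second, and writes the result as a string element of the output vector.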
function my_string_function_reverse_concat(
    info::DuckDB.duckdb_function_info,
    input::DuckDB.duckdb_data_chunk,
    output::DuckDB.duckdb_vector
)
    input_chunk = DuckDB.DataChunk(input, false)
    output_vec = DuckDB.Vec(output)
    n = Int64(DuckDB.get_size(input_chunk))
    chunks = [input_chunk]
    extra_info_ptr = DuckDB.duckdb_scalar_function_get_extra_info(info)
    extra_info::DuckDB.ScalarFunction = unsafe_pointer_to_objref(extra_info_ptr)
    conversion_data_a = DuckDB.ColumnConversionData(chunks, 1, extra_info.logical_parameters[1], nothing)
    conversion_data_b = DuckDB.ColumnConversionData(chunks, 2, extra_info.logical_parameters[2], nothing)
    a_data_converted = DuckDB.convert_column(conversion_data_a)
    b_data_converted = DuckDB.convert_column(conversion_data_b)
    for row in 1:n
        result = string(reverse(a_data_converted[row]), b_data_converted[row])
        DuckDB.assign_string_element(output_vec, row, result)
    end
    return nothing
end
@testset "Test custom scalar functions" begin
# Connect to DuckDB
db = DuckDB.DB()
con = DuckDB.connect(db)
# Create the test table
DuckDB.query(con, "CREATE TABLE test_table AS SELECT i FROM range(10) t(i)")
# Define logical type BIGINT
type_bigint = DuckDB.duckdb_create_logical_type(DuckDB.DUCKDB_TYPE_BIGINT)
# Test 1: Double Function
# Create the scalar function
f_double = DuckDB.duckdb_create_scalar_function()
DuckDB.duckdb_scalar_function_set_name(f_double, "double_value")
# Set parameter types
DuckDB.duckdb_scalar_function_add_parameter(f_double, type_bigint)
# Set return type
DuckDB.duckdb_scalar_function_set_return_type(f_double, type_bigint)
# Set the function
CMyDoubleFunction = @cfunction(
my_double_function,
Cvoid,
(DuckDB.duckdb_function_info, DuckDB.duckdb_data_chunk, DuckDB.duckdb_vector)
)
DuckDB.duckdb_scalar_function_set_function(f_double, CMyDoubleFunction)
# Register the function
res = DuckDB.duckdb_register_scalar_function(con.handle, f_double)
@test res == DuckDB.DuckDBSuccess
# Execute the function in a query
results = DuckDB.query(con, "SELECT i, double_value(i) as doubled FROM test_table")
df = DataFrame(results)
@test names(df) == ["i", "doubled"]
@test size(df, 1) == 10
@test df.doubled == df.i .* 2
    # Test 2: Null Function
    # Create the scalar function
    f_null = DuckDB.duckdb_create_scalar_function()
    DuckDB.duckdb_scalar_function_set_name(f_null, "null_if_odd")
    # Set parameter types
    DuckDB.duckdb_scalar_function_add_parameter(f_null, type_bigint)
    # Set return type
    DuckDB.duckdb_scalar_function_set_return_type(f_null, type_bigint)
    # Set the function
    CMyNullFunction = @cfunction(
        my_null_function,
        Cvoid,
        (DuckDB.duckdb_function_info, DuckDB.duckdb_data_chunk, DuckDB.duckdb_vector)
    )
    DuckDB.duckdb_scalar_function_set_function(f_null, CMyNullFunction)
    # Register the function
    res_null = DuckDB.duckdb_register_scalar_function(con.handle, f_null)
    @test res_null == DuckDB.DuckDBSuccess
    # Execute the function in a query
    results_null = DuckDB.query(con, "SELECT i, null_if_odd(i) as value_or_null FROM test_table")
    df_null = DataFrame(results_null)
    @test names(df_null) == ["i", "value_or_null"]
    @test size(df_null, 1) == 10
    expected_values = Vector{Union{Missing, Int64}}(undef, 10)
    for idx in 1:10
        i = idx - 1 # Since i ranges from 0 to 9
        if i % 2 == 0
            expected_values[idx] = i
        else
            expected_values[idx] = missing
        end
    end
    @test all(df_null.value_or_null .=== expected_values)
    # Test 3: Error Function
    # Create the scalar function
    f_error = DuckDB.duckdb_create_scalar_function()
    DuckDB.duckdb_scalar_function_set_name(f_error, "error_function")
    # Set parameter types
    DuckDB.duckdb_scalar_function_add_parameter(f_error, type_bigint)
    # Set return type
    DuckDB.duckdb_scalar_function_set_return_type(f_error, type_bigint)
    # Set the function
    CMyErrorFunction = @cfunction(
        my_error_function,
        Cvoid,
        (DuckDB.duckdb_function_info, DuckDB.duckdb_data_chunk, DuckDB.duckdb_vector)
    )
    DuckDB.duckdb_scalar_function_set_function(f_error, CMyErrorFunction)
    # Register the function
    res_error = DuckDB.duckdb_register_scalar_function(con.handle, f_error)
    @test res_error == DuckDB.DuckDBSuccess
    # The runtime error thrown inside the UDF surfaces as an ErrorException
    @test_throws ErrorException DuckDB.query(con, "SELECT error_function(i) FROM test_table")

    # Clean up the logical type
    DuckDB.duckdb_destroy_logical_type(type_bigint)
    # Disconnect and close
    DuckDB.disconnect(con)
    DuckDB.close(db)
end
mysum(a, b) = a + b # Dummy function
my_reverse(s) = string(reverse(s))
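# The testsets below exercise the higher-level macro API: @create_scalar_function
# parses a signature such as `mysum(a::Int, b::Int)::Int`, generates the argument
# conversion code and the callback wrapper around the given Julia function, and
# register_scalar_function attaches the result to a connection so SQL can call it.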
@testset "UDF_Macro" begin
# Parse Expression
expr = :(mysum(a::Int, b::String)::Int)
func_name, func_params, return_value = DuckDB._udf_parse_function_expr(expr)
@test func_name == :mysum
@test func_params == [(:a, :Int), (:b, :String)]
@test return_value == :Int
# Build expressions
var_names, expressions =
DuckDB._udf_generate_conversion_expressions(func_params, :log_types, :convert, :param, :chunk)
@test var_names == [:param_1, :param_2]
@test expressions[1] == :(param_1 = convert(Int, log_types[1], chunk, 1))
@test expressions[2] == :(param_2 = convert(String, log_types[2], chunk, 2))
# Generate UDF
db = DuckDB.DB()
con = DuckDB.connect(db)
fun = DuckDB.@create_scalar_function mysum(a::Int, b::Int)::Int
#ptr = @cfunction(fun.wrapper, Cvoid, (Ptr{Cvoid},Ptr{Cvoid},Ptr{Cvoid}))
#ptr = pointer_from_objref(mysum_udf.wrapper)
#DuckDB.duckdb_scalar_function_set_function(mysum_udf.handle, ptr)
DuckDB.register_scalar_function(con, fun) # Register UDF
@test_throws ArgumentError DuckDB.register_scalar_function(con, fun) # Register UDF twice
DuckDB.execute(con, "CREATE TABLE test1 (a INT, b INT);")
DuckDB.execute(con, "INSERT INTO test1 VALUES ('1', '2'), ('3','4'), ('5', '6')")
result = DuckDB.execute(con, "SELECT mysum(a, b) as result FROM test1") |> DataFrame
@test result.result == [3, 7, 11]
end
@testset "UDF Macro Various Types" begin
import Dates
db = DuckDB.DB()
con = DuckDB.connect(db)
my_reverse_inner = (s) -> ("Inner:" * string(reverse(s)))
fun_is_weekend = (d) -> Dates.dayofweek(d) in (6, 7)
date_2020 = (x) -> Dates.Date(2020, 1, 1) + Dates.Day(x) # Dummy function
my_and(a, b) = a && b
my_int_add(a, b) = a + b
my_mixed_add(a::Int, b::Float64) = a + b
df_numbers =
DataFrame(a = rand(1:100, 30), b = rand(1:100, 30), c = rand(30), d = rand(Bool, 30), e = rand(Bool, 30))
df_strings = DataFrame(a = ["hello", "world", "julia", "duckdb", "🦆DB"])
t = Date(2020, 1, 1):Day(1):Date(2020, 12, 31)
df_dates = DataFrame(t = t, k = 1:length(t), is_weekend = fun_is_weekend.(t))
DuckDB.register_table(con, df_strings, "test_strings")
DuckDB.register_table(con, df_dates, "test_dates")
DuckDB.register_table(con, df_numbers, "test_numbers")
# Register UDFs
fun_string = DuckDB.@create_scalar_function my_reverse(s::String)::String (s) -> my_reverse_inner(s)
DuckDB.register_scalar_function(con, fun_string) # Register UDF
fun_date = DuckDB.@create_scalar_function is_weekend(d::Date)::Bool fun_is_weekend
fun_date2 = DuckDB.@create_scalar_function date_2020(x::Int)::Date date_2020
DuckDB.register_scalar_function(con, fun_date) # Register UDF
DuckDB.register_scalar_function(con, fun_date2) # Register UDF
fun_and = DuckDB.@create_scalar_function my_and(a::Bool, b::Bool)::Bool my_and
fun_int_add = DuckDB.@create_scalar_function my_int_add(a::Int, b::Int)::Int my_int_add
fun_mixed_add = DuckDB.@create_scalar_function my_mixed_add(a::Int, b::Float64)::Float64 my_mixed_add
DuckDB.register_scalar_function(con, fun_and)
DuckDB.register_scalar_function(con, fun_int_add)
DuckDB.register_scalar_function(con, fun_mixed_add)
result1 = DuckDB.execute(con, "SELECT my_reverse(a) as result FROM test_strings") |> DataFrame
@test result1.result == my_reverse_inner.(df_strings.a)
result2_1 = DuckDB.execute(con, "SELECT is_weekend(t) as result FROM test_dates") |> DataFrame
@test result2_1.result == fun_is_weekend.(df_dates.t)
result2_2 = DuckDB.execute(con, "SELECT date_2020(k) as result FROM test_dates") |> DataFrame
@test result2_2.result == date_2020.(df_dates.k)
result3 = DuckDB.execute(con, "SELECT my_and(d, e) as result FROM test_numbers") |> DataFrame
@test result3.result == my_and.(df_numbers.d, df_numbers.e)
result4 = DuckDB.execute(con, "SELECT my_int_add(a, b) as result FROM test_numbers") |> DataFrame
@test result4.result == my_int_add.(df_numbers.a, df_numbers.b)
result5 = DuckDB.execute(con, "SELECT my_mixed_add(a, c) as result FROM test_numbers") |> DataFrame
@test result5.result == my_mixed_add.(df_numbers.a, df_numbers.c)
end
@testset "UDF Macro Exception" begin
f_error = function (a)
if iseven(a)
throw(ArgumentError("Even number"))
else
return a + 1
end
end
db = DuckDB.DB()
con = DuckDB.connect(db)
fun_error = DuckDB.@create_scalar_function f_error(a::Int)::Int f_error
DuckDB.register_scalar_function(con, fun_error) # Register UDF
df = DataFrame(a = 1:10)
DuckDB.register_table(con, df, "test1")
@test_throws Exception result = DuckDB.execute(con, "SELECT f_error(a) as result FROM test1") |> DataFrame
end
@testset "UDF Macro Missing Values" begin
f_add = (a, b) -> a + b
db = DuckDB.DB()
con = DuckDB.connect(db)
fun = DuckDB.@create_scalar_function f_add(a::Int, b::Int)::Int f_add
DuckDB.register_scalar_function(con, fun)
df = DataFrame(a = [1, missing, 3], b = [missing, 2, 3])
DuckDB.register_table(con, df, "test1")
result = DuckDB.execute(con, "SELECT f_add(a, b) as result FROM test1") |> DataFrame
@test isequal(result.result, [missing, missing, 6])
end
@testset "UDF Macro Benchmark" begin
# Check if the generated UDF is comparable to pure Julia or DuckDB expressions
#
# Currently UDFs takes about as much time as Julia/DuckDB expressions
# - The evaluation of the wrapper takes around 20% of the execution time
# - slow calls are setindex! and getindex
# - table_scan_func is the slowest call
db = DuckDB.DB()
con = DuckDB.connect(db)
fun_int = DuckDB.@create_scalar_function mysum(a::Int, b::Int)::Int
fun_float = DuckDB.@create_scalar_function mysum_f(a::Float64, b::Float64)::Float64 mysum
DuckDB.register_scalar_function(con, fun_int) # Register UDF
DuckDB.register_scalar_function(con, fun_float) # Register UDF
N = 10_000_000
df = DataFrame(a = 1:N, b = 1:N, c = rand(N), d = rand(N))
DuckDB.register_table(con, df, "test1")
# Precompile functions
precompile(mysum, (Int, Int))
precompile(mysum, (Float64, Float64))
DuckDB.execute(con, "SELECT mysum(a, b) as result FROM test1")
DuckDB.execute(con, "SELECT mysum_f(c, d) as result FROM test1")
# INTEGER Benchmark
t1 = @elapsed result_exp = df.a .+ df.b
t2 = @elapsed result = DuckDB.execute(con, "SELECT mysum(a, b) as result FROM test1")
t3 = @elapsed result2 = DuckDB.execute(con, "SELECT a + b as result FROM test1")
@test DataFrame(result).result == result_exp
# Prints:
# Benchmark Int: Julia Expression: 0.092947083, UDF: 0.078665125, DDB: 0.065306042
@info "Benchmark Int: Julia Expression: $t1, UDF: $t2, DDB: $t3"
# FLOAT Benchmark
t1 = @elapsed result_exp = df.c .+ df.d
t2 = @elapsed result = DuckDB.execute(con, "SELECT mysum_f(c, d) as result FROM test1")
t3 = @elapsed result2 = DuckDB.execute(con, "SELECT c + d as result FROM test1")
@test DataFrame(result).result result_exp atol = 1e-6
# Prints:
# Benchmark Float: Julia Expression: 0.090409625, UDF: 0.080781, DDB: 0.054156167
@info "Benchmark Float: Julia Expression: $t1, UDF: $t2, DDB: $t3"
end