should be it
This commit is contained in:
131
external/duckdb/extension/delta/CMakeLists.txt
vendored
Normal file
131
external/duckdb/extension/delta/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,131 @@
|
||||
cmake_minimum_required(VERSION 2.8.12)
include(ExternalProject)

# Core config: every other name in this file is derived from TARGET_NAME.
set(TARGET_NAME delta)
set(EXTENSION_NAME ${TARGET_NAME}_extension)
set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension)

project(${TARGET_NAME})

# Headers shared by all extension sources.
include_directories(src/include)

# Sources compiled into both the static and the loadable extension.
set(EXTENSION_SOURCES
    src/delta_extension.cpp
    src/delta_functions.cpp
    src/delta_utils.cpp
    src/functions/delta_scan.cpp)
|
||||
|
||||
# Platform libraries that are linked next to the delta-kernel-rs static
# library. TODO: figure out if we really need this?
if(APPLE)
  # The framework is named "CoreFoundation"; the previous spelling
  # ("Corefoundation") only resolved on case-insensitive file systems.
  set(PLATFORM_LIBS
      m
      c
      System
      resolv
      "-framework CoreFoundation -framework SystemConfiguration -framework Security"
  )
elseif(UNIX)
  set(PLATFORM_LIBS m c resolv)
elseif(WIN32)
  set(PLATFORM_LIBS ws2_32 userenv advapi32)
else()
  # PLATFORM_LIBS stays unset here, so nothing extra is linked.
  message(STATUS "UNKNOWN OS")
endif()
|
||||
|
||||
# Setup delta-kernel-rs dependency
set(KERNEL_NAME delta_kernel)

# ExternalProject working directories are rooted at <build dir>/rust
set_directory_properties(PROPERTIES EP_PREFIX ${CMAKE_BINARY_DIR}/rust)

# Propagate arch to rust build for CI. The target triple stays empty when
# OS_NAME is neither "linux" nor "osx".
set(RUST_PLATFORM_TARGET "")
if("${OS_NAME}" STREQUAL "linux")
  # x86_64 unless OS_ARCH explicitly says arm64
  set(RUST_PLATFORM_TARGET "x86_64-unknown-linux-gnu")
  if("${OS_ARCH}" STREQUAL "arm64")
    set(RUST_PLATFORM_TARGET "aarch64-unknown-linux-gnu")
  endif()
elseif("${OS_NAME}" STREQUAL "osx")
  # TODO: clean up upstream; we are not correctly setting OS_ARCH for cross
  # compile. OSX_BUILD_ARCH wins when it names a known arch; otherwise fall
  # back to OS_ARCH, and finally to x86_64.
  set(RUST_PLATFORM_TARGET "x86_64-apple-darwin")
  if("${OSX_BUILD_ARCH}" STREQUAL "arm64")
    set(RUST_PLATFORM_TARGET "aarch64-apple-darwin")
  elseif("${OSX_BUILD_ARCH}" STREQUAL "x86_64")
    # explicit x86_64 request: keep the default set above
  elseif("${OS_ARCH}" STREQUAL "arm64")
    set(RUST_PLATFORM_TARGET "aarch64-apple-darwin")
  endif()
endif()
|
||||
|
||||
# Fetch and build delta-kernel-rs as an external project. There is no
# configure, update or install step; the build step runs cargo three times
# inside the checked-out source tree.
ExternalProject_Add(
  ${KERNEL_NAME}
  GIT_REPOSITORY "https://github.com/delta-incubator/delta-kernel-rs"
  GIT_TAG 08f0764a00e89f42136fd478823d28278adc7ee8
  CONFIGURE_COMMAND ""
  UPDATE_COMMAND ""
  INSTALL_COMMAND ""
  BUILD_IN_SOURCE 1
  LOG_BUILD ON
  # 1) debug build of the FFI crate
  BUILD_COMMAND
    cargo build --package delta_kernel_ffi --workspace --all-features
    --target=${RUST_PLATFORM_TARGET}
  # 2) release build of the FFI crate
  COMMAND
    cargo build --package delta_kernel_ffi --workspace --all-features --release
    --target=${RUST_PLATFORM_TARGET}
  # 3) build the DATs (acceptance crate)
  COMMAND
    cargo build
    --manifest-path=${CMAKE_BINARY_DIR}/rust/src/delta_kernel/acceptance/Cargo.toml
  # Files produced by the build step: both static libraries and the generated
  # C/C++ FFI headers.
  BUILD_BYPRODUCTS
    "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/${RUST_PLATFORM_TARGET}/debug/libdelta_kernel_ffi.a"
    "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/${RUST_PLATFORM_TARGET}/release/libdelta_kernel_ffi.a"
    "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/ffi-headers/delta_kernel_ffi.h"
    "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/ffi-headers/delta_kernel_ffi.hpp"
)
|
||||
|
||||
# Create the static and the loadable extension targets from the same sources.
build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES})
build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES})

# Make the FFI headers generated by the delta-kernel-rs build visible to the
# extension sources. (This directory used to be added twice; once is enough.)
include_directories(
  ${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/ffi-headers)
|
||||
|
||||
# Pin the minimum macOS deployment version; this hides annoying linker
# warnings. FORCE overwrites any value already present in the cache.
set(CMAKE_OSX_DEPLOYMENT_TARGET 13.3
    CACHE STRING "Minimum OS X deployment version" FORCE)

# Add the default client: DEFINE_DEFAULT_ENGINE is defined for all sources in
# this directory.
add_compile_definitions(DEFINE_DEFAULT_ENGINE)
|
||||
|
||||
# Link delta-kernel-rs into both the static and the loadable extension. Debug
# configurations link the debug build of the FFI library, all other
# configurations link the release build. add_dependencies makes sure the
# ExternalProject has produced the libraries before either extension links.
foreach(delta_link_target ${EXTENSION_NAME} ${LOADABLE_EXTENSION_NAME})
  target_link_libraries(
    ${delta_link_target}
    debug
    "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/${RUST_PLATFORM_TARGET}/debug/libdelta_kernel_ffi.a"
    optimized
    "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/${RUST_PLATFORM_TARGET}/release/libdelta_kernel_ffi.a"
    ${PLATFORM_LIBS})
  # Use the variable the ExternalProject was declared with rather than
  # repeating its literal name.
  add_dependencies(${delta_link_target} ${KERNEL_NAME})
endforeach()
|
||||
|
||||
# Install the static extension library and add it to DuckDB's export set.
install(
  TARGETS ${EXTENSION_NAME}
  EXPORT "${DUCKDB_EXPORT_SET}"
  ARCHIVE DESTINATION "${INSTALL_LIB_DIR}"
  LIBRARY DESTINATION "${INSTALL_LIB_DIR}")
|
||||
71
external/duckdb/extension/delta/README.md
vendored
Normal file
71
external/duckdb/extension/delta/README.md
vendored
Normal file
@@ -0,0 +1,71 @@
|
||||
# DuckDB Delta Extension
|
||||
This is the experimental DuckDB extension for [Delta](https://delta.io/). It is built using the (also experimental)
|
||||
[Delta Kernel](https://github.com/delta-incubator/delta-kernel-rs). The extension (currently) offers **read** support for delta
|
||||
tables, both local and remote.
|
||||
|
||||
# Supported platforms
|
||||
The supported platforms are:
|
||||
- `linux_amd64`
|
||||
- `osx_amd64` and `osx_arm64`
|
||||
|
||||
Support for the [other](https://duckdb.org/docs/stable/extensions/extension_distribution#platforms) DuckDB platforms is
|
||||
a work in progress.
|
||||
|
||||
# How to use
|
||||
**NOTE: this extension requires DuckDB v0.10.3 or higher**
|
||||
|
||||
This extension is distributed as a binary extension. To use it, simply use one of its functions from DuckDB and the extension will be autoloaded:
|
||||
```SQL
|
||||
FROM delta_scan('s3://some/delta/table');
|
||||
```
|
||||
|
||||
Note that using DuckDB [Secrets](https://duckdb.org/docs/stable/configuration/secrets_manager) for S3 authentication is supported:
|
||||
|
||||
```SQL
|
||||
CREATE SECRET (TYPE S3, provider credential_chain);
|
||||
FROM delta_scan('s3://some/delta/table/with/auth');
|
||||
```
|
||||
|
||||
To scan a local table, use the full path prefixed with `file://`:
|
||||
```SQL
|
||||
FROM delta_scan('file:///some/path/on/local/machine');
|
||||
```
|
||||
|
||||
# Features
|
||||
While still experimental, many (scanning) features/optimizations are already supported in this extension as it reuses most of DuckDB's
|
||||
regular parquet scanning logic:
|
||||
- multithreaded scans and parquet metadata reading
|
||||
- data skipping/filter pushdown
|
||||
- skipping row-groups in file (based on parquet metadata)
|
||||
- skipping complete files (based on delta partition info)
|
||||
- projection pushdown
|
||||
- scanning tables with deletion vectors
|
||||
- all primitive types
|
||||
- structs
|
||||
- S3 support with secrets
|
||||
|
||||
More features coming soon!
|
||||
|
||||
# Building
|
||||
See the [Extension Template](https://github.com/duckdb/extension-template) for generic build instructions
|
||||
|
||||
# Running tests
|
||||
There are various tests available for the delta extension:
|
||||
1. Delta Acceptance Test (DAT) based tests in `/test/sql/dat`
|
||||
2. delta-kernel-rs based tests in `/test/sql/delta_kernel_rs`
|
||||
3. Generated data based tests in `/test/sql/generated` (generated using [delta-rs](https://delta-io.github.io/delta-rs/), [PySpark](https://spark.apache.org/docs/latest/api/python/index.html), and DuckDB)
|
||||
|
||||
To run the first 2 sets of tests:
|
||||
```shell
|
||||
make test_debug
|
||||
```
|
||||
or in release mode
|
||||
```shell
|
||||
make test
|
||||
```
|
||||
|
||||
To also run the tests on generated data:
|
||||
```shell
|
||||
make generate-data
|
||||
GENERATED_DATA_AVAILABLE=1 make test
|
||||
```
|
||||
14
external/duckdb/extension/delta/extension_config.cmake
vendored
Normal file
14
external/duckdb/extension/delta/extension_config.cmake
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
# This file is included by DuckDB's build system. It specifies which extension to load
|
||||
|
||||
# Extension from this repo
|
||||
# The delta extension itself, loaded from this directory together with its
# tests.
duckdb_extension_load(delta SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} LOAD_TESTS)

# httpfs is built so the extension can be tested against s3/http.
duckdb_extension_load(httpfs)

# tpch and tpcds are built for testing/benchmarking.
duckdb_extension_load(tpch)
duckdb_extension_load(tpcds)
|
||||
32
external/duckdb/extension/delta/src/delta_extension.cpp
vendored
Normal file
32
external/duckdb/extension/delta/src/delta_extension.cpp
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
#include "delta_extension.hpp"
|
||||
#include "delta_functions.hpp"
|
||||
|
||||
#include "duckdb.hpp"
|
||||
#include "duckdb/common/exception.hpp"
|
||||
#include "duckdb/main/extension/extension_loader.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
static void LoadInternal(ExtensionLoader &loader) {
|
||||
// Load functions
|
||||
for (const auto &function : DeltaFunctions::GetTableFunctions(instance)) {
|
||||
loader.RegisterFunction(function);
|
||||
}
|
||||
}
|
||||
|
||||
void DeltaExtension::Load(ExtensionLoader &loader) {
|
||||
LoadInternal(loader);
|
||||
}
|
||||
|
||||
std::string DeltaExtension::Name() {
|
||||
return "delta";
|
||||
}
|
||||
|
||||
} // namespace duckdb
|
||||
|
||||
extern "C" {
|
||||
|
||||
DUCKDB_CPP_EXTENSION_ENTRY(delta, loader) {
|
||||
duckdb::LoadInternal(loader);
|
||||
}
|
||||
}
|
||||
17
external/duckdb/extension/delta/src/delta_functions.cpp
vendored
Normal file
17
external/duckdb/extension/delta/src/delta_functions.cpp
vendored
Normal file
@@ -0,0 +1,17 @@
|
||||
#include "delta_functions.hpp"
|
||||
|
||||
#include "duckdb.hpp"
|
||||
#include "duckdb/main/extension_util.hpp"
|
||||
#include <duckdb/parser/parsed_data/create_scalar_function_info.hpp>
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
vector<TableFunctionSet> DeltaFunctions::GetTableFunctions(DatabaseInstance &instance) {
|
||||
vector<TableFunctionSet> functions;
|
||||
|
||||
functions.push_back(GetDeltaScanFunction(instance));
|
||||
|
||||
return functions;
|
||||
}
|
||||
|
||||
}; // namespace duckdb
|
||||
322
external/duckdb/extension/delta/src/delta_utils.cpp
vendored
Normal file
322
external/duckdb/extension/delta/src/delta_utils.cpp
vendored
Normal file
@@ -0,0 +1,322 @@
|
||||
#include "delta_utils.hpp"
|
||||
|
||||
#include "duckdb.hpp"
|
||||
#include "duckdb/main/extension_util.hpp"
|
||||
#include <duckdb/parser/parsed_data/create_scalar_function_info.hpp>
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
// Visits the schema of `snapshot` through the kernel's schema visitor interface and returns the
// resulting top-level field list (name/type pairs).
//
// A temporary SchemaVisitor collects the field lists while the kernel invokes the callbacks
// registered below; the id returned by visit_schema identifies the top-level list.
unique_ptr<SchemaVisitor::FieldList> SchemaVisitor::VisitSnapshotSchema(ffi::SharedSnapshot *snapshot) {
	SchemaVisitor state;
	ffi::EngineSchemaVisitor visitor;

	visitor.data = &state;

	// The callbacks take a SchemaVisitor* as first argument, while the FFI struct expects void*
	visitor.make_field_list = (uintptr_t(*)(void *, uintptr_t)) & MakeFieldList;
	visitor.visit_struct = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, uintptr_t)) & VisitStruct;
	visitor.visit_array = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, bool, uintptr_t)) & VisitArray;
	visitor.visit_map = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, bool, uintptr_t)) & VisitMap;
	visitor.visit_decimal = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, uint8_t, uint8_t)) & VisitDecimal;

	// Primitive types map one-to-one onto a DuckDB logical type
	visitor.visit_string = VisitSimpleType<LogicalType::VARCHAR>();
	visitor.visit_long = VisitSimpleType<LogicalType::BIGINT>();
	visitor.visit_integer = VisitSimpleType<LogicalType::INTEGER>();
	visitor.visit_short = VisitSimpleType<LogicalType::SMALLINT>();
	visitor.visit_byte = VisitSimpleType<LogicalType::TINYINT>();
	visitor.visit_float = VisitSimpleType<LogicalType::FLOAT>();
	visitor.visit_double = VisitSimpleType<LogicalType::DOUBLE>();
	visitor.visit_boolean = VisitSimpleType<LogicalType::BOOLEAN>();
	// NOTE(review): binary is surfaced as VARCHAR; BLOB is presumably the closer match - confirm
	visitor.visit_binary = VisitSimpleType<LogicalType::VARCHAR>();
	visitor.visit_date = VisitSimpleType<LogicalType::DATE>();
	// Delta's `timestamp` is an instant (UTC adjusted) while `timestamp_ntz` carries no time
	// zone; these two mappings used to be swapped.
	visitor.visit_timestamp = VisitSimpleType<LogicalType::TIMESTAMP_TZ>();
	visitor.visit_timestamp_ntz = VisitSimpleType<LogicalType::TIMESTAMP>();

	uintptr_t result = visit_schema(snapshot, &visitor);
	return state.TakeFieldList(result);
}
|
||||
|
||||
void SchemaVisitor::VisitDecimal(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name,
|
||||
uint8_t precision, uint8_t scale) {
|
||||
state->AppendToList(sibling_list_id, name, LogicalType::DECIMAL(precision, scale));
|
||||
}
|
||||
|
||||
uintptr_t SchemaVisitor::MakeFieldList(SchemaVisitor *state, uintptr_t capacity_hint) {
|
||||
return state->MakeFieldListImpl(capacity_hint);
|
||||
}
|
||||
|
||||
void SchemaVisitor::VisitStruct(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name,
|
||||
uintptr_t child_list_id) {
|
||||
auto children = state->TakeFieldList(child_list_id);
|
||||
state->AppendToList(sibling_list_id, name, LogicalType::STRUCT(std::move(*children)));
|
||||
}
|
||||
|
||||
void SchemaVisitor::VisitArray(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name,
|
||||
bool contains_null, uintptr_t child_list_id) {
|
||||
auto children = state->TakeFieldList(child_list_id);
|
||||
|
||||
D_ASSERT(children->size() == 1);
|
||||
state->AppendToList(sibling_list_id, name, LogicalType::LIST(children->front().second));
|
||||
}
|
||||
|
||||
void SchemaVisitor::VisitMap(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name,
|
||||
bool contains_null, uintptr_t child_list_id) {
|
||||
auto children = state->TakeFieldList(child_list_id);
|
||||
|
||||
D_ASSERT(children->size() == 2);
|
||||
state->AppendToList(sibling_list_id, name, LogicalType::MAP(LogicalType::STRUCT(std::move(*children))));
|
||||
}
|
||||
|
||||
uintptr_t SchemaVisitor::MakeFieldListImpl(uintptr_t capacity_hint) {
|
||||
uintptr_t id = next_id++;
|
||||
auto list = make_uniq<FieldList>();
|
||||
if (capacity_hint > 0) {
|
||||
list->reserve(capacity_hint);
|
||||
}
|
||||
inflight_lists.emplace(id, std::move(list));
|
||||
return id;
|
||||
}
|
||||
|
||||
void SchemaVisitor::AppendToList(uintptr_t id, ffi::KernelStringSlice name, LogicalType &&child) {
|
||||
auto it = inflight_lists.find(id);
|
||||
if (it == inflight_lists.end()) {
|
||||
// TODO... some error...
|
||||
throw InternalException("WEIRD SHIT");
|
||||
} else {
|
||||
it->second->emplace_back(std::make_pair(string(name.ptr, name.len), std::move(child)));
|
||||
}
|
||||
}
|
||||
|
||||
// Removes the in-flight field list identified by `id` and transfers its ownership to the caller.
//
// Throws InternalException when no in-flight list with that id exists (unknown id, or the list
// was already taken).
unique_ptr<SchemaVisitor::FieldList> SchemaVisitor::TakeFieldList(uintptr_t id) {
	auto it = inflight_lists.find(id);
	if (it == inflight_lists.end()) {
		throw InternalException("SchemaVisitor::TakeFieldList: no in-flight field list with id " +
		                        std::to_string(id));
	}
	auto rval = std::move(it->second);
	inflight_lists.erase(it);
	return rval;
}
|
||||
|
||||
// Heap-allocates a DuckDBEngineError carrying `etype` and a copy of `msg`. The returned pointer
// is owned by whoever receives it; DuckDBEngineError::Throw releases it via `delete this`.
ffi::EngineError *DuckDBEngineError::AllocateError(ffi::KernelError etype, ffi::KernelStringSlice msg) {
	auto *engine_error = new DuckDBEngineError;
	engine_error->etype = etype;
	// Copy the message: the slice is not owned by us
	engine_error->error_message = string(msg.ptr, msg.len);
	return engine_error;
}
|
||||
|
||||
// Translates a kernel error enum value into its name. Values beyond the table yield an
// "EnumOutOfRange" description that includes the numeric value.
string DuckDBEngineError::KernelErrorEnumToString(ffi::KernelError err) {
	// Indexed by the numeric value of ffi::KernelError; order must match the kernel's enum
	const char *ERROR_NAMES[] = {
	    "UnknownError",
	    "FFIError",
	    "ArrowError",
	    "EngineDataTypeError",
	    "ExtractError",
	    "GenericError",
	    "IOErrorError",
	    "ParquetError",
	    "ObjectStoreError",
	    "ObjectStorePathError",
	    "Reqwest",
	    "FileNotFoundError",
	    "MissingColumnError",
	    "UnexpectedColumnTypeError",
	    "MissingDataError",
	    "MissingVersionError",
	    "DeletionVectorError",
	    "InvalidUrlError",
	    "MalformedJsonError",
	    "MissingMetadataError",
	    "MissingProtocolError",
	    "MissingMetadataAndProtocolError",
	    "ParseError",
	    "JoinFailureError",
	    "Utf8Error",
	    "ParseIntError",
	    "InvalidColumnMappingMode",
	    "InvalidTableLocation",
	    "InvalidDecimalError",
	};

	// Compile-time guard: the last table entry must line up with the last known enum value
	static_assert(sizeof(ERROR_NAMES) / sizeof(char *) - 1 == (int)ffi::KernelError::InvalidDecimalError,
	              "KernelErrorEnumStrings mismatched with kernel");

	const int numeric_value = (int)err;
	const bool in_range = numeric_value < sizeof(ERROR_NAMES) / sizeof(char *);
	if (!in_range) {
		return StringUtil::Format("EnumOutOfRange (enum val out of range: %d)", numeric_value);
	}
	return ERROR_NAMES[numeric_value];
}
|
||||
|
||||
// Consumes this error object and raises it as an IOException. `from_where` is included in the
// message to identify the failing call site.
void DuckDBEngineError::Throw(string from_where) {
	// The members become inaccessible once the object is deleted, so copy them out first
	auto kernel_error_type = etype;
	auto kernel_error_message = error_message;

	// This error was created through AllocateError; deleting it here consumes it
	delete this;

	throw IOException("Hit DeltaKernel FFI error (from: %s): Hit error: %u (%s) with message (%s)", from_where.c_str(),
	                  kernel_error_type, KernelErrorEnumToString(kernel_error_type), kernel_error_message);
}
|
||||
|
||||
ffi::KernelStringSlice KernelUtils::ToDeltaString(const string &str) {
|
||||
return {str.data(), str.size()};
|
||||
}
|
||||
|
||||
string KernelUtils::FromDeltaString(const struct ffi::KernelStringSlice slice) {
|
||||
return {slice.ptr, slice.len};
|
||||
}
|
||||
|
||||
vector<bool> KernelUtils::FromDeltaBoolSlice(const struct ffi::KernelBoolSlice slice) {
|
||||
vector<bool> result;
|
||||
result.assign(slice.ptr, slice.ptr + slice.len);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Sets up the EnginePredicate base so the kernel calls back into VisitPredicate with this
// object, and indexes the given table filters by column name. `filters` may be null, in which
// case no column filters are recorded.
PredicateVisitor::PredicateVisitor(const vector<string> &column_names, optional_ptr<TableFilterSet> filters)
    : EnginePredicate {.predicate = this,
                       .visitor = (uintptr_t(*)(void *, ffi::KernelExpressionVisitorState *)) & VisitPredicate} {
	if (!filters) {
		return;
	}
	for (auto &entry : filters->filters) {
		// entry.first is the column index, entry.second owns the filter; only a raw pointer is kept
		const auto &column_name = column_names[entry.first];
		column_filters[column_name] = entry.second.get();
	}
}
|
||||
|
||||
// Template wrapper function that implements get_next for EngineIteratorFromCallable.
|
||||
template <typename Callable>
|
||||
static auto GetNextFromCallable(Callable *callable) -> decltype(std::declval<Callable>()()) {
|
||||
return callable->operator()();
|
||||
}
|
||||
|
||||
// Wraps a callable object (e.g. C++11 lambda) as an EngineIterator.
|
||||
template <typename Callable>
|
||||
ffi::EngineIterator EngineIteratorFromCallable(Callable &callable) {
|
||||
auto *get_next = &GetNextFromCallable<Callable>;
|
||||
return {.data = &callable, .get_next = (const void *(*)(void *))get_next};
|
||||
};
|
||||
|
||||
// Helper function to prevent pushing down filters kernel cant handle
|
||||
// TODO: remove once kernel handles this properly?
|
||||
static bool CanHandleFilter(TableFilter *filter) {
|
||||
switch (filter->filter_type) {
|
||||
case TableFilterType::CONSTANT_COMPARISON:
|
||||
return true;
|
||||
case TableFilterType::CONJUNCTION_AND: {
|
||||
auto &conjunction = static_cast<const ConjunctionAndFilter &>(*filter);
|
||||
bool can_handle = true;
|
||||
for (const auto &child : conjunction.child_filters) {
|
||||
can_handle = can_handle && CanHandleFilter(child.get());
|
||||
}
|
||||
return can_handle;
|
||||
}
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Prunes the list of predicates to ones that we can handle
|
||||
static unordered_map<string, TableFilter *> PrunePredicates(unordered_map<string, TableFilter *> predicates) {
|
||||
unordered_map<string, TableFilter *> result;
|
||||
for (const auto &predicate : predicates) {
|
||||
if (CanHandleFilter(predicate.second)) {
|
||||
result[predicate.first] = predicate.second;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Kernel callback: converts the handleable column filters of `predicate` into kernel
// expressions and combines them with AND. Returns the id of the combined expression, or ~0 when
// building it raised an exception.
uintptr_t PredicateVisitor::VisitPredicate(PredicateVisitor *predicate, ffi::KernelExpressionVisitorState *state) {
	auto supported_filters = PrunePredicates(predicate->column_filters);

	auto current = supported_filters.begin();
	auto last = supported_filters.end();

	// Yields one kernel expression id per filter; 0 signals that there are no more filters
	auto next_expression = [predicate, state, &current, &last]() -> uintptr_t {
		if (current == last) {
			return 0;
		}
		auto &entry = *current++;
		return predicate->VisitFilter(entry.first, *entry.second, state);
	};
	auto expression_iterator = EngineIteratorFromCallable(next_expression);

	// TODO: this should be fixed upstream?
	try {
		return visit_expression_and(state, &expression_iterator);
	} catch (...) {
		return ~0;
	}
}
|
||||
|
||||
uintptr_t PredicateVisitor::VisitConstantFilter(const string &col_name, const ConstantFilter &filter,
|
||||
ffi::KernelExpressionVisitorState *state) {
|
||||
auto maybe_left =
|
||||
ffi::visit_expression_column(state, KernelUtils::ToDeltaString(col_name), DuckDBEngineError::AllocateError);
|
||||
uintptr_t left = KernelUtils::UnpackResult(maybe_left, "VisitConstantFilter failed to visit_expression_column");
|
||||
|
||||
uintptr_t right = ~0;
|
||||
auto &value = filter.constant;
|
||||
switch (value.type().id()) {
|
||||
case LogicalType::BIGINT:
|
||||
right = visit_expression_literal_long(state, BigIntValue::Get(value));
|
||||
break;
|
||||
|
||||
case LogicalType::VARCHAR: {
|
||||
// WARNING: C++ lifetime extension rules don't protect calls of the form foo(std::string(...).c_str())
|
||||
auto str = StringValue::Get(value);
|
||||
auto maybe_right = ffi::visit_expression_literal_string(state, KernelUtils::ToDeltaString(col_name),
|
||||
DuckDBEngineError::AllocateError);
|
||||
right = KernelUtils::UnpackResult(maybe_right, "VisitConstantFilter failed to visit_expression_literal_string");
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
break; // unsupported type
|
||||
}
|
||||
|
||||
// TODO support other comparison types?
|
||||
switch (filter.comparison_type) {
|
||||
case ExpressionType::COMPARE_LESSTHAN:
|
||||
return visit_expression_lt(state, left, right);
|
||||
case ExpressionType::COMPARE_LESSTHANOREQUALTO:
|
||||
return visit_expression_le(state, left, right);
|
||||
case ExpressionType::COMPARE_GREATERTHAN:
|
||||
return visit_expression_gt(state, left, right);
|
||||
case ExpressionType::COMPARE_GREATERTHANOREQUALTO:
|
||||
return visit_expression_ge(state, left, right);
|
||||
case ExpressionType::COMPARE_EQUAL:
|
||||
return visit_expression_eq(state, left, right);
|
||||
|
||||
default:
|
||||
std::cout << " Unsupported operation: " << (int)filter.comparison_type << std::endl;
|
||||
return ~0; // Unsupported operation
|
||||
}
|
||||
}
|
||||
|
||||
uintptr_t PredicateVisitor::VisitAndFilter(const string &col_name, const ConjunctionAndFilter &filter,
|
||||
ffi::KernelExpressionVisitorState *state) {
|
||||
auto it = filter.child_filters.begin();
|
||||
auto end = filter.child_filters.end();
|
||||
auto get_next = [this, col_name, state, &it, &end]() -> uintptr_t {
|
||||
if (it == end) {
|
||||
return 0;
|
||||
}
|
||||
auto &child_filter = *it++;
|
||||
return VisitFilter(col_name, *child_filter, state);
|
||||
};
|
||||
auto eit = EngineIteratorFromCallable(get_next);
|
||||
return visit_expression_and(state, &eit);
|
||||
}
|
||||
|
||||
uintptr_t PredicateVisitor::VisitFilter(const string &col_name, const TableFilter &filter,
|
||||
ffi::KernelExpressionVisitorState *state) {
|
||||
switch (filter.filter_type) {
|
||||
case TableFilterType::CONSTANT_COMPARISON:
|
||||
return VisitConstantFilter(col_name, static_cast<const ConstantFilter &>(filter), state);
|
||||
case TableFilterType::CONJUNCTION_AND:
|
||||
return VisitAndFilter(col_name, static_cast<const ConjunctionAndFilter &>(filter), state);
|
||||
default:
|
||||
throw NotImplementedException("Attempted to push down unimplemented filter type: '%s'",
|
||||
EnumUtil::ToString(filter.filter_type));
|
||||
}
|
||||
}
|
||||
|
||||
}; // namespace duckdb
|
||||
626
external/duckdb/extension/delta/src/functions/delta_scan.cpp
vendored
Normal file
626
external/duckdb/extension/delta/src/functions/delta_scan.cpp
vendored
Normal file
@@ -0,0 +1,626 @@
|
||||
#include "duckdb/function/table_function.hpp"
|
||||
|
||||
#include "delta_functions.hpp"
|
||||
#include "functions/delta_scan.hpp"
|
||||
#include "duckdb/optimizer/filter_combiner.hpp"
|
||||
#include "duckdb/planner/operator/logical_get.hpp"
|
||||
#include "duckdb/main/extension_util.hpp"
|
||||
#include "duckdb/catalog/catalog_entry/table_function_catalog_entry.hpp"
|
||||
#include "duckdb/common/local_file_system.hpp"
|
||||
#include "duckdb/common/types/data_chunk.hpp"
|
||||
#include "duckdb/parser/expression/constant_expression.hpp"
|
||||
#include "duckdb/parser/expression/function_expression.hpp"
|
||||
#include "duckdb/parser/parsed_expression.hpp"
|
||||
#include "duckdb/execution/expression_executor.hpp"
|
||||
#include "duckdb/planner/binder.hpp"
|
||||
#include "duckdb/main/secret/secret_manager.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <numeric>
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
// Allocation callback handed to the kernel: copies the slice into a heap-allocated string and
// returns it as an opaque pointer. The caller is responsible for deleting the string.
static void *allocate_string(const struct ffi::KernelStringSlice slice) {
	auto *copy = new string(slice.ptr, slice.len);
	return copy;
}
|
||||
|
||||
// Kernel callback invoked once per data file of the scan. `engine_context` is the DeltaSnapshot
// being populated. Records the file's path and creates its metadata entry (snapshot version,
// file number, optional selection vector from the deletion vector info, partition values).
static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size,
                           const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) {
	auto delta_snapshot = (DeltaSnapshot *)engine_context;

	// Join the snapshot path (without trailing slashes) and the path reported by the kernel
	auto file_path = delta_snapshot->GetPath();
	StringUtil::RTrim(file_path, "/");
	file_path += "/" + KernelUtils::FromDeltaString(path);

	// First we append the file to our resolved files, together with a fresh metadata entry
	delta_snapshot->resolved_files.push_back(DeltaSnapshot::ToDuckDBPath(file_path));
	delta_snapshot->metadata.emplace_back(make_uniq<DeltaFileMetaData>());
	D_ASSERT(delta_snapshot->resolved_files.size() == delta_snapshot->metadata.size());

	// Initialize the file metadata
	auto &file_metadata = *delta_snapshot->metadata.back();
	file_metadata.delta_snapshot_version = delta_snapshot->version;
	file_metadata.file_number = delta_snapshot->resolved_files.size() - 1;

	// Fetch the deletion vector; it is only stored when the kernel returned one
	auto selection_vector_res = ffi::selection_vector_from_dv(dv_info, delta_snapshot->extern_engine.get(),
	                                                          delta_snapshot->global_state.get());
	auto selection_vector = KernelUtils::UnpackResult(selection_vector_res,
	                                                  "selection_vector_from_dv for path " + delta_snapshot->GetPath());
	if (selection_vector.ptr) {
		file_metadata.selection_vector = selection_vector;
	}

	// Lookup all columns for potential hits in the constant map
	case_insensitive_map_t<string> partition_constants;
	for (const auto &column_name : delta_snapshot->names) {
		auto lookup_key = KernelUtils::ToDeltaString(column_name);
		auto *partition_value = (string *)ffi::get_from_map(partition_values, lookup_key, allocate_string);
		if (!partition_value) {
			continue;
		}
		// The value was allocated by allocate_string: copy it, then release it
		partition_constants[column_name] = *partition_value;
		delete partition_value;
	}
	file_metadata.partition_map = std::move(partition_constants);
}
|
||||
|
||||
// Kernel callback for one batch of scan data: hands the batch to the kernel's visit_scan_data,
// which invokes visit_callback with `engine_context`.
static void visit_data(void *engine_context, ffi::EngineData *engine_data,
                       const struct ffi::KernelBoolSlice selection_vec) {
	auto *per_file_callback = &visit_callback;
	ffi::visit_scan_data(engine_data, selection_vec, engine_context, per_file_callback);
}
|
||||
|
||||
// Creates the kernel engine builder for the table at `path`.
//
// For paths that do not start with "s3://" the default builder is returned unchanged. For S3
// paths the url is validated and, when a matching "s3" secret exists, its credentials are
// forwarded to the builder as options.
//
// Throws IOException when an S3 url has no '/' after the bucket name.
static ffi::EngineBuilder *CreateBuilder(ClientContext &context, const string &path) {
	const bool is_s3_path = StringUtil::StartsWith(path, "s3://");

	// Validate before creating the builder so that nothing is allocated for invalid urls.
	// Position 5 is the first character after "s3://".
	if (is_s3_path && path.find('/', 5) == string::npos) {
		throw IOException("Invalid s3 url passed to delta scan: %s", path);
	}

	auto interface_builder_res =
	    ffi::get_engine_builder(KernelUtils::ToDeltaString(path), DuckDBEngineError::AllocateError);
	ffi::EngineBuilder *builder =
	    KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + path);

	// For "regular" paths we early out with the default builder config
	if (!is_s3_path) {
		return builder;
	}

	// For S3 paths we fetch a potential secret
	auto &secret_manager = SecretManager::Get(context);
	auto transaction = CatalogTransaction::GetSystemCatalogTransaction(context);
	auto secret_match = secret_manager.LookupSecret(transaction, path, "s3");

	// No secret: nothing left to do here!
	if (!secret_match.HasMatch()) {
		return builder;
	}
	const auto &kv_secret = dynamic_cast<const KeyValueSecret &>(*secret_match.secret_entry->secret);

	// A key that is absent from the secret yields a NULL value; treat it as "not set" instead
	// of forwarding the textual representation of NULL to the kernel.
	auto get_secret_option = [&kv_secret](const string &key) -> string {
		auto value = kv_secret.TryGetValue(key);
		return value.IsNull() ? string() : value.ToString();
	};
	// Named locals keep the strings alive while the non-owning slices are in use
	auto set_option = [builder](const string &option, const string &value) {
		ffi::set_builder_option(builder, KernelUtils::ToDeltaString(option), KernelUtils::ToDeltaString(value));
	};

	auto key_id = get_secret_option("key_id");
	auto secret = get_secret_option("secret");
	auto session_token = get_secret_option("session_token");
	auto region = get_secret_option("region");

	// Without any credentials, requests must not be signed
	if (key_id.empty() && secret.empty()) {
		set_option("skip_signature", "true");
	}
	if (!key_id.empty()) {
		set_option("aws_access_key_id", key_id);
	}
	if (!secret.empty()) {
		set_option("aws_secret_access_key", secret);
	}
	if (!session_token.empty()) {
		set_option("aws_session_token", session_token);
	}
	// Only override the region when the secret actually provides one
	if (!region.empty()) {
		set_option("aws_region", region);
	}

	return builder;
}
|
||||
|
||||
DeltaSnapshot::DeltaSnapshot(ClientContext &context_p, const string &path)
|
||||
: MultiFileList({ToDeltaPath(path)}, FileGlobOptions::ALLOW_EMPTY), context(context_p) {
|
||||
}
|
||||
|
||||
string DeltaSnapshot::GetPath() {
|
||||
return GetPaths()[0];
|
||||
}
|
||||
|
||||
// Converts a kernel path to the form DuckDB uses: a leading "file://" is stripped, any other
// path is returned unchanged.
string DeltaSnapshot::ToDuckDBPath(const string &raw_path) {
	const string file_scheme = "file://";
	const bool has_file_scheme = StringUtil::StartsWith(raw_path, file_scheme);
	return has_file_scheme ? raw_path.substr(file_scheme.size()) : raw_path;
}
|
||||
|
||||
// Converts a user-supplied path into the form passed to the kernel:
//  - paths starting with "./" are resolved against the working directory and prefixed with
//    "file://"
//  - a trailing '/' is appended when missing
// An empty path is returned unchanged.
string DeltaSnapshot::ToDeltaPath(const string &raw_path) {
	string path;
	if (StringUtil::StartsWith(raw_path, "./")) {
		LocalFileSystem fs;
		path = fs.JoinPath(fs.GetWorkingDirectory(), raw_path.substr(2));
		path = "file://" + path;
	} else {
		path = raw_path;
	}

	// Paths always end in a slash (kernel likes it that way for now). Guard against the empty
	// path: indexing `path.size() - 1` on an empty string is out of bounds.
	if (!path.empty() && path.back() != '/') {
		path += '/';
	}

	return path;
}
|
||||
|
||||
// Appends the snapshot's schema to `names` / `return_types` (initializing the snapshot first if
// needed) and remembers the resulting names on the snapshot.
void DeltaSnapshot::Bind(vector<LogicalType> &return_types, vector<string> &names) {
	if (!initialized) {
		InitializeFiles();
	}

	auto schema_fields = SchemaVisitor::VisitSnapshotSchema(snapshot.get());
	for (const auto &name_and_type : *schema_fields) {
		names.push_back(name_and_type.first);
		return_types.push_back(name_and_type.second);
	}

	// Store the bound names for resolving the complex filter pushdown later
	this->names = names;
}
|
||||
|
||||
//! Returns the i-th resolved file, pulling more scan data from the kernel when needed.
//! Returns an empty string once the kernel has reported that there is no more scan data.
string DeltaSnapshot::GetFile(idx_t i) {
	if (!initialized) {
		InitializeFiles();
	}

	// Keep asking the kernel for scan data until file `i` has been resolved
	while (i >= resolved_files.size()) {
		if (files_exhausted) {
			return "";
		}

		auto next_result = ffi::kernel_scan_data_next(scan_data_iterator.get(), this, visit_data);
		const auto kernel_has_more = TryUnpackKernelResult(next_result);

		if (!kernel_has_more) {
			// kernel has indicated that we have no more data to scan
			files_exhausted = true;
			return "";
		}
	}

	// Defensive check: at this point the requested index has to be available
	if (i >= resolved_files.size()) {
		throw IOException("Delta Kernel seems to have failed to resolve a new file");
	}

	return resolved_files[i];
}
|
||||
|
||||
void DeltaSnapshot::InitializeFiles() {
|
||||
auto path_slice = KernelUtils::ToDeltaString(paths[0]);
|
||||
|
||||
// Register engine
|
||||
auto interface_builder = CreateBuilder(context, paths[0]);
|
||||
extern_engine = TryUnpackKernelResult(ffi::builder_build(interface_builder));
|
||||
|
||||
// Initialize Snapshot
|
||||
snapshot = TryUnpackKernelResult(ffi::snapshot(path_slice, extern_engine.get()));
|
||||
|
||||
// Create Scan
|
||||
PredicateVisitor visitor(names, &table_filters);
|
||||
scan = TryUnpackKernelResult(ffi::scan(snapshot.get(), extern_engine.get(), &visitor));
|
||||
|
||||
// Create GlobalState
|
||||
global_state = ffi::get_global_scan_state(scan.get());
|
||||
|
||||
// Set version
|
||||
this->version = ffi::version(snapshot.get());
|
||||
|
||||
// Create scan data iterator
|
||||
scan_data_iterator = TryUnpackKernelResult(ffi::kernel_scan_data_init(extern_engine.get(), scan.get()));
|
||||
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
//! Creates a new DeltaSnapshot for the same path that carries the table filters derived from `filters`.
//! The filters are copied into a FilterCombiner; the input vector itself is not modified.
unique_ptr<MultiFileList> DeltaSnapshot::ComplexFilterPushdown(ClientContext &context, const MultiFileOptions &options,
                                                               LogicalGet &get,
                                                               vector<unique_ptr<Expression>> &filters) {
	FilterCombiner filter_combiner(context);
	for (idx_t filter_idx = 0; filter_idx < filters.size(); filter_idx++) {
		filter_combiner.AddFilter(filters[filter_idx]->Copy());
	}
	auto pushed_filters = filter_combiner.GenerateTableScanFilters(get.column_ids);

	// TODO: can/should we figure out if this filtered anything?
	auto pushed_down_snapshot = make_uniq<DeltaSnapshot>(context, paths[0]);
	pushed_down_snapshot->names = names;
	pushed_down_snapshot->table_filters = std::move(pushed_filters);

	return std::move(pushed_down_snapshot);
}
|
||||
|
||||
//! Resolves every remaining file through GetFile and returns a copy of the complete list.
vector<string> DeltaSnapshot::GetAllFiles() {
	// TODO: this can probably be improved
	idx_t probe_idx = resolved_files.size();
	while (true) {
		if (GetFile(probe_idx).empty()) {
			break;
		}
		++probe_idx;
	}
	return resolved_files;
}
|
||||
|
||||
//! Reports whether the snapshot expands to no file, exactly one file or multiple files.
FileExpandResult DeltaSnapshot::GetExpandResult() {
	// Requesting index 1 makes sure the first two files are resolved when they exist
	GetFile(1);

	const auto resolved_count = resolved_files.size();
	if (resolved_count == 0) {
		return FileExpandResult::NO_FILES;
	}
	return resolved_count == 1 ? FileExpandResult::SINGLE_FILE : FileExpandResult::MULTIPLE_FILES;
}
|
||||
|
||||
//! Resolves every remaining file through GetFile and returns the total number of resolved files.
idx_t DeltaSnapshot::GetTotalFileCount() {
	// TODO: this can probably be improved
	idx_t probe_idx = resolved_files.size();
	while (true) {
		if (GetFile(probe_idx).empty()) {
			break;
		}
		++probe_idx;
	}
	return resolved_files.size();
}
|
||||
|
||||
//! Factory that returns a fresh DeltaMultiFileReader behind the MultiFileReader interface.
unique_ptr<MultiFileReader> DeltaMultiFileReader::CreateInstance() {
	return make_uniq<DeltaMultiFileReader>();
}
|
||||
|
||||
//! Binds the schema from the DeltaSnapshot instead of from the underlying files.
//!
//! `files` has to be a DeltaSnapshot (the dynamic_cast throws otherwise). The snapshot's columns are appended to
//! `names` / `return_types`. A `file_row_number` BIGINT column is appended only when the captured
//! `file_row_number` option is present AND true; previously the mere presence of the option enabled the column,
//! so `file_row_number=false` still emitted it.
//!
//! Always returns true.
bool DeltaMultiFileReader::Bind(MultiFileOptions &options, MultiFileList &files, vector<LogicalType> &return_types,
                                vector<string> &names, MultiFileReaderBindData &bind_data) {
	auto &delta_snapshot = dynamic_cast<DeltaSnapshot &>(files);

	delta_snapshot.Bind(return_types, names);

	// The option is captured by ParseOption; honor its value, not just its presence
	auto file_row_number_opt = options.custom_options.find("file_row_number");
	const bool file_row_number_enabled =
	    file_row_number_opt != options.custom_options.end() && file_row_number_opt->second.GetValue<bool>();

	// The index is recorded in both cases.
	// TODO: when the column is disabled this is a bogus ID? Change for flag indicating it should be enabled?
	bind_data.file_row_number_idx = names.size();
	if (file_row_number_enabled) {
		return_types.emplace_back(LogicalType::BIGINT);
		names.emplace_back("file_row_number");
	}

	return true;
}
|
||||
|
||||
//! Switches off union_by_name and (auto detected) hive partitioning, runs the base BindOptions and appends the
//! `delta_file_number` UBIGINT column when that custom option is set to true.
void DeltaMultiFileReader::BindOptions(MultiFileOptions &options, MultiFileList &files,
                                       vector<LogicalType> &return_types, vector<string> &names,
                                       MultiFileReaderBindData &bind_data) {
	// Disable all other multifilereader options
	options.union_by_name = false;
	options.hive_partitioning = false;
	options.auto_detect_hive_partitioning = false;

	MultiFileReader::BindOptions(options, files, return_types, names, bind_data);

	const auto file_number_opt = options.custom_options.find("delta_file_number");
	if (file_number_opt == options.custom_options.end() || !file_number_opt->second.GetValue<bool>()) {
		return;
	}

	names.push_back("delta_file_number");
	return_types.push_back(LogicalType::UBIGINT);
}
|
||||
|
||||
void DeltaMultiFileReader::FinalizeBind(const MultiFileOptions &file_options, const MultiFileReaderBindData &options,
|
||||
const string &filename, const vector<string> &local_names,
|
||||
const vector<LogicalType> &global_types, const vector<string> &global_names,
|
||||
const vector<column_t> &global_column_ids, MultiFileReaderData &reader_data,
|
||||
ClientContext &context, optional_ptr<MultiFileReaderGlobalState> global_state) {
|
||||
MultiFileReader::FinalizeBind(file_options, options, filename, local_names, global_types, global_names,
|
||||
global_column_ids, reader_data, context, global_state);
|
||||
|
||||
// Handle custom delta option set in MultiFileOptions::custom_options
|
||||
auto file_number_opt = file_options.custom_options.find("delta_file_number");
|
||||
if (file_number_opt != file_options.custom_options.end()) {
|
||||
if (file_number_opt->second.GetValue<bool>()) {
|
||||
D_ASSERT(global_state);
|
||||
auto &delta_global_state = global_state->Cast<DeltaMultiFileReaderGlobalState>();
|
||||
D_ASSERT(delta_global_state.delta_file_number_idx != DConstants::INVALID_INDEX);
|
||||
|
||||
// We add the constant column for the delta_file_number option
|
||||
// NOTE: we add a placeholder here, to demonstrate how we can also populate extra columns in the
|
||||
// FinalizeChunk
|
||||
reader_data.constant_map.emplace_back(delta_global_state.delta_file_number_idx, Value::UBIGINT(0));
|
||||
}
|
||||
}
|
||||
|
||||
// Get the metadata for this file
|
||||
D_ASSERT(global_state->file_list);
|
||||
const auto &snapshot = dynamic_cast<const DeltaSnapshot &>(*global_state->file_list);
|
||||
auto &file_metadata = snapshot.metadata[reader_data.file_list_idx.GetIndex()];
|
||||
|
||||
if (!file_metadata->partition_map.empty()) {
|
||||
for (idx_t i = 0; i < global_column_ids.size(); i++) {
|
||||
column_t col_id = global_column_ids[i];
|
||||
auto col_partition_entry = file_metadata->partition_map.find(global_names[col_id]);
|
||||
if (col_partition_entry != file_metadata->partition_map.end()) {
|
||||
// Todo: use https://github.com/delta-io/delta/blob/master/PROTOCOL.md#partition-value-serialization
|
||||
auto maybe_value = Value(col_partition_entry->second).DefaultCastAs(global_types[i]);
|
||||
reader_data.constant_map.emplace_back(i, maybe_value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//! Creates the DeltaSnapshot for the single path passed to delta_scan.
//! Throws a BinderException when the number of paths is not exactly one.
unique_ptr<MultiFileList> DeltaMultiFileReader::CreateFileList(ClientContext &context, const vector<string> &paths,
                                                               FileGlobOptions options) {
	if (paths.size() == 1) {
		return make_uniq<DeltaSnapshot>(context, paths[0]);
	}
	throw BinderException("'delta_scan' only supports single path as input");
}
|
||||
|
||||
// Generate the correct Selection Vector Based on the Raw delta KernelBoolSlice dv and the row_id_column
|
||||
// TODO: this probably is slower than needed (we can do with less branches in the for loop for most cases)
|
||||
static SelectionVector DuckSVFromDeltaSV(const ffi::KernelBoolSlice &dv, Vector row_id_column, idx_t count,
|
||||
idx_t &select_count) {
|
||||
D_ASSERT(row_id_column.GetType() == LogicalType::BIGINT);
|
||||
|
||||
UnifiedVectorFormat data;
|
||||
row_id_column.ToUnifiedFormat(count, data);
|
||||
auto row_ids = UnifiedVectorFormat::GetData<int64_t>(data);
|
||||
|
||||
SelectionVector result {count};
|
||||
idx_t current_select = 0;
|
||||
for (idx_t i = 0; i < count; i++) {
|
||||
auto row_id = row_ids[data.sel->get_index(i)];
|
||||
|
||||
// TODO: why are deletion vectors not spanning whole data?
|
||||
if (row_id >= dv.len || dv.ptr[row_id]) {
|
||||
result.data()[current_select] = i;
|
||||
current_select++;
|
||||
}
|
||||
}
|
||||
|
||||
select_count = current_select;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Parses the columns that are used by the delta extension into
|
||||
void DeltaMultiFileReaderGlobalState::SetColumnIdx(const string &column, idx_t idx) {
|
||||
if (column == "file_row_number") {
|
||||
file_row_number_idx = idx;
|
||||
return;
|
||||
} else if (column == "delta_file_number") {
|
||||
delta_file_number_idx = idx;
|
||||
return;
|
||||
}
|
||||
throw IOException("Unknown column '%s' found as required by the DeltaMultiFileReader");
|
||||
}
|
||||
|
||||
//! Creates the DeltaMultiFileReaderGlobalState for a scan.
//!
//! Every column required by the delta reader (always `file_row_number`, plus `delta_file_number` when that
//! option is true) is mapped to a position in the result chunk: the projection position when the column is
//! projected, otherwise a new position behind the projected columns, for which an extra column of the right
//! type is registered.
unique_ptr<MultiFileReaderGlobalState> DeltaMultiFileReader::InitializeGlobalState(
    duckdb::ClientContext &context, const duckdb::MultiFileOptions &file_options,
    const duckdb::MultiFileReaderBindData &bind_data, const duckdb::MultiFileList &file_list,
    const vector<duckdb::LogicalType> &global_types, const vector<std::string> &global_names,
    const vector<duckdb::column_t> &global_column_ids) {
	const idx_t projected_count = global_column_ids.size();

	// Name -> projection position for all projected columns (the row id column has no name entry)
	case_insensitive_map_t<idx_t> projection_by_name;
	for (idx_t proj_idx = 0; proj_idx < projected_count; proj_idx++) {
		const auto column_id = global_column_ids[proj_idx];
		if (!IsRowIdColumnId(column_id)) {
			projection_by_name.insert({global_names[column_id], proj_idx});
		}
	}

	// Columns the delta reader needs in the result chunk
	// TODO: only add file_row_number column if there are deletes
	case_insensitive_map_t<LogicalType> required_columns = {
	    {"file_row_number", LogicalType::BIGINT},
	};
	const auto file_number_opt = file_options.custom_options.find("delta_file_number");
	if (file_number_opt != file_options.custom_options.end() && file_number_opt->second.GetValue<bool>()) {
		required_columns.insert({"delta_file_number", LogicalType::UBIGINT});
	}

	// Resolve the chunk position of every required column
	vector<LogicalType> extra_columns;
	vector<pair<string, idx_t>> mapped_columns;
	for (const auto &required : required_columns) {
		idx_t chunk_idx;
		const auto projected = projection_by_name.find(required.first);
		if (projected != projection_by_name.end()) {
			// Already part of the projection: reuse its position
			chunk_idx = projected->second;
		} else {
			// Not projected: extra columns are placed after all projected columns, in the order they are added
			chunk_idx = projected_count + extra_columns.size();
			// Ensure the result DataChunk has a vector of the correct type to store this column
			extra_columns.push_back(required.second);
		}
		mapped_columns.push_back({required.first, chunk_idx});
	}

	auto res = make_uniq<DeltaMultiFileReaderGlobalState>(extra_columns, &file_list);

	// Store the resolved positions in the global state for easy access
	for (const auto &mapped_column : mapped_columns) {
		res->SetColumnIdx(mapped_column.first, mapped_column.second);
	}

	return std::move(res);
}
|
||||
|
||||
void DeltaMultiFileReader::CreateNameMapping(const string &file_name, const vector<LogicalType> &local_types,
|
||||
const vector<string> &local_names, const vector<LogicalType> &global_types,
|
||||
const vector<string> &global_names,
|
||||
const vector<column_t> &global_column_ids,
|
||||
MultiFileReaderData &reader_data, const string &initial_file,
|
||||
optional_ptr<MultiFileReaderGlobalState> global_state) {
|
||||
// First call the base implementation to do most mapping
|
||||
MultiFileReader::CreateNameMapping(file_name, local_types, local_names, global_types, global_names,
|
||||
global_column_ids, reader_data, initial_file, global_state);
|
||||
|
||||
// Then we handle delta specific mapping
|
||||
D_ASSERT(global_state);
|
||||
auto &delta_global_state = global_state->Cast<DeltaMultiFileReaderGlobalState>();
|
||||
|
||||
// Check if the file_row_number column is an "extra_column" which is not part of the projection
|
||||
if (delta_global_state.file_row_number_idx >= global_column_ids.size()) {
|
||||
D_ASSERT(delta_global_state.file_row_number_idx != DConstants::INVALID_INDEX);
|
||||
|
||||
// Build the name map
|
||||
case_insensitive_map_t<idx_t> name_map;
|
||||
for (idx_t col_idx = 0; col_idx < local_names.size(); col_idx++) {
|
||||
name_map[local_names[col_idx]] = col_idx;
|
||||
}
|
||||
|
||||
// Lookup the required column in the local map
|
||||
auto entry = name_map.find("file_row_number");
|
||||
if (entry == name_map.end()) {
|
||||
throw IOException("Failed to find the file_row_number column");
|
||||
}
|
||||
|
||||
// Register the column to be scanned from this file
|
||||
reader_data.column_ids.push_back(entry->second);
|
||||
reader_data.column_mapping.push_back(delta_global_state.file_row_number_idx);
|
||||
}
|
||||
|
||||
// This may have changed: update it
|
||||
reader_data.empty_columns = reader_data.column_ids.empty();
|
||||
}
|
||||
|
||||
//! Post-processes a chunk that was read from one file of the snapshot:
//! - runs the base FinalizeChunk;
//! - when the file's metadata carries a selection vector and the chunk is not empty, slices the chunk down to
//!   the selected rows, using the file_row_number column of the chunk;
//! - when the delta_file_number column is mapped, fills it by executing a demo expression.
void DeltaMultiFileReader::FinalizeChunk(ClientContext &context, const MultiFileReaderBindData &bind_data,
                                         const MultiFileReaderData &reader_data, DataChunk &chunk,
                                         optional_ptr<MultiFileReaderGlobalState> global_state) {
	// Base class finalization first
	MultiFileReader::FinalizeChunk(context, bind_data, reader_data, chunk, global_state);

	D_ASSERT(global_state);
	auto &delta_state = global_state->Cast<DeltaMultiFileReaderGlobalState>();
	D_ASSERT(delta_state.file_list);

	// Metadata of the file this chunk belongs to
	const auto &delta_snapshot = dynamic_cast<const DeltaSnapshot &>(*global_state->file_list);
	auto &file_metadata = delta_snapshot.metadata[reader_data.file_list_idx.GetIndex()];

	const bool apply_selection = file_metadata->selection_vector.ptr && chunk.size() != 0;
	if (apply_selection) {
		D_ASSERT(delta_state.file_row_number_idx != DConstants::INVALID_INDEX);
		auto &row_number_vector = chunk.data[delta_state.file_row_number_idx];

		// Translate the raw selection vector from delta into a DuckDB selection vector for this chunk
		idx_t selected_rows;
		auto selection =
		    DuckSVFromDeltaSV(file_metadata->selection_vector, row_number_vector, chunk.size(), selected_rows);
		chunk.Slice(selection, selected_rows);
	}

	// Note: this demo function shows how we can use DuckDB's Binder create expression-based generated columns
	if (delta_state.delta_file_number_idx != DConstants::INVALID_INDEX) {
		//! Create the dummy expression (0 + 7)
		vector<unique_ptr<ParsedExpression>> operands;
		operands.push_back(make_uniq<ConstantExpression>(Value::UBIGINT(0)));
		operands.push_back(make_uniq<ConstantExpression>(Value::UBIGINT(7)));
		unique_ptr<ParsedExpression> dummy_expr =
		    make_uniq<FunctionExpression>("+", std::move(operands), nullptr, nullptr, false, true);

		//! Bind the dummy expression
		auto binder = Binder::CreateBinder(context);
		ExpressionBinder expr_binder(*binder, context);
		auto bound_dummy_expr = expr_binder.Bind(dummy_expr, nullptr);

		//! Execute the expression directly into the output column of the chunk
		ExpressionExecutor executor(context);
		executor.AddExpression(*bound_dummy_expr);
		executor.ExecuteExpression(chunk.data[delta_state.delta_file_number_idx]);
	}
}
|
||||
|
||||
bool DeltaMultiFileReader::ParseOption(const string &key, const Value &val, MultiFileOptions &options,
|
||||
ClientContext &context) {
|
||||
auto loption = StringUtil::Lower(key);
|
||||
|
||||
if (loption == "delta_file_number") {
|
||||
options.custom_options[loption] = val;
|
||||
return true;
|
||||
}
|
||||
|
||||
// We need to capture this one to know whether to emit
|
||||
if (loption == "file_row_number") {
|
||||
options.custom_options[loption] = val;
|
||||
return true;
|
||||
}
|
||||
|
||||
return MultiFileReader::ParseOption(key, val, options, context);
|
||||
}
|
||||
//
|
||||
// DeltaMultiFileReaderBindData::DeltaMultiFileReaderBindData(DeltaSnapshot & delta_snapshot):
|
||||
// current_snapshot(delta_snapshot){
|
||||
//
|
||||
//}
|
||||
|
||||
//! Builds the `delta_scan` table function set: a copy of the `parquet_scan` function set from the catalog in
//! which every overload uses the DeltaMultiFileReader to drive the reads.
TableFunctionSet DeltaFunctions::GetDeltaScanFunction(DatabaseInstance &instance) {
	auto &parquet_scan_entry = ExtensionUtil::GetTableFunction(instance, "parquet_scan");
	auto delta_scan_set = parquet_scan_entry.functions;
	delta_scan_set.name = "delta_scan";

	for (auto &overload : delta_scan_set.functions) {
		overload.name = "delta_scan";

		// Register the MultiFileReader as the driver for reads
		overload.get_multi_file_reader = DeltaMultiFileReader::CreateInstance;

		// Unset all of these: they are either broken, very inefficient.
		// TODO: implement/fix these
		overload.serialize = nullptr;
		overload.deserialize = nullptr;
		overload.statistics = nullptr;
		overload.table_scan_progress = nullptr;
		overload.cardinality = nullptr;
		overload.get_bind_info = nullptr;

		// Schema param is just confusing here
		overload.named_parameters.erase("schema");

		// Demonstration of a generated column based on information from DeltaSnapshot
		overload.named_parameters["delta_file_number"] = LogicalType::BOOLEAN;
	}

	return delta_scan_set;
}
|
||||
|
||||
} // namespace duckdb
|
||||
13
external/duckdb/extension/delta/src/include/delta_extension.hpp
vendored
Normal file
13
external/duckdb/extension/delta/src/include/delta_extension.hpp
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
#include "duckdb.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
class DeltaExtension : public Extension {
|
||||
public:
|
||||
void Load(ExtensionLoader &loader) override;
|
||||
std::string Name() override;
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
22
external/duckdb/extension/delta/src/include/delta_functions.hpp
vendored
Normal file
22
external/duckdb/extension/delta/src/include/delta_functions.hpp
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// delta_functions.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "duckdb/parser/parsed_data/create_table_function_info.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
class DeltaFunctions {
|
||||
public:
|
||||
static vector<TableFunctionSet> GetTableFunctions(DatabaseInstance &instance);
|
||||
|
||||
private:
|
||||
static TableFunctionSet GetDeltaScanFunction(DatabaseInstance &instance);
|
||||
};
|
||||
} // namespace duckdb
|
||||
155
external/duckdb/extension/delta/src/include/delta_utils.hpp
vendored
Normal file
155
external/duckdb/extension/delta/src/include/delta_utils.hpp
vendored
Normal file
@@ -0,0 +1,155 @@
|
||||
#pragma once
|
||||
|
||||
#include "delta_kernel_ffi.hpp"
|
||||
#include "duckdb/planner/filter/constant_filter.hpp"
|
||||
#include "duckdb/planner/filter/conjunction_filter.hpp"
|
||||
#include "duckdb/common/enum_util.hpp"
|
||||
#include <iostream>
|
||||
|
||||
// TODO: clean up this file as we go
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
// SchemaVisitor is used to parse the schema of a Delta table from the Kernel
|
||||
class SchemaVisitor {
|
||||
public:
|
||||
using FieldList = child_list_t<LogicalType>;
|
||||
|
||||
static unique_ptr<FieldList> VisitSnapshotSchema(ffi::SharedSnapshot *snapshot);
|
||||
|
||||
private:
|
||||
unordered_map<uintptr_t, unique_ptr<FieldList>> inflight_lists;
|
||||
uintptr_t next_id = 1;
|
||||
|
||||
typedef void(SimpleTypeVisitorFunction)(void *, uintptr_t, ffi::KernelStringSlice);
|
||||
|
||||
template <LogicalTypeId TypeId>
|
||||
static SimpleTypeVisitorFunction *VisitSimpleType() {
|
||||
return (SimpleTypeVisitorFunction *)&VisitSimpleTypeImpl<TypeId>;
|
||||
}
|
||||
template <LogicalTypeId TypeId>
|
||||
static void VisitSimpleTypeImpl(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name) {
|
||||
state->AppendToList(sibling_list_id, name, TypeId);
|
||||
}
|
||||
|
||||
static void VisitDecimal(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name,
|
||||
uint8_t precision, uint8_t scale);
|
||||
static uintptr_t MakeFieldList(SchemaVisitor *state, uintptr_t capacity_hint);
|
||||
static void VisitStruct(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name,
|
||||
uintptr_t child_list_id);
|
||||
static void VisitArray(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name,
|
||||
bool contains_null, uintptr_t child_list_id);
|
||||
static void VisitMap(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name,
|
||||
bool contains_null, uintptr_t child_list_id);
|
||||
|
||||
uintptr_t MakeFieldListImpl(uintptr_t capacity_hint);
|
||||
void AppendToList(uintptr_t id, ffi::KernelStringSlice name, LogicalType &&child);
|
||||
unique_ptr<FieldList> TakeFieldList(uintptr_t id);
|
||||
};
|
||||
|
||||
// Allocator for errors that the kernel might throw
|
||||
struct DuckDBEngineError : ffi::EngineError {
|
||||
// Allocate a DuckDBEngineError, function ptr passed to kernel for error allocation
|
||||
static ffi::EngineError *AllocateError(ffi::KernelError etype, ffi::KernelStringSlice msg);
|
||||
// Convert a kernel error enum to a string
|
||||
static string KernelErrorEnumToString(ffi::KernelError err);
|
||||
|
||||
// Throw the error as an IOException
|
||||
[[noreturn]] void Throw(string from_info);
|
||||
|
||||
// The error message from Kernel
|
||||
string error_message;
|
||||
};
|
||||
|
||||
// RAII wrapper that returns ownership of a kernel pointer to kernel when it goes out of
// scope. Similar to std::unique_ptr. but does not define operator->() and does not require the
// kernel type to be complete.
//
// The wrapper stores the pointer together with the function that releases it. It is movable but not
// copyable. Moving transfers both the pointer and its release function: the move constructor used to copy
// only the pointer, which left the new owner without a release function so the pointer was never released.
template <typename KernelType>
struct UniqueKernelPointer {
	// Creates an empty wrapper that owns nothing.
	UniqueKernelPointer() : ptr(nullptr), free(nullptr) {
	}

	// Takes ownership of a pointer with associated deleter.
	UniqueKernelPointer(KernelType *ptr, void (*free)(KernelType *)) : ptr(ptr), free(free) {
	}

	// Move construction: takes over pointer and deleter, `other` is left empty.
	UniqueKernelPointer(UniqueKernelPointer &&other) noexcept : ptr(other.ptr), free(other.free) {
		other.ptr = nullptr;
		other.free = nullptr;
	}
	// Move assignment: swaps the contents, so the previously owned pointer is released when `other` is
	// destroyed.
	UniqueKernelPointer &operator=(UniqueKernelPointer &&other) noexcept {
		std::swap(ptr, other.ptr);
		std::swap(free, other.free);
		return *this;
	}
	UniqueKernelPointer(const UniqueKernelPointer &) = delete;
	UniqueKernelPointer &operator=(const UniqueKernelPointer &) = delete;

	// Releases the owned pointer through its deleter, if there is one.
	~UniqueKernelPointer() {
		if (ptr && free) {
			free(ptr);
		}
	}

	// Returns the owned pointer (nullptr when empty); ownership is not transferred.
	KernelType *get() const {
		return ptr;
	}

private:
	KernelType *ptr;
	void (*free)(KernelType *) = nullptr;
};
|
||||
|
||||
// Syntactic sugar around the different kernel types
|
||||
template <typename KernelType, void (*DeleteFunction)(KernelType *)>
|
||||
struct TemplatedUniqueKernelPointer : public UniqueKernelPointer<KernelType> {
|
||||
TemplatedUniqueKernelPointer() : UniqueKernelPointer<KernelType>() {};
|
||||
TemplatedUniqueKernelPointer(KernelType *ptr) : UniqueKernelPointer<KernelType>(ptr, DeleteFunction) {};
|
||||
};
|
||||
|
||||
// Owning wrappers for the kernel handle types, each paired with the ffi function that releases it
using KernelSnapshot = TemplatedUniqueKernelPointer<ffi::SharedSnapshot, ffi::drop_snapshot>;
using KernelExternEngine = TemplatedUniqueKernelPointer<ffi::SharedExternEngine, ffi::drop_engine>;
using KernelScan = TemplatedUniqueKernelPointer<ffi::SharedScan, ffi::drop_scan>;
using KernelGlobalScanState = TemplatedUniqueKernelPointer<ffi::SharedGlobalScanState, ffi::drop_global_scan_state>;
using KernelScanDataIterator = TemplatedUniqueKernelPointer<ffi::SharedScanDataIterator, ffi::kernel_scan_data_free>;
|
||||
|
||||
struct KernelUtils {
|
||||
static ffi::KernelStringSlice ToDeltaString(const string &str);
|
||||
static string FromDeltaString(const struct ffi::KernelStringSlice slice);
|
||||
static vector<bool> FromDeltaBoolSlice(const struct ffi::KernelBoolSlice slice);
|
||||
|
||||
// TODO: all kernel results need to be unpacked, not doing so will result in an error. This should be cleaned up
|
||||
template <class T>
|
||||
static T UnpackResult(ffi::ExternResult<T> result, const string &from_where) {
|
||||
if (result.tag == ffi::ExternResult<T>::Tag::Err) {
|
||||
if (result.err._0) {
|
||||
auto error_cast = static_cast<DuckDBEngineError *>(result.err._0);
|
||||
error_cast->Throw(from_where);
|
||||
} else {
|
||||
throw IOException("Hit DeltaKernel FFI error (from: %s): Hit error, but error was nullptr",
|
||||
from_where.c_str());
|
||||
}
|
||||
} else if (result.tag == ffi::ExternResult<T>::Tag::Ok) {
|
||||
return result.ok._0;
|
||||
}
|
||||
throw IOException("Invalid error ExternResult tag found!");
|
||||
}
|
||||
};
|
||||
|
||||
class PredicateVisitor : public ffi::EnginePredicate {
|
||||
public:
|
||||
PredicateVisitor(const vector<string> &column_names, optional_ptr<TableFilterSet> filters);
|
||||
|
||||
private:
|
||||
unordered_map<string, TableFilter *> column_filters;
|
||||
|
||||
static uintptr_t VisitPredicate(PredicateVisitor *predicate, ffi::KernelExpressionVisitorState *state);
|
||||
|
||||
uintptr_t VisitConstantFilter(const string &col_name, const ConstantFilter &filter,
|
||||
ffi::KernelExpressionVisitorState *state);
|
||||
uintptr_t VisitAndFilter(const string &col_name, const ConjunctionAndFilter &filter,
|
||||
ffi::KernelExpressionVisitorState *state);
|
||||
uintptr_t VisitFilter(const string &col_name, const TableFilter &filter, ffi::KernelExpressionVisitorState *state);
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
145
external/duckdb/extension/delta/src/include/functions/delta_scan.hpp
vendored
Normal file
145
external/duckdb/extension/delta/src/include/functions/delta_scan.hpp
vendored
Normal file
@@ -0,0 +1,145 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// DuckDB
|
||||
//
|
||||
// functions/delta_scan.hpp
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "delta_utils.hpp"
|
||||
#include "duckdb/common/multi_file/multi_file_reader.hpp"
|
||||
|
||||
namespace duckdb {
|
||||
|
||||
struct DeltaFileMetaData {
|
||||
DeltaFileMetaData() {};
|
||||
|
||||
// No copying pls
|
||||
DeltaFileMetaData(const DeltaFileMetaData &) = delete;
|
||||
DeltaFileMetaData &operator=(const DeltaFileMetaData &) = delete;
|
||||
|
||||
~DeltaFileMetaData() {
|
||||
if (selection_vector.ptr) {
|
||||
ffi::drop_bool_slice(selection_vector);
|
||||
}
|
||||
}
|
||||
|
||||
idx_t delta_snapshot_version = DConstants::INVALID_INDEX;
|
||||
idx_t file_number = DConstants::INVALID_INDEX;
|
||||
ffi::KernelBoolSlice selection_vector = {nullptr, 0};
|
||||
case_insensitive_map_t<string> partition_map;
|
||||
};
|
||||
|
||||
//! The DeltaSnapshot implements the MultiFileList API to allow injecting it into the regular DuckDB parquet scan
|
||||
struct DeltaSnapshot : public MultiFileList {
|
||||
DeltaSnapshot(ClientContext &context, const string &path);
|
||||
string GetPath();
|
||||
static string ToDuckDBPath(const string &raw_path);
|
||||
static string ToDeltaPath(const string &raw_path);
|
||||
|
||||
//! MultiFileList API
|
||||
public:
|
||||
void Bind(vector<LogicalType> &return_types, vector<string> &names);
|
||||
unique_ptr<MultiFileList> ComplexFilterPushdown(ClientContext &context, const MultiFileOptions &options,
|
||||
LogicalGet &get, vector<unique_ptr<Expression>> &filters) override;
|
||||
vector<string> GetAllFiles() override;
|
||||
FileExpandResult GetExpandResult() override;
|
||||
idx_t GetTotalFileCount() override;
|
||||
|
||||
protected:
|
||||
//! Get the i-th expanded file
|
||||
string GetFile(idx_t i) override;
|
||||
|
||||
protected:
|
||||
// TODO: How to guarantee we only call this after the filter pushdown?
|
||||
void InitializeFiles();
|
||||
|
||||
template <class T>
|
||||
T TryUnpackKernelResult(ffi::ExternResult<T> result) {
|
||||
return KernelUtils::UnpackResult<T>(
|
||||
result, StringUtil::Format("While trying to read from delta table: '%s'", paths[0]));
|
||||
}
|
||||
|
||||
// TODO: change back to protected
|
||||
public:
|
||||
idx_t version;
|
||||
|
||||
//! Delta Kernel Structures
|
||||
KernelSnapshot snapshot;
|
||||
KernelExternEngine extern_engine;
|
||||
KernelScan scan;
|
||||
KernelGlobalScanState global_state;
|
||||
KernelScanDataIterator scan_data_iterator;
|
||||
|
||||
//! Names
|
||||
vector<string> names;
|
||||
|
||||
//! Metadata map for files
|
||||
vector<unique_ptr<DeltaFileMetaData>> metadata;
|
||||
|
||||
//! Current file list resolution state
|
||||
bool initialized = false;
|
||||
bool files_exhausted = false;
|
||||
vector<string> resolved_files;
|
||||
TableFilterSet table_filters;
|
||||
|
||||
ClientContext &context;
|
||||
};
|
||||
|
||||
struct DeltaMultiFileReaderGlobalState : public MultiFileReaderGlobalState {
|
||||
DeltaMultiFileReaderGlobalState(vector<LogicalType> extra_columns_p, optional_ptr<const MultiFileList> file_list_p)
|
||||
: MultiFileReaderGlobalState(extra_columns_p, file_list_p) {
|
||||
}
|
||||
//! The idx of the file number column in the result chunk
|
||||
idx_t delta_file_number_idx = DConstants::INVALID_INDEX;
|
||||
//! The idx of the file_row_number column in the result chunk
|
||||
idx_t file_row_number_idx = DConstants::INVALID_INDEX;
|
||||
|
||||
void SetColumnIdx(const string &column, idx_t idx);
|
||||
};
|
||||
|
||||
struct DeltaMultiFileReader : public MultiFileReader {
|
||||
static unique_ptr<MultiFileReader> CreateInstance();
|
||||
//! Return a DeltaSnapshot
|
||||
unique_ptr<MultiFileList> CreateFileList(ClientContext &context, const vector<string> &paths,
|
||||
FileGlobOptions options) override;
|
||||
|
||||
//! Override the regular parquet bind using the MultiFileReader Bind. The bind from these are what DuckDB's file
|
||||
//! readers will try read
|
||||
bool Bind(MultiFileOptions &options, MultiFileList &files, vector<LogicalType> &return_types, vector<string> &names,
|
||||
MultiFileReaderBindData &bind_data) override;
|
||||
|
||||
//! Override the Options bind
|
||||
void BindOptions(MultiFileOptions &options, MultiFileList &files, vector<LogicalType> &return_types,
|
||||
vector<string> &names, MultiFileReaderBindData &bind_data) override;
|
||||
|
||||
void CreateNameMapping(const string &file_name, const vector<LogicalType> &local_types,
|
||||
const vector<string> &local_names, const vector<LogicalType> &global_types,
|
||||
const vector<string> &global_names, const vector<column_t> &global_column_ids,
|
||||
MultiFileReaderData &reader_data, const string &initial_file,
|
||||
optional_ptr<MultiFileReaderGlobalState> global_state) override;
|
||||
|
||||
unique_ptr<MultiFileReaderGlobalState>
|
||||
InitializeGlobalState(ClientContext &context, const MultiFileOptions &file_options,
|
||||
const MultiFileReaderBindData &bind_data, const MultiFileList &file_list,
|
||||
const vector<LogicalType> &global_types, const vector<string> &global_names,
|
||||
const vector<column_t> &global_column_ids) override;
|
||||
|
||||
void FinalizeBind(const MultiFileOptions &file_options, const MultiFileReaderBindData &options,
|
||||
const string &filename, const vector<string> &local_names,
|
||||
const vector<LogicalType> &global_types, const vector<string> &global_names,
|
||||
const vector<column_t> &global_column_ids, MultiFileReaderData &reader_data,
|
||||
ClientContext &context, optional_ptr<MultiFileReaderGlobalState> global_state) override;
|
||||
|
||||
//! Override the FinalizeChunk method
|
||||
void FinalizeChunk(ClientContext &context, const MultiFileReaderBindData &bind_data,
|
||||
const MultiFileReaderData &reader_data, DataChunk &chunk,
|
||||
optional_ptr<MultiFileReaderGlobalState> global_state) override;
|
||||
|
||||
//! Override the ParseOption call to parse delta_scan specific options
|
||||
bool ParseOption(const string &key, const Value &val, MultiFileOptions &options, ClientContext &context) override;
|
||||
};
|
||||
|
||||
} // namespace duckdb
|
||||
5
external/duckdb/extension/delta/vcpkg.json
vendored
Normal file
5
external/duckdb/extension/delta/vcpkg.json
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"dependencies": [
|
||||
"openssl"
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user