should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion

external/duckdb/scripts/amalgamation.py vendored Normal file

@@ -0,0 +1,608 @@
# this script creates a single header + source file combination out of the DuckDB sources
import os
import re
import sys
import shutil
import subprocess
from python_helpers import open_utf8, normalize_path
amal_dir = os.path.join('src', 'amalgamation')
header_file = os.path.join(amal_dir, "duckdb.hpp")
source_file = os.path.join(amal_dir, "duckdb.cpp")
temp_header = 'duckdb.hpp.tmp'
temp_source = 'duckdb.cpp.tmp'
skip_duckdb_includes = False
src_dir = 'src'
include_dir = os.path.join('src', 'include')
# files included in the amalgamated "duckdb.hpp" file
main_header_files = [
os.path.join(include_dir, 'duckdb.hpp'),
os.path.join(include_dir, 'duckdb.h'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'date.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'adbc', 'adbc.h'),
os.path.join(include_dir, 'duckdb', 'common', 'adbc', 'adbc.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'arrow', 'arrow.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'arrow', 'arrow_converter.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'arrow', 'arrow_wrapper.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'blob.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'decimal.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'hugeint.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'uhugeint.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'uuid.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'interval.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'timestamp.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'time.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'buffered_file_writer.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'memory_stream.hpp'),
os.path.join(include_dir, 'duckdb', 'main', 'appender.hpp'),
os.path.join(include_dir, 'duckdb', 'main', 'client_context.hpp'),
os.path.join(include_dir, 'duckdb', 'main', 'extension', 'extension_loader.hpp'),
os.path.join(include_dir, 'duckdb', 'function', 'function.hpp'),
os.path.join(include_dir, 'duckdb', 'function', 'table_function.hpp'),
os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_table_function_info.hpp'),
os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_copy_function_info.hpp'),
]
extended_amalgamation = False
if '--extended' in sys.argv:
def add_include_dir(dirpath):
return [os.path.join(dirpath, x) for x in os.listdir(dirpath)]
extended_amalgamation = True
main_header_files += [
os.path.join(include_dir, x)
for x in [
'duckdb/planner/expression/bound_constant_expression.hpp',
'duckdb/planner/expression/bound_function_expression.hpp',
'duckdb/catalog/catalog_entry/scalar_function_catalog_entry.hpp',
'duckdb/parser/parsed_data/create_table_info.hpp',
'duckdb/planner/parsed_data/bound_create_table_info.hpp',
'duckdb/parser/constraints/not_null_constraint.hpp',
'duckdb/storage/data_table.hpp',
'duckdb/function/pragma_function.hpp',
'duckdb/parser/qualified_name.hpp',
'duckdb/parser/parser.hpp',
'duckdb/planner/binder.hpp',
'duckdb/storage/object_cache.hpp',
'duckdb/planner/table_filter.hpp',
"duckdb/storage/statistics/base_statistics.hpp",
"duckdb/planner/filter/conjunction_filter.hpp",
"duckdb/planner/filter/constant_filter.hpp",
"duckdb/common/types/vector_cache.hpp",
"duckdb/common/string_map_set.hpp",
"duckdb/planner/filter/null_filter.hpp",
"duckdb/common/arrow/arrow_wrapper.hpp",
"duckdb/common/hive_partitioning.hpp",
"duckdb/common/multi_file/union_by_name.hpp",
"duckdb/planner/operator/logical_get.hpp",
"duckdb/common/compressed_file_system.hpp",
]
]
main_header_files += add_include_dir(os.path.join(include_dir, 'duckdb/parser/expression'))
main_header_files += add_include_dir(os.path.join(include_dir, 'duckdb/parser/parsed_data'))
main_header_files += add_include_dir(os.path.join(include_dir, 'duckdb/parser/tableref'))
main_header_files = normalize_path(main_header_files)
import package_build
# include paths for where to search for include files during amalgamation
include_paths = [include_dir] + package_build.third_party_includes()
# paths of where to look for files to compile and include to the final amalgamation
compile_directories = [src_dir] + package_build.third_party_sources()
# files always excluded
always_excluded = normalize_path(
[
'src/amalgamation/duckdb.cpp',
'src/amalgamation/duckdb.hpp',
'src/amalgamation/parquet-amalgamation.cpp',
'src/amalgamation/parquet-amalgamation.hpp',
]
)
# files excluded from the amalgamation
excluded_files = ['grammar.cpp', 'grammar.hpp', 'symbols.cpp']
# files excluded from individual file compilation during test_compile
excluded_compilation_files = excluded_files + ['gram.hpp', 'kwlist.hpp', "duckdb-c.cpp"]
linenumbers = False
def get_includes(fpath, text):
# find all the includes referred to in the directory
regex_include_statements = re.findall("(^[\t ]*[#][\t ]*include[\t ]+[\"]([^\"]+)[\"])", text, flags=re.MULTILINE)
include_statements = []
include_files = []
# figure out where they are located
for x in regex_include_statements:
included_file = x[1]
if skip_duckdb_includes and 'duckdb' in included_file:
continue
if (
'extension_helper.cpp' in fpath
and (included_file.endswith('_extension.hpp'))
or included_file == 'generated_extension_loader.hpp'
or included_file == 'generated_extension_headers.hpp'
):
continue
if 'allocator.cpp' in fpath and included_file.endswith('jemalloc_extension.hpp'):
continue
if x[0] in include_statements:
raise Exception(f"duplicate include {x[0]} in file {fpath}")
include_statements.append(x[0])
included_file = os.sep.join(included_file.split('/'))
found = False
for include_path in include_paths:
ipath = os.path.join(include_path, included_file)
if os.path.isfile(ipath):
include_files.append(ipath)
found = True
break
if not found:
raise Exception('Could not find include file "' + included_file + '", included from file "' + fpath + '"')
return (include_statements, include_files)
def cleanup_file(text):
# remove all "#pragma once" notifications
text = re.sub('#pragma once', '', text)
return text
# recursively get all includes and write them
written_files = {}
# licenses
licenses = []
def need_to_write_file(current_file, ignore_excluded=False):
if amal_dir in current_file:
return False
if current_file in always_excluded:
return False
if current_file.split(os.sep)[-1] in excluded_files and not ignore_excluded:
# file is in ignored files set
return False
if current_file in written_files:
# file is already written
return False
return True
def find_license(original_file):
global licenses
file = original_file
license = ""
while True:
(file, end) = os.path.split(file)
if file == "":
break
potential_license = os.path.join(file, "LICENSE")
if os.path.exists(potential_license):
license = potential_license
if license == "":
raise "Could not find license for %s" % original_file
if license not in licenses:
licenses += [license]
return licenses.index(license)
def write_file(current_file, ignore_excluded=False):
global linenumbers
global written_files
if not need_to_write_file(current_file, ignore_excluded):
return ""
written_files[current_file] = True
# first read this file
with open_utf8(current_file, 'r') as f:
text = f.read()
if current_file.startswith("third_party") and not current_file.endswith("LICENSE"):
lic_idx = find_license(current_file)
text = (
"\n\n// LICENSE_CHANGE_BEGIN\n// The following code up to LICENSE_CHANGE_END is subject to THIRD PARTY LICENSE #%s\n// See the end of this file for a list\n\n"
% str(lic_idx + 1)
+ text
+ "\n\n// LICENSE_CHANGE_END\n"
)
(statements, includes) = get_includes(current_file, text)
# find the linenr of the final #include statement we parsed
if len(statements) > 0:
index = text.find(statements[-1])
linenr = len(text[:index].split('\n'))
# now write all the dependencies of this header first
for i in range(len(includes)):
include_text = write_file(includes[i])
if linenumbers and i == len(includes) - 1:
# for the last include statement, we also include a #line directive
include_text += '\n#line %d "%s"\n' % (linenr, current_file)
text = text.replace(statements[i], include_text)
# add the initial line here
if linenumbers:
text = '\n#line 1 "%s"\n' % (current_file,) + text
# print(current_file)
# now read the header and write it
return cleanup_file(text)
def write_dir(dir):
files = os.listdir(dir)
files.sort()
text = ""
for fname in files:
if fname in excluded_files:
continue
# print(fname)
fpath = os.path.join(dir, fname)
if os.path.isdir(fpath):
text += write_dir(fpath)
elif fname.endswith('.cpp') or fname.endswith('.c') or fname.endswith('.cc'):
text += write_file(fpath)
return text
def copy_if_different(src, dest):
if os.path.isfile(dest):
# dest exists, check if the files are different
with open_utf8(src, 'r') as f:
source_text = f.read()
with open_utf8(dest, 'r') as f:
dest_text = f.read()
if source_text == dest_text:
# print("Skipping copy of " + src + ", identical copy already exists at " + dest)
return
# print("Copying " + src + " to " + dest)
shutil.copyfile(src, dest)
def git_commit_hash():
git_describe = package_build.get_git_describe()
hash = git_describe.split('-')[2].lstrip('g')
return hash
######
# MAIN_BRANCH_VERSIONING default should be 'True' for main branch and feature branches
# MAIN_BRANCH_VERSIONING default should be 'False' for release branches
# MAIN_BRANCH_VERSIONING default value needs to keep in sync between:
# - CMakeLists.txt
# - scripts/amalgamation.py
# - scripts/package_build.py
######
MAIN_BRANCH_VERSIONING = True
if os.getenv('MAIN_BRANCH_VERSIONING') == "0":
MAIN_BRANCH_VERSIONING = False
if os.getenv('MAIN_BRANCH_VERSIONING') == "1":
MAIN_BRANCH_VERSIONING = True
def git_dev_version():
try:
long_version = package_build.get_git_describe()
version_splits = long_version.split('-')[0].lstrip('v').split('.')
dev_version = long_version.split('-')[1]
if int(dev_version) == 0:
# directly on a tag: emit the regular version
return "v" + '.'.join(version_splits)
else:
# not on a tag: increment the version by one and add a -devX suffix
# this needs to keep in sync with changes to CMakeLists.txt
if MAIN_BRANCH_VERSIONING == True:
# increment minor version
version_splits[1] = str(int(version_splits[1]) + 1)
else:
# increment patch version
version_splits[2] = str(int(version_splits[2]) + 1)
return "v" + '.'.join(version_splits) + "-dev" + dev_version
except:
return "v0.0.0"
def generate_duckdb_hpp(header_file):
print("-----------------------")
print("-- Writing " + header_file + " --")
print("-----------------------")
with open_utf8(temp_header, 'w+') as hfile:
hfile.write("/*\n")
hfile.write(write_file("LICENSE"))
hfile.write("*/\n\n")
hfile.write("#pragma once\n")
hfile.write("#define DUCKDB_AMALGAMATION 1\n")
if extended_amalgamation:
hfile.write("#define DUCKDB_AMALGAMATION_EXTENDED 1\n")
hfile.write("#define DUCKDB_SOURCE_ID \"%s\"\n" % git_commit_hash())
dev_version = git_dev_version()
dev_v_parts = dev_version.lstrip('v').split('.')
hfile.write("#define DUCKDB_VERSION \"%s\"\n" % dev_version)
hfile.write("#define DUCKDB_MAJOR_VERSION %d\n" % int(dev_v_parts[0]))
hfile.write("#define DUCKDB_MINOR_VERSION %d\n" % int(dev_v_parts[1]))
hfile.write("#define DUCKDB_PATCH_VERSION \"%s\"\n" % dev_v_parts[2])
for fpath in main_header_files:
hfile.write(write_file(fpath))
def generate_amalgamation(source_file, header_file):
# construct duckdb.hpp from these headers
generate_duckdb_hpp(header_file)
# now construct duckdb.cpp
print("------------------------")
print("-- Writing " + source_file + " --")
print("------------------------")
# scan all the .cpp files
with open_utf8(temp_source, 'w+') as sfile:
header_file_name = header_file.split(os.sep)[-1]
sfile.write('#include "' + header_file_name + '"\n\n')
sfile.write("#ifndef DUCKDB_AMALGAMATION\n#error header mismatch\n#endif\n\n")
sfile.write("#if (!defined(DEBUG) && !defined NDEBUG)\n#define NDEBUG\n#endif\n\n")
for compile_dir in compile_directories:
sfile.write(write_dir(compile_dir))
sfile.write('\n\n/*\n')
license_idx = 0
for license in licenses:
sfile.write("\n\n\n### THIRD PARTY LICENSE #%s ###\n\n" % str(license_idx + 1))
sfile.write(write_file(license))
license_idx += 1
sfile.write('\n\n*/\n')
copy_if_different(temp_header, header_file)
copy_if_different(temp_source, source_file)
try:
os.remove(temp_header)
os.remove(temp_source)
except:
pass
def list_files(dname, file_list):
files = os.listdir(dname)
files.sort()
for fname in files:
if fname in excluded_files:
continue
fpath = os.path.join(dname, fname)
if os.path.isdir(fpath):
list_files(fpath, file_list)
elif fname.endswith(('.cpp', '.c', '.cc')):
if need_to_write_file(fpath):
file_list.append(fpath)
def list_sources():
file_list = []
for compile_dir in compile_directories:
list_files(compile_dir, file_list)
return file_list
def list_include_files_recursive(dname, file_list):
files = os.listdir(dname)
files.sort()
for fname in files:
if fname in excluded_files:
continue
fpath = os.path.join(dname, fname)
if os.path.isdir(fpath):
list_include_files_recursive(fpath, file_list)
elif fname.endswith(('.hpp', '.ipp', '.h', '.hh', '.tcc', '.inc')):
file_list.append(fpath)
def list_includes_files(include_dirs):
file_list = []
for include_dir in include_dirs:
list_include_files_recursive(include_dir, file_list)
return file_list
def list_includes():
return list_includes_files(include_paths)
def gather_file(current_file, source_files, header_files):
global linenumbers
global written_files
if not need_to_write_file(current_file, False):
return ""
written_files[current_file] = True
# first read this file
with open_utf8(current_file, 'r') as f:
text = f.read()
(statements, includes) = get_includes(current_file, text)
# find the linenr of the final #include statement we parsed
if len(statements) > 0:
index = text.find(statements[-1])
linenr = len(text[:index].split('\n'))
# now write all the dependencies of this header first
for i in range(len(includes)):
# source file inclusions are inlined into the main text
include_text = write_file(includes[i])
if linenumbers and i == len(includes) - 1:
# for the last include statement, we also include a #line directive
include_text += '\n#line %d "%s"\n' % (linenr, current_file)
if includes[i].endswith('.cpp') or includes[i].endswith('.cc') or includes[i].endswith('.c'):
# source file inclusions are inlined into the main text
text = text.replace(statements[i], include_text)
else:
text = text.replace(statements[i], '')
header_files.append(include_text)
# add the initial line here
if linenumbers:
text = '\n#line 1 "%s"\n' % (current_file,) + text
source_files.append(cleanup_file(text))
def gather_files(dir, source_files, header_files):
files = os.listdir(dir)
files.sort()
for fname in files:
if fname in excluded_files:
continue
fpath = os.path.join(dir, fname)
if os.path.isdir(fpath):
gather_files(fpath, source_files, header_files)
elif fname.endswith('.cpp') or fname.endswith('.c') or fname.endswith('.cc'):
gather_file(fpath, source_files, header_files)
def write_license(hfile):
hfile.write("// See https://raw.githubusercontent.com/duckdb/duckdb/main/LICENSE for licensing information\n\n")
def generate_amalgamation_splits(source_file, header_file, nsplits):
# construct duckdb.hpp from these headers
generate_duckdb_hpp(header_file)
# gather all files to read and write
source_files = []
header_files = []
for compile_dir in compile_directories:
if compile_dir != src_dir:
continue
gather_files(compile_dir, source_files, header_files)
# write duckdb-internal.hpp
if '.hpp' in header_file:
internal_header_file = header_file.replace('.hpp', '-internal.hpp')
elif '.h' in header_file:
internal_header_file = header_file.replace('.h', '-internal.h')
else:
raise "Unknown extension of header file"
temp_internal_header = internal_header_file + '.tmp'
with open_utf8(temp_internal_header, 'w+') as f:
write_license(f)
for hfile in header_files:
f.write(hfile)
# count the total amount of bytes in the source files
total_bytes = 0
for sfile in source_files:
total_bytes += len(sfile)
# now write the individual splits
# we approximate the splitting up by making every file have roughly the same amount of bytes
split_bytes = total_bytes / nsplits
current_bytes = 0
partitions = []
partition_names = []
current_partition = []
current_partition_idx = 1
for sfile in source_files:
current_partition.append(sfile)
current_bytes += len(sfile)
if current_bytes >= split_bytes:
partition_names.append(str(current_partition_idx))
partitions.append(current_partition)
current_partition = []
current_bytes = 0
current_partition_idx += 1
if len(current_partition) > 0:
partition_names.append(str(current_partition_idx))
partitions.append(current_partition)
current_partition = []
current_bytes = 0
# generate partitions from the third party libraries
for compile_dir in compile_directories:
if compile_dir != src_dir:
partition_names.append(compile_dir.split(os.sep)[-1])
partitions.append(write_dir(compile_dir))
header_file_name = header_file.split(os.sep)[-1]
internal_header_file_name = internal_header_file.split(os.sep)[-1]
partition_fnames = []
current_partition = 0
for partition in partitions:
partition_name = source_file.replace('.cpp', '-%s.cpp' % (partition_names[current_partition],))
temp_partition_name = partition_name + '.tmp'
partition_fnames.append([partition_name, temp_partition_name])
with open_utf8(temp_partition_name, 'w+') as f:
write_license(f)
f.write('#include "%s"\n#include "%s"' % (header_file_name, internal_header_file_name))
f.write(
'''
#ifndef DUCKDB_AMALGAMATION
#error header mismatch
#endif
'''
)
for sfile in partition:
f.write(sfile)
current_partition += 1
copy_if_different(temp_header, header_file)
copy_if_different(temp_internal_header, internal_header_file)
try:
os.remove(temp_header)
os.remove(temp_internal_header)
except:
pass
for p in partition_fnames:
copy_if_different(p[1], p[0])
try:
os.remove(p[1])
except:
pass
def list_include_dirs():
return include_paths
if __name__ == "__main__":
nsplits = 1
for arg in sys.argv:
if arg == '--linenumbers':
linenumbers = True
elif arg == '--no-linenumbers':
linenumbers = False
elif arg.startswith('--header='):
header_file = os.path.join(*arg.split('=', 1)[1].split('/'))
elif arg.startswith('--source='):
source_file = os.path.join(*arg.split('=', 1)[1].split('/'))
elif arg.startswith('--splits='):
nsplits = int(arg.split('=', 1)[1])
elif arg.startswith('--list-sources'):
file_list = list_sources()
print('\n'.join(file_list))
exit(1)
elif arg.startswith('--list-objects'):
file_list = list_sources()
print(' '.join([x.rsplit('.', 1)[0] + '.o' for x in file_list]))
exit(1)
elif arg.startswith('--includes'):
include_dirs = list_include_dirs()
print(' '.join(['-I' + x for x in include_dirs]))
exit(1)
elif arg.startswith('--include-directories'):
include_dirs = list_include_dirs()
print('\n'.join(include_dirs))
exit(1)
if os.path.exists(amal_dir):
shutil.rmtree(amal_dir)
os.makedirs(amal_dir)
if nsplits > 1:
generate_amalgamation_splits(source_file, header_file, nsplits)
else:
generate_amalgamation(source_file, header_file)
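# Example invocations (argument values are illustrative; the flags are the ones parsed above):
#   python scripts/amalgamation.py --linenumbers
#   python scripts/amalgamation.py --extended --splits=8
#   python scripts/amalgamation.py --header=out/duckdb.hpp --source=out/duckdb.cpp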


@@ -0,0 +1,68 @@
cmake_minimum_required(VERSION 3.15...3.29)
# Usage: cmake -DEXTENSION=path/to/extension.duckdb_extension -DPLATFORM_FILE=README.md -DDUCKDB_VERSION=tag1 -DEXTENSION_VERSION=tag2 -P scripts/append_metadata.cmake
# Currently hardcoded to host up to 8 fields
# Example: ./scripts/append_metadata.sh file.duckdb_extension git_hash_duckdb_file git_hash_extension_file platform_file
set(EXTENSION "" CACHE PATH "Path to the extension where to add metadata")
set(NULL_FILE "" CACHE PATH "Path to file containing a single 0 byte")
set(META1 "4" CACHE STRING "Metadata field" FORCE)
set(PLATFORM_FILE "" CACHE PATH "Metadata field: path of file containing duckdb_platform")
set(VERSION_FIELD "" CACHE STRING "Metadata field: path of file containing duckdb_version")
set(EXTENSION_VERSION "" CACHE STRING "Metadata field: path of file containing extension_version")
set(ABI_TYPE "" CACHE STRING "Metadata field: the ABI type of the extension")
set(META6 "" CACHE STRING "Metadata field")
set(META7 "" CACHE STRING "Metadata field")
set(META8 "" CACHE STRING "Metadata field")
# null.txt should contain exactly 1 byte of value \x00
file(READ "${NULL_FILE}" EMPTY_BYTE)
string(REPEAT "${EMPTY_BYTE}" 32 EMPTY_32)
string(REPEAT "${EMPTY_BYTE}" 256 EMPTY_256)
# 0 for custom section
string(APPEND CUSTOM_SECTION "${EMPTY_BYTE}")
# 213 in hex = 531 in decimal, total length of what follows (1 + 16 + 2 + 8x32 + 256)
# [1(continuation) + 0010011(payload) = \x93 -> 147, 0(continuation) + 0000100(payload) = \x04 -> 4]
# 10 in hex = 16 in decimal, length of name, 1 byte
string(ASCII 147 4 16 CUSTOM_SECTION_2)
string(APPEND CUSTOM_SECTION "${CUSTOM_SECTION_2}")
# the name of the WebAssembly custom section, 16 bytes
string(APPEND CUSTOM_SECTION "duckdb_signature")
# 200 in hex = 512 in decimal, length of the remaining payload (8x32 + 256)
# [1(continuation) + 0000000(payload) -> 128, 0(continuation) + 0000100(payload) -> 4],
# for a grand total of 2 bytes
string(ASCII 128 4 CUSTOM_SECTION_3)
string(APPEND CUSTOM_SECTION "${CUSTOM_SECTION_3}")
# Second metadata-field is special, since content comes from a file
file(READ "${PLATFORM_FILE}" META2)
# Build each METADATAx variable by padding with \x00 bytes and truncating to 32 bytes
string(SUBSTRING "${META1}${EMPTY_32}" 0 32 METADATA1)
string(SUBSTRING "${META2}${EMPTY_32}" 0 32 METADATA2)
string(SUBSTRING "${VERSION_FIELD}${EMPTY_32}" 0 32 METADATA3)
string(SUBSTRING "${EXTENSION_VERSION}${EMPTY_32}" 0 32 METADATA4)
string(SUBSTRING "${ABI_TYPE}${EMPTY_32}" 0 32 METADATA5)
string(SUBSTRING "${META6}${EMPTY_32}" 0 32 METADATA6)
string(SUBSTRING "${META7}${EMPTY_32}" 0 32 METADATA7)
string(SUBSTRING "${META8}${EMPTY_32}" 0 32 METADATA8)
# Append metadata fields, backwards
string(APPEND CUSTOM_SECTION "${METADATA8}")
string(APPEND CUSTOM_SECTION "${METADATA7}")
string(APPEND CUSTOM_SECTION "${METADATA6}")
string(APPEND CUSTOM_SECTION "${METADATA5}")
string(APPEND CUSTOM_SECTION "${METADATA4}")
string(APPEND CUSTOM_SECTION "${METADATA3}")
string(APPEND CUSTOM_SECTION "${METADATA2}")
string(APPEND CUSTOM_SECTION "${METADATA1}")
# Append signature (yet to be computed)
string(APPEND CUSTOM_SECTION "${EMPTY_256}")
# Append generated custom section to the extension
file(APPEND "${EXTENSION}" "${CUSTOM_SECTION}")
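# Resulting layout of the appended blob (sizes in bytes, as constructed above):
#   1    custom section id (0)
#   2    LEB128 length of the section contents (531)
#   1    length of the section name (16)
#   16   section name "duckdb_signature"
#   2    LEB128 length of the remaining payload (512)
#   256  metadata fields 8..1, 32 bytes each, \x00-padded
#   256  signature placeholder (yet to be computed)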


@@ -0,0 +1,99 @@
#!/usr/bin/env python3
import sys
import glob
import subprocess
import os
import tempfile
# Get the directory and construct the patch file pattern
directory = sys.argv[1]
patch_pattern = f"{directory}*.patch"
# Find patch files matching the pattern
patches = glob.glob(patch_pattern)
def raise_error(error_msg):
sys.stderr.write(error_msg + '\n')
sys.exit(1)
patches = sorted(os.listdir(directory))
for patch in patches:
if not patch.endswith('.patch'):
raise_error(
f'Patch file {patch} found in directory {directory} does not end in ".patch" - rename the patch file'
)
# Exit if no patches are found
if not patches:
error_message = (
f"\nERROR: Extension patching enabled, but no patches found in '{directory}'. "
"Please make sure APPLY_PATCHES is only enabled when there are actually patches present. "
"See .github/patches/extensions/README.md for more details."
)
raise_error(error_message)
current_dir = os.getcwd()
print(f"Applying patches at '{current_dir}'")
print(f"Resetting patches in {directory}\n")
# capture the current diff
diff_proc = subprocess.run(["git", "diff"], capture_output=True, check=True)
prev_diff = diff_proc.stdout
output_proc = subprocess.run(["git", "diff", "--numstat"], capture_output=True, check=True)
prev_output_lines = output_proc.stdout.decode('utf8').split('\n')
prev_output_lines.sort()
subprocess.run(["git", "clean", "-f"], check=True)
subprocess.run(["git", "reset", "--hard", "HEAD"], check=True)
def apply_patch(patch_file):
ARGUMENTS = ["patch", "-p1", "--forward", "-i"]
arguments = []
arguments.extend(ARGUMENTS)
arguments.append(patch_file)
try:
subprocess.run(arguments, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except subprocess.CalledProcessError as e:
arguments[1:1] = ['-d', current_dir]
command = " ".join(arguments)
print(f"Failed to apply patch, command to reproduce locally:\n{command}")
print("\nError output:")
print(e.stderr.decode('utf-8'))
print("\nStandard output:")
print(e.stdout.decode('utf-8'))
print("Exiting")
exit(1)
# Apply each patch file using patch
for patch in patches:
print(f"Applying patch: {patch}\n")
apply_patch(os.path.join(directory, patch))
# all patches have applied - check the current diff
output_proc = subprocess.run(["git", "diff", "--numstat"], capture_output=True, check=True)
output_lines = output_proc.stdout.decode('utf8').split('\n')
output_lines.sort()
if len(output_lines) <= len(prev_output_lines) and prev_output_lines != output_lines:
print("Detected local changes - rolling back patch application")
subprocess.run(["git", "clean", "-f"], check=True)
subprocess.run(["git", "reset", "--hard", "HEAD"], check=True)
with tempfile.NamedTemporaryFile() as f:
f.write(prev_diff)
apply_patch(f.name)
print("--------------------------------------------------")
print("Generate a patch file using the following command:")
print("--------------------------------------------------")
print(f"(cd {os.getcwd()} && git diff > {os.path.join(directory, 'fix.patch')})")
print("--------------------------------------------------")
exit(1)
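# Example invocation (the script path and extension name are placeholders; note the
# trailing slash, because the patch glob above is built as f"{directory}*.patch"):
#   python3 scripts/<this_script>.py .github/patches/extensions/<extension_name>/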


@@ -0,0 +1,126 @@
import json
import os
import sys
import glob
import time
import urllib.request
api_url = 'https://api.github.com/repos/duckdb/duckdb/'
if len(sys.argv) < 2:
print("Usage: [filename1] [filename2] ... ")
exit(1)
# this essentially should run on release tag builds to fill up release assets and master
repo = os.getenv("GITHUB_REPOSITORY", "")
if repo != "duckdb/duckdb":
print("Not running on forks. Exiting.")
exit(0)
ref = os.getenv("GITHUB_REF", '') # this env var is always present just not always used
if ref == 'refs/heads/main':
print("Not running on main branch. Exiting.")
exit(0)
elif ref.startswith('refs/tags/'):
tag = ref.replace('refs/tags/', '')
else:
print("Not running on branches. Exiting.")
exit(0)
print("Running on tag %s" % tag)
token = os.getenv("GH_TOKEN", "")
if token == "":
raise ValueError('need a GitHub token in GH_TOKEN')
def internal_gh_api(suburl, filename='', method='GET'):
url = api_url + suburl
headers = {"Content-Type": "application/json", 'Authorization': 'token ' + token}
body_data = b''
raw_resp = None
if len(filename) > 0:
method = 'POST'
body_data = open(filename, 'rb')
headers["Content-Type"] = "binary/octet-stream"
headers["Content-Length"] = os.path.getsize(local_filename)
url = suburl # cough
req = urllib.request.Request(url, body_data, headers)
req.get_method = lambda: method
print(f'GH API URL: "{url}" Filename: "{filename}" Method: "{method}"')
raw_resp = urllib.request.urlopen(req).read().decode()
if method != 'DELETE':
return json.loads(raw_resp)
else:
return {}
def gh_api(suburl, filename='', method='GET'):
timeout = 1
nretries = 10
success = False
for i in range(nretries + 1):
try:
response = internal_gh_api(suburl, filename, method)
success = True
except urllib.error.HTTPError as e:
print(e.read().decode()) # gah
except Exception as e:
print(e)
if success:
break
print(f"Failed upload, retrying in {timeout} seconds... ({i}/{nretries})")
time.sleep(timeout)
timeout = timeout * 2
if not success:
raise Exception("Failed to open URL " + suburl)
return response
# check if tag exists
resp = gh_api('git/ref/tags/%s' % tag)
if 'object' not in resp or 'sha' not in resp['object']: # or resp['object']['sha'] != sha
raise ValueError('tag %s not found' % tag)
resp = gh_api('releases/tags/%s' % tag)
if 'id' not in resp or 'upload_url' not in resp:
raise ValueError('release does not exist for tag %s' % tag)
# double-check that release exists and has correct sha
# disabled to not spam people watching releases
# if 'id' not in resp or 'upload_url' not in resp or 'target_commitish' not in resp or resp['target_commitish'] != sha:
# raise ValueError('release does not point to requested commit %s' % sha)
# TODO this could be a paged response!
assets = gh_api('releases/%s/assets' % resp['id'])
upload_url = resp['upload_url'].split('{')[0] # gah
files = sys.argv[1:]
for filename in files:
if '=' in filename:
parts = filename.split("=")
asset_filename = parts[0]
paths = glob.glob(parts[1])
if len(paths) != 1:
raise ValueError("Could not find file for pattern %s" % parts[1])
local_filename = paths[0]
else:
asset_filename = os.path.basename(filename)
local_filename = filename
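# e.g. an argument of the form "duckdb-binaries-linux.zip=build/release/duckdb-*.zip"
# (names are illustrative) uploads the single file matched by the glob under the asset
# name to the left of '='; a plain path is uploaded under its own basename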
# delete if present
for asset in assets:
if asset['name'] == asset_filename:
gh_api('releases/assets/%s' % asset['id'], method='DELETE')
resp = gh_api(f'{upload_url}?name={asset_filename}', filename=local_filename)
if 'id' not in resp:
raise ValueError('upload failed :/ ' + str(resp))
print("%s -> %s" % (local_filename, resp['browser_download_url']))

external/duckdb/scripts/asset-upload.py vendored Normal file

@@ -0,0 +1,106 @@
import json
import os
import sys
import glob
import mimetypes
import urllib.request
api_url = 'https://api.github.com/repos/duckdb/duckdb/'
if len(sys.argv) < 2:
print("Usage: [filename1] [filename2] ... ")
exit(1)
# this essentially should run on release tag builds to fill up release assets and main
pr = os.getenv("TRAVIS_PULL_REQUEST", "")
if pr != "false":
print("Not running on PRs. Exiting.")
exit(0)
tag = os.getenv("TRAVIS_TAG", '') # this env var is always present just not always used
if tag == '':
tag = 'main-builds'
print("Running on tag %s" % tag)
if tag == "main-builds" and os.getenv("TRAVIS_BRANCH", "") != "main":
print("Only running on main branch for %s tag. Exiting." % tag)
exit(0)
token = os.getenv("GH_TOKEN", "")
if token == "":
raise ValueError('need a GitHub token in GH_TOKEN')
def gh_api(suburl, filename='', method='GET'):
url = api_url + suburl
headers = {"Content-Type": "application/json", 'Authorization': 'token ' + token}
body_data = b''
if len(filename) > 0:
method = 'POST'
body_data = open(filename, 'rb')
mime_type = mimetypes.guess_type(filename)[0]
if mime_type is None:
mime_type = "application/octet-stream"
headers["Content-Type"] = mime_type
headers["Content-Length"] = os.path.getsize(local_filename)
url = suburl # cough
req = urllib.request.Request(url, body_data, headers)
req.get_method = lambda: method
try:
raw_resp = urllib.request.urlopen(req).read().decode()
except urllib.error.HTTPError as e:
raw_resp = e.read().decode() # gah
if method != 'DELETE':
return json.loads(raw_resp)
else:
return {}
# check if tag exists
resp = gh_api('git/ref/tags/%s' % tag)
if 'object' not in resp or 'sha' not in resp['object']: # or resp['object']['sha'] != sha
raise ValueError('tag %s not found' % tag)
resp = gh_api('releases/tags/%s' % tag)
if 'id' not in resp or 'upload_url' not in resp:
raise ValueError('release does not exist for tag %s' % tag)
# double-check that release exists and has correct sha
# disabled to not spam people watching releases
# if 'id' not in resp or 'upload_url' not in resp or 'target_commitish' not in resp or resp['target_commitish'] != sha:
# raise ValueError('release does not point to requested commit %s' % sha)
# TODO this could be a paged response!
assets = gh_api('releases/%s/assets' % resp['id'])
upload_url = resp['upload_url'].split('{')[0] # gah
files = sys.argv[1:]
for filename in files:
if '=' in filename:
parts = filename.split("=")
asset_filename = parts[0]
paths = glob.glob(parts[1])
if len(paths) != 1:
raise ValueError("Could not find file for pattern %s" % local_filename)
local_filename = paths[0]
else:
asset_filename = os.path.basename(filename)
local_filename = filename
# delete if present
for asset in assets:
if asset['name'] == asset_filename:
gh_api('releases/assets/%s' % asset['id'], method='DELETE')
resp = gh_api(upload_url + '?name=%s' % asset_filename, filename=local_filename)
if 'id' not in resp:
raise ValueError('upload failed :/ ' + str(resp))
print("%s -> %s" % (local_filename, resp['browser_download_url']))

external/duckdb/scripts/build_peg_grammar.sh vendored Executable file

@@ -0,0 +1,28 @@
#!/usr/bin/env bash
set -euo pipefail
# Print each command before executing (optional, for debug)
# set -x
# Activate virtual environment
if [[ -d ".venv" ]]; then
source .venv/bin/activate
else
echo "Error: .venv directory not found"
exit 1
fi
# Run grammar inlining with and without argument
GRAMMAR_FILE="extension/autocomplete/inline_grammar.py"
if [[ ! -f "$GRAMMAR_FILE" ]]; then
echo "Error: $GRAMMAR_FILE not found"
deactivate
exit 1
fi
python "$GRAMMAR_FILE" --grammar-file
python "$GRAMMAR_FILE"
echo "Successfully build PEG grammar files"
# Deactivate virtual environment
deactivate


@@ -0,0 +1,63 @@
import subprocess
import duckdb
import os
import pandas as pd
import argparse
from io import StringIO
parser = argparse.ArgumentParser(description='Cancel all workflows related to a PR.')
parser.add_argument(
'--title',
dest='title',
action='store',
help='The title of the PR for which we want to cancel workflows (or part of the title) - or "master" for all pushes',
required=True,
)
parser.add_argument(
'--repo', dest='repo', action='store', help='The repository to run this workflow on', default='duckdb/duckdb'
)
parser.add_argument(
'--max_workflows',
dest='max_workflows',
action='store',
help='The maximum number of workflows to look at (starting from the latest)',
default=200,
)
args = parser.parse_args()
nlimit = args.max_workflows
query = args.title
proc = subprocess.Popen(
[
'gh',
'run',
'-R',
args.repo,
'list',
'--json',
'displayTitle,databaseId,status,conclusion,headSha,event',
f'--limit={nlimit}',
],
stdout=subprocess.PIPE,
)
text = proc.stdout.read().decode('utf8')
df = pd.read_json(StringIO(text))
if query == 'master':
result = duckdb.query(
f"select databaseId from df WHERE status IN ('queued', 'in_progress') AND event='push'"
).fetchall()
else:
result = duckdb.query(
f"select databaseId from df WHERE status IN ('queued', 'in_progress') AND displayTitle LIKE '%{query}%'"
).fetchall()
if len(result) == 0:
print(
f"No workflows found in the latest {nlimit} workflows that contain the text {query}.\nPerhaps try running with a higher --max_workflows parameter?"
)
exit(1)
for databaseId in [x[0] for x in result]:
os.system(f'gh run -R {args.repo} cancel {databaseId}')
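# Example invocations (the title is illustrative; the script file name is a placeholder):
#   python3 <this_script>.py --title "Fix vector size regression" --repo duckdb/duckdb
#   python3 <this_script>.py --title master --max_workflows 500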


@@ -0,0 +1,33 @@
import re
import sys
post_text = sys.stdin.read()
sql_keyword_list = ["select", "from", "where", "join", "group by", "order by", "having", "with recursive", "union"]
sql_keyword_regex = f"({'|'.join(sql_keyword_list)})"
sql_keywords = len(re.findall(rf"{sql_keyword_regex}", post_text, flags=re.MULTILINE | re.IGNORECASE))
backticked_code_blocks = len(re.findall(r"^```", post_text))
indented_sql_code_lines = len(re.findall(rf"^ {sql_keyword_regex}", post_text, flags=re.MULTILINE | re.IGNORECASE))
indented_python_code_lines = len(re.findall(r"^ (import|duckdb)", post_text, flags=re.MULTILINE | re.IGNORECASE))
indented_r_code_lines = len(re.findall(r"^ (library|dbExecute)", post_text, flags=re.MULTILINE | re.IGNORECASE))
indented_hashbang_code_lines = len(re.findall(r"^ #!", post_text, flags=re.MULTILINE | re.IGNORECASE))
indented_code_lines = indented_sql_code_lines + indented_python_code_lines + indented_r_code_lines
inline_code_snippets = len(re.findall(r"`", post_text)) // 2
print("Metrics computed by 'check-issue-for-code-formatting.py':")
print(f"- {sql_keywords} SQL keyword(s)")
print(f"- {backticked_code_blocks} backticked code block(s)")
print(
f"- {indented_code_lines} indented code line(s): {indented_sql_code_lines} SQL, {indented_python_code_lines} Python, {indented_r_code_lines} R, {indented_hashbang_code_lines} hashbangs"
)
print(f"- {inline_code_snippets} inline code snippet(s)")
if sql_keywords > 2 and backticked_code_blocks == 0 and indented_code_lines == 0 and inline_code_snippets == 0:
print("The post is likely not properly formatted.")
exit(1)
else:
print("The post is likely properly formatted.")


@@ -0,0 +1,129 @@
import argparse
import os
import math
import re
parser = argparse.ArgumentParser(description='Check code coverage results')
parser.add_argument(
'--uncovered_files',
action='store',
help='Set of files that are not 100% covered',
default=os.path.join(".github", "config", "uncovered_files.csv"),
)
parser.add_argument('--directory', help='Directory of generated HTML files', action='store', default='coverage_html')
parser.add_argument('--fix', help='Fill up the uncovered_files.csv with all files', action='store_true', default=False)
args = parser.parse_args()
if not os.path.exists(args.directory):
print(f"The provided directory ({args.directory}) does not exist, please create it first")
exit(1)
covered_regex = (
r'<a name="(\d+)">[ \t\n]*<span class="lineNum">[ \t\n0-9]+</span><span class="{COVERED_CLASS}">[ \t\n0-9]+:([^<]+)'
)
def get_original_path(path):
return (
path.replace('.gcov.html', '')
.replace(os.getcwd(), '')
.replace('coverage_html' + os.path.sep, '')
.replace('home/runner/work/duckdb/duckdb/', '')
)
def cleanup_line(line):
return line.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"')
partial_coverage_dict = {}
with open(args.uncovered_files, 'r') as f:
for line in f.readlines():
splits = line.split('\t')
partial_coverage_dict[splits[0]] = int(splits[1].strip())
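# each line of the uncovered-files list is expected to be "<path><TAB><count>", e.g.
# "src/common/some_file.cpp<TAB>12" (path and count are illustrative), meaning up to
# 12 uncovered lines are tolerated for that file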
if args.fix:
uncovered_file = open(args.uncovered_files, 'w+')
DASH_COUNT = 80
total_difference = 0
allowed_difference = 0
def check_file(path, partial_coverage_dict):
global any_failed
global total_difference
if '.cpp' not in path and '.hpp' not in path:
# files are named [path].[ch]pp
return
if '.html' not in path:
return
with open(path, 'r') as f:
text = f.read()
original_path = get_original_path(path)
uncovered_lines = re.findall(covered_regex.replace('{COVERED_CLASS}', 'lineNoCov'), text)
covered_lines = re.findall(covered_regex.replace('{COVERED_CLASS}', 'lineCov'), text)
total_lines = len(uncovered_lines) + len(covered_lines)
if total_lines == 0:
# no lines to cover - skip
return
coverage_percentage = round(len(covered_lines) / (total_lines) * 100, 2)
expected_uncovered_lines = 0
if original_path in partial_coverage_dict:
expected_uncovered_lines = partial_coverage_dict[original_path]
if args.fix:
if expected_uncovered_lines == 0 and len(uncovered_lines) == 0:
return
expected_uncovered = max(expected_uncovered_lines, len(uncovered_lines) + 1)
uncovered_file.write(f'{original_path}\t{expected_uncovered}\n')
return
if len(uncovered_lines) > expected_uncovered_lines:
total_difference += len(uncovered_lines) - expected_uncovered_lines
print("-" * DASH_COUNT)
print(f"Coverage failure in file {original_path}")
print("-" * DASH_COUNT)
print(f"Coverage percentage: {coverage_percentage}%")
print(f"Uncovered lines: {len(uncovered_lines)}")
print(f"Covered lines: {len(covered_lines)}")
print("-" * DASH_COUNT)
print(f"Expected uncovered lines: {expected_uncovered_lines}")
print("-" * DASH_COUNT)
print("Uncovered lines")
print("-" * DASH_COUNT)
for e in uncovered_lines:
print(e[0] + ' ' * 8 + cleanup_line(e[1]))
def scan_directory(path):
file_list = []
if os.path.isfile(path):
file_list.append(path)
else:
files = os.listdir(path)
for file in files:
file_list += scan_directory(os.path.join(path, file))
return file_list
files = scan_directory(args.directory)
files.sort()
for file in files:
check_file(file, partial_coverage_dict)
if args.fix:
uncovered_file.close()
if total_difference > allowed_difference:
exit(1)
elif total_difference > 0:
print("-" * DASH_COUNT)
print("SUCCESS-ish")
print("-" * DASH_COUNT)
print(f"{total_difference} lines were uncovered but this falls within the margin of {allowed_difference}")


@@ -0,0 +1,369 @@
#!/usr/bin/env python3
#
# ===- clang-tidy-diff.py - ClangTidy Diff Checker -----------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===-----------------------------------------------------------------------===#
r"""
ClangTidy Diff Checker
======================
This script reads input from a unified diff, runs clang-tidy on all changed
files and outputs clang-tidy warnings in changed lines only. This is useful to
detect clang-tidy regressions in the lines touched by a specific patch.
Example usage for git/svn users:
git diff -U0 HEAD^ | clang-tidy-diff.py -p1
svn diff --diff-cmd=diff -x-U0 | \
clang-tidy-diff.py -fix -checks=-*,modernize-use-override
"""
import argparse
import glob
import json
import multiprocessing
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
import traceback
try:
import yaml
except ImportError:
yaml = None
is_py2 = sys.version[0] == "2"
if is_py2:
import Queue as queue
else:
import queue as queue
def run_tidy(task_queue, lock, timeout, failed_files):
watchdog = None
while True:
command = task_queue.get()
try:
proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if timeout is not None:
watchdog = threading.Timer(timeout, proc.kill)
watchdog.start()
stdout, stderr = proc.communicate()
if proc.returncode != 0:
if proc.returncode < 0:
msg = "Terminated by signal %d : %s\n" % (
-proc.returncode,
" ".join(command),
)
stderr += msg.encode("utf-8")
failed_files.append(command)
with lock:
sys.stdout.write(stdout.decode("utf-8") + "\n")
sys.stdout.flush()
if stderr:
sys.stderr.write(stderr.decode("utf-8") + "\n")
sys.stderr.flush()
except Exception as e:
with lock:
sys.stderr.write("Failed: " + str(e) + ": ".join(command) + "\n")
finally:
with lock:
if not (timeout is None or watchdog is None):
if not watchdog.is_alive():
sys.stderr.write("Terminated by timeout: " + " ".join(command) + "\n")
watchdog.cancel()
task_queue.task_done()
def start_workers(max_tasks, tidy_caller, arguments):
for _ in range(max_tasks):
t = threading.Thread(target=tidy_caller, args=arguments)
t.daemon = True
t.start()
def merge_replacement_files(tmpdir, mergefile):
"""Merge all replacement files in a directory into a single file"""
# The fixes suggested by clang-tidy >= 4.0.0 are given under
# the top level key 'Diagnostics' in the output yaml files
mergekey = "Diagnostics"
merged = []
for replacefile in glob.iglob(os.path.join(tmpdir, "*.yaml")):
content = yaml.safe_load(open(replacefile, "r"))
if not content:
continue # Skip empty files.
merged.extend(content.get(mergekey, []))
if merged:
# MainSourceFile: The key is required by the definition inside
# include/clang/Tooling/ReplacementsYaml.h, but the value
# is actually never used inside clang-apply-replacements,
# so we set it to '' here.
output = {"MainSourceFile": "", mergekey: merged}
with open(mergefile, "w") as out:
yaml.safe_dump(output, out)
else:
# Empty the file:
open(mergefile, "w").close()
def main():
parser = argparse.ArgumentParser(
description="Run clang-tidy against changed files, and " "output diagnostics only for modified " "lines."
)
parser.add_argument(
"-clang-tidy-binary",
metavar="PATH",
default="clang-tidy",
help="path to clang-tidy binary",
)
parser.add_argument(
"-p",
metavar="NUM",
default=0,
help="strip the smallest prefix containing P slashes",
)
parser.add_argument(
"-regex",
metavar="PATTERN",
default=None,
help="custom pattern selecting file paths to check " "(case sensitive, overrides -iregex)",
)
parser.add_argument(
"-iregex",
metavar="PATTERN",
default=r".*\.(cpp|cc|c\+\+|cxx|c|cl|h|hpp|m|mm|inc)",
help="custom pattern selecting file paths to check " "(case insensitive, overridden by -regex)",
)
parser.add_argument(
"-j",
type=int,
default=1,
help="number of tidy instances to be run in parallel.",
)
parser.add_argument("-timeout", type=int, default=None, help="timeout per each file in seconds.")
parser.add_argument("-fix", action="store_true", default=False, help="apply suggested fixes")
parser.add_argument(
"-checks",
help="checks filter, when not specified, use clang-tidy " "default",
default="",
)
parser.add_argument(
"-config-file",
dest="config_file",
help="Specify the path of .clang-tidy or custom config file",
default="",
)
parser.add_argument("-use-color", action="store_true", help="Use colors in output")
parser.add_argument("-path", dest="build_path", help="Path used to read a compile command database.")
if yaml:
parser.add_argument(
"-export-fixes",
metavar="FILE_OR_DIRECTORY",
dest="export_fixes",
help="A directory or a yaml file to store suggested fixes in, "
"which can be applied with clang-apply-replacements. If the "
"parameter is a directory, the fixes of each compilation unit are "
"stored in individual yaml files in the directory.",
)
else:
parser.add_argument(
"-export-fixes",
metavar="DIRECTORY",
dest="export_fixes",
help="A directory to store suggested fixes in, which can be applied "
"with clang-apply-replacements. The fixes of each compilation unit are "
"stored in individual yaml files in the directory.",
)
parser.add_argument(
"-extra-arg",
dest="extra_arg",
action="append",
default=[],
help="Additional argument to append to the compiler " "command line.",
)
parser.add_argument(
"-extra-arg-before",
dest="extra_arg_before",
action="append",
default=[],
help="Additional argument to prepend to the compiler " "command line.",
)
parser.add_argument(
"-quiet",
action="store_true",
default=False,
help="Run clang-tidy in quiet mode",
)
parser.add_argument(
"-load",
dest="plugins",
action="append",
default=[],
help="Load the specified plugin in clang-tidy.",
)
parser.add_argument(
"-allow-no-checks",
action="store_true",
help="Allow empty enabled checks.",
)
clang_tidy_args = []
argv = sys.argv[1:]
if "--" in argv:
clang_tidy_args.extend(argv[argv.index("--") :])
argv = argv[: argv.index("--")]
args = parser.parse_args(argv)
# Extract changed lines for each file.
filename = None
lines_by_file = {}
for line in sys.stdin:
match = re.search('^\\+\\+\\+\\ "?(.*?/){%s}([^ \t\n"]*)' % args.p, line)
if match:
filename = match.group(2)
if filename is None:
continue
if args.regex is not None:
if not re.match("^%s$" % args.regex, filename):
continue
else:
if not re.match("^%s$" % args.iregex, filename, re.IGNORECASE):
continue
match = re.search(r"^@@.*\+(\d+)(,(\d+))?", line)
if match:
start_line = int(match.group(1))
line_count = 1
if match.group(3):
line_count = int(match.group(3))
if line_count == 0:
continue
end_line = start_line + line_count - 1
lines_by_file.setdefault(filename, []).append([start_line, end_line])
if not any(lines_by_file):
print("No relevant changes found.")
sys.exit(0)
max_task_count = args.j
if max_task_count == 0:
max_task_count = multiprocessing.cpu_count()
max_task_count = min(len(lines_by_file), max_task_count)
combine_fixes = False
export_fixes_dir = None
delete_fixes_dir = False
if args.export_fixes is not None:
# if a directory is given, create it if it does not exist
if args.export_fixes.endswith(os.path.sep) and not os.path.isdir(args.export_fixes):
os.makedirs(args.export_fixes)
if not os.path.isdir(args.export_fixes):
if not yaml:
raise RuntimeError(
"Cannot combine fixes in one yaml file. Either install PyYAML or specify an output directory."
)
combine_fixes = True
if os.path.isdir(args.export_fixes):
export_fixes_dir = args.export_fixes
if combine_fixes:
export_fixes_dir = tempfile.mkdtemp()
delete_fixes_dir = True
# Tasks for clang-tidy.
task_queue = queue.Queue(max_task_count)
# A lock for console output.
lock = threading.Lock()
# List of files with a non-zero return code.
failed_files = []
# Run a pool of clang-tidy workers.
start_workers(max_task_count, run_tidy, (task_queue, lock, args.timeout, failed_files))
# Form the common args list.
common_clang_tidy_args = []
if args.fix:
common_clang_tidy_args.append("-fix")
if args.checks != "":
common_clang_tidy_args.append("-checks=" + args.checks)
if args.config_file != "":
common_clang_tidy_args.append("-config-file=" + args.config_file)
if args.quiet:
common_clang_tidy_args.append("-quiet")
if args.build_path is not None:
common_clang_tidy_args.append("-p=%s" % args.build_path)
if args.use_color:
common_clang_tidy_args.append("--use-color")
if args.allow_no_checks:
common_clang_tidy_args.append("--allow-no-checks")
for arg in args.extra_arg:
common_clang_tidy_args.append("-extra-arg=%s" % arg)
for arg in args.extra_arg_before:
common_clang_tidy_args.append("-extra-arg-before=%s" % arg)
for plugin in args.plugins:
common_clang_tidy_args.append("-load=%s" % plugin)
for name in lines_by_file:
line_filter_json = json.dumps([{"name": name, "lines": lines_by_file[name]}], separators=(",", ":"))
# Run clang-tidy on files containing changes.
command = [args.clang_tidy_binary]
command.append("-line-filter=" + line_filter_json)
if args.export_fixes is not None:
# Get a temporary file. We immediately close the handle so clang-tidy can
# overwrite it.
(handle, tmp_name) = tempfile.mkstemp(suffix=".yaml", dir=export_fixes_dir)
os.close(handle)
command.append("-export-fixes=" + tmp_name)
command.extend(common_clang_tidy_args)
command.append(name)
command.extend(clang_tidy_args)
task_queue.put(command)
# Wait for all threads to be done.
task_queue.join()
# Application return code
return_code = 0
if failed_files:
return_code = 1
if combine_fixes:
print("Writing fixes to " + args.export_fixes + " ...")
try:
merge_replacement_files(export_fixes_dir, args.export_fixes)
except:
sys.stderr.write("Error exporting fixes.\n")
traceback.print_exc()
return_code = 1
if delete_fixes_dir:
shutil.rmtree(export_fixes_dir)
sys.exit(return_code)
if __name__ == "__main__":
main()


@@ -0,0 +1,19 @@
#!/bin/bash
rm -f hash_concats
touch hash_concats
split -b 1M $1
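# split names the 1 MiB chunks xaa, xab, ... (default prefix "x"), which the glob below picks up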
FILES="x*"
for f in $FILES
do
# sha256 a segment
openssl dgst -binary -sha256 $f >> hash_concats
rm $f
done
# sha256 the concatenation
openssl dgst -binary -sha256 hash_concats > hash_composite
cat hash_composite

external/duckdb/scripts/coverage_check.sh vendored Executable file

@@ -0,0 +1,27 @@
#!/bin/bash
set -e
# prepare coverage file
lcov --config-file .github/workflows/lcovrc --zerocounters --directory .
lcov --config-file .github/workflows/lcovrc --capture --initial --directory . --base-directory . --no-external --output-file coverage.info
# build with coverage enabled
mkdir -p build/coverage
(cd build/coverage && cmake -E env CXXFLAGS="--coverage" cmake -DBUILD_EXTENSIONS="parquet;json;jemalloc;autocomplete;icu" -DENABLE_SANITIZER=0 -DCMAKE_BUILD_TYPE=Debug ../.. && cmake --build .)
# run tests
build/coverage/test/unittest
build/coverage/test/unittest "[detailed_profiler]"
build/coverage/test/unittest test/sql/tpch/tpch_sf01.test_slow
python3 -m pytest --shell-binary build/coverage/duckdb tools/shell/tests/
# finalize coverage file
lcov --config-file .github/workflows/lcovrc --directory . --base-directory . --no-external --capture --output-file coverage.info
lcov --config-file .github/workflows/lcovrc --remove coverage.info $(< .github/workflows/lcov_exclude) -o lcov.info
# generate coverage html
genhtml -o coverage_html lcov.info
# check that coverage passes threshold
# python3 scripts/check_coverage.py


@@ -0,0 +1,63 @@
import json, os, sys, glob, mimetypes, urllib.request, re
api_url = 'https://api.github.com/repos/duckdb/duckdb/'
if len(sys.argv) < 2:
print("Usage: [last_tag] ")
exit(1)
token = os.getenv("GH_TOKEN", "")
if token == "":
raise ValueError('need a GitHub token in GH_TOKEN')
# amazingly this is the entire code of the PyPI package `linkheader-parser`
def extract(link_header):
"""Extract links and their relations from a Link Header Field."""
links = [l.strip() for l in link_header.split(',')]
rels = {}
pattern = r'<(?P<url>.*)>;\s*rel="(?P<rel>.*)"'
for link in links:
group_dict = re.match(pattern, link).groupdict()
rels[group_dict['rel']] = group_dict['url']
return rels
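# e.g. a header like '<https://api.github.com/repos/duckdb/duckdb/pulls?page=2>; rel="next",
# <https://api.github.com/repos/duckdb/duckdb/pulls?page=9>; rel="last"' (URLs illustrative)
# is parsed into {'next': '...page=2', 'last': '...page=9'}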
def gh_api(suburl, full_url=''):
if full_url == '':
url = api_url + suburl
else:
url = full_url
headers = {"Content-Type": "application/json", 'Authorization': 'token ' + token}
req = urllib.request.Request(url, b'', headers)
req.get_method = lambda: 'GET'
next_link = None
try:
resp = urllib.request.urlopen(req)
if not resp.getheader("Link") is None:
link_data = extract(resp.getheader("Link"))
if "next" in link_data:
next_link = link_data["next"]
raw_resp = resp.read().decode()
except urllib.error.HTTPError as e:
raw_resp = e.read().decode() # gah
ret_json = json.loads(raw_resp)
if next_link is not None:
return ret_json + gh_api('', full_url=next_link)
return ret_json
# get time of tag
old_release = gh_api('releases/tags/%s' % sys.argv[1])
print(old_release["published_at"])
pulls = gh_api('pulls?base=main&state=closed')
for p in pulls:
if p["merged_at"] is None:
continue
if p["merged_at"] < old_release["published_at"]:
continue
print(" - #%s: %s" % (p["number"], p["title"]))


@@ -0,0 +1,43 @@
###
# This script copies all extensions in a build folder from their cmake-produced structure into the extension repository
# structure of ./<duckdb_version>/<build_architecture>/<extension_name>.duckdb_extension
# Note that it requires the duckdb_platform_out file to be populated with the platform
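# e.g. an extension ends up at <local_repo>/v1.4.0/linux_amd64/httpfs.duckdb_extension
# (version, platform and extension name are illustrative)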
import os
import sys
import subprocess
import glob
import shutil
if len(sys.argv) != 6:
print(
"Usage: scripts/create_local_extension_repo.py <duckdb_version> <duckdb_platform_out> <path/to/duckdb/build> <path/to/local_repo> <postfix>"
)
exit(1)
duckdb_version = sys.argv[1]
duckdb_platform_out = sys.argv[2]
extension_path = sys.argv[3]
dst_path = sys.argv[4]
postfix = sys.argv[5]
if os.name == 'nt':
duckdb_platform_out = duckdb_platform_out.replace("/", "\\")
extension_path = extension_path.replace("/", "\\")
dst_path = dst_path.replace("/", "\\")
with open(duckdb_platform_out, 'r') as f:
lines = f.readlines()
duckdb_platform = lines[0].strip()
# Create destination path
dest_path = os.path.join(dst_path, duckdb_version, duckdb_platform)
if not os.path.exists(dest_path):
os.makedirs(dest_path)
# Now copy over the extensions to the correct path
glob_string = os.path.join(extension_path, 'extension', '*', '*.' + postfix)
for file in glob.glob(glob_string):
dest_file = os.path.join(dest_path, os.path.basename(file))
shutil.copy(file, dest_file)

external/duckdb/scripts/create_patch.py vendored Normal file

@@ -0,0 +1,147 @@
import os
import argparse
import sys
import re
import subprocess
from typing import List, Dict
from pathlib import Path
SCRIPT_DIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(description="Generate a patch file for a DuckDB extension.")
parser.add_argument(
"repository_path",
type=str,
help="Path to the repository where the changes live that should be turned into a patch.",
)
parser.add_argument(
"extension_name",
type=str,
help="Name of the extension to patch, should match the name in `.github/config/extensions/<extension_name>.cmake`.",
)
parser.add_argument("patch_name", type=str, help="Name for the patch file to create.")
parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the patch file if it already exists.")
args = parser.parse_args()
def verify_git_tag():
# Locate the cmake file to extract the GIT_TAG from
cmake_path = Path(SCRIPT_DIR) / '..' / ".github" / "config" / "extensions" / f"{args.extension_name}.cmake"
if not cmake_path.is_file():
print(f"Error: Extension CMake file not found: {cmake_path}")
sys.exit(1)
cmake_content = cmake_path.read_text()
# Extract GIT_TAG from the cmake file
match = re.search(r"\bGIT_TAG\s+([^\s\)]+)", cmake_content)
if not match:
print(f"Error: Could not find GIT_TAG in {cmake_path}")
sys.exit(1)
git_tag_in_cmake = match.group(1)
# Get the current commit hash in repository_path
try:
result = subprocess.run(
["git", "rev-parse", "HEAD"],
cwd=args.repository_path,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True,
)
current_commit = result.stdout.strip()
except subprocess.CalledProcessError as e:
print(f"Error: Failed to run git in {args.repository_path}{e.stderr.strip()}")
sys.exit(1)
# Compare the tags
if git_tag_in_cmake != current_commit:
print(
f"Error: GIT_TAG in {cmake_path} is {git_tag_in_cmake}, "
f"but repository {args.repository_path} is checked out at {current_commit}."
)
sys.exit(1)
def create_patch():
# Collect changes with git diff
try:
diff_result = subprocess.run(
["git", "diff", "--ignore-submodules"],
cwd=args.repository_path,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True,
)
except subprocess.CalledProcessError as e:
print(f"Error: Failed to run git diff — {e.stderr.strip()}")
sys.exit(1)
new_patch_content = diff_result.stdout
if not new_patch_content.strip():
print("⚠️ No changes detected in repository; no patch will be created.")
sys.exit(0)
def parse_patch_files_and_lines(patch_text):
changes = {}
current_file = None
for line in patch_text.splitlines():
if line.startswith("diff --git"):
parts = line.split()
if len(parts) >= 3:
# Format: diff --git a/file b/file
current_file = parts[2][2:] # remove 'a/'
changes.setdefault(current_file, set())
elif line.startswith("@@") and current_file:
# Format: @@ -old_start,old_count +new_start,new_count @@
m = re.match(r"@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@", line)
if m:
start = int(m.group(1))
length = int(m.group(2) or "1")
for l in range(start, start + length):
changes[current_file].add(l)
return changes
new_changes = parse_patch_files_and_lines(new_patch_content)
# Check conflicts with existing patches
patch_dir = (Path(SCRIPT_DIR) / ".." / ".github" / "patches" / "extensions" / args.extension_name).resolve()
patch_dir.mkdir(parents=True, exist_ok=True)
for existing_patch in patch_dir.glob("*.patch"):
if existing_patch.name == f"{args.patch_name}.patch":
if not args.overwrite:
print(f"A patch by the name '{args.patch_name}.patch' already exists, failed to create patch")
sys.exit(1)
else:
continue
existing_changes = parse_patch_files_and_lines(existing_patch.read_text())
for file, lines in new_changes.items():
if file in existing_changes:
overlap = lines & existing_changes[file]
if overlap:
print(f"❌ Conflict detected with existing patch: {existing_patch.name}")
print(f" File: {file}")
print(f" Overlapping lines: {sorted(overlap)}")
sys.exit(1)
# Save patch file
patch_dir = (Path(SCRIPT_DIR) / ".." / ".github" / "patches" / "extensions" / args.extension_name).resolve()
patch_dir.mkdir(parents=True, exist_ok=True)
patch_path = patch_dir / f"{args.patch_name}.patch"
patch_path.write_text(diff_result.stdout)
verify_git_tag()
create_patch()
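
For clarity, a small standalone example of the hunk-header parsing used for conflict detection above; the sample hunk header is invented:

import re

hunk = "@@ -10,3 +12,4 @@ void SomeFunction() {"  # hypothetical hunk header
m = re.match(r"@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@", hunk)
start = int(m.group(1))           # 12: first changed line in the new file
length = int(m.group(2) or "1")   # 4: number of lines in the new-file hunk
print(set(range(start, start + length)))  # {12, 13, 14, 15}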

View File

@@ -0,0 +1,71 @@
import subprocess
import sys
import os
if len(sys.argv) < 2 or not os.path.isfile(sys.argv[1]):
print("Usage: [libduckdb dynamic library file, release build]")
exit(1)
res = subprocess.run('nm -g -C -P'.split(' ') + [sys.argv[1]], check=True, capture_output=True)
if res.returncode != 0:
raise ValueError('Failed to run `nm`')
culprits = []
whitelist = [
'@GLIBC',
'@CXXABI',
'__gnu_cxx::',
'std::',
'N6duckdb',
'duckdb::',
'duckdb_miniz::',
'duckdb_fmt::',
'duckdb_hll::',
'duckdb_moodycamel::',
'duckdb_yyjson::',
'duckdb_',
'RefCounter',
'registerTMCloneTable',
'RegisterClasses',
'Unwind_Resume',
'__gmon_start',
'_fini',
'_init',
'_version',
'_end',
'_edata',
'__bss_start',
'__udivti3',
'__popcount',
'Adbc',
'ErrorArrayStream',
'ErrorFromArrayStream',
]
for symbol in res.stdout.decode('utf-8').split('\n'):
if len(symbol.strip()) == 0:
continue
if symbol.endswith(' U'): # undefined because dynamic linker
continue
if symbol.endswith(' U 0 0') and "random_device" not in symbol: # undefined because dynamic linker
continue
is_whitelisted = False
for entry in whitelist:
if entry in symbol and "random_device" not in symbol:
is_whitelisted = True
if is_whitelisted:
continue
culprits.append(symbol)
if len(culprits) > 0:
print("Found leaked symbols. Either white-list above or change visibility:")
for symbol in culprits:
print(symbol)
sys.exit(1)
sys.exit(0)
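
For reference, `nm -g -C -P` prints one symbol per line in the POSIX "name type value size" layout; a hypothetical sketch of how the filtering above treats such lines (symbol names and addresses are invented):

# Hypothetical nm -g -C -P style output, for illustration only.
sample = [
    "duckdb::DuckDB::DuckDB(duckdb::DBConfig*) T 0000000000401000 0000000000000080",  # whitelisted via 'duckdb::'
    "malloc U",  # undefined, resolved by the dynamic linker
    "my_leaked_helper(int) T 0000000000402000 0000000000000010",  # neither whitelisted nor undefined
]
whitelist = ['duckdb::', 'std::', '@GLIBC']
culprits = [
    s for s in sample
    if s.strip() and not s.endswith(' U') and not any(w in s for w in whitelist)
]
print(culprits)  # only the leaked helper remains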

View File

@@ -0,0 +1,51 @@
#!/bin/bash
# Uploads all extensions found in <base_dir_glob> (default: build/release/extension/*)
# this script is used by DuckDB CI to upload all extensions at once
# Usage: ./extension-upload-all.sh <architecture> <duckdb_version> [<base_dir_glob>]
# The directory that the script lives in, thanks @Tishj
script_dir="$(dirname "$(readlink -f "$0")")"
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Usage: ./extension-upload-all.sh <architecture> <duckdb_version> [<base_dir_glob>]"
exit 1
fi
if [ -z "$3" ]; then
BASE_DIR="build/release/extension/*"
else
BASE_DIR="$3"
fi
set -e
# Ensure we do nothing on failed globs
shopt -s nullglob
# Print dry run / real run
if [ "$DUCKDB_DEPLOY_SCRIPT_MODE" == "for_real" ]; then
echo "Deploying extensions.."
else
echo "Deploying extensions.. (DRY RUN)"
fi
if [[ $1 == wasm* ]]; then
FILES="$BASE_DIR/*.duckdb_extension.wasm"
else
FILES="$BASE_DIR/*.duckdb_extension"
fi
for f in $FILES
do
if [[ $1 == wasm* ]]; then
ext_name=`basename $f .duckdb_extension.wasm`
else
ext_name=`basename $f .duckdb_extension`
fi
echo "found extension: '$ext_name'"
# args: <name> <extension_version> <duckdb_version> <architecture> <s3_bucket> <copy_to_latest> <copy_to_versioned> [<path_to_ext>]
$script_dir/extension-upload-single.sh $ext_name "" "$2" "$1" "duckdb-core-extensions" true false "$(dirname "$f")"
done

View File

@@ -0,0 +1,109 @@
#!/bin/bash
# This script deploys the extension binaries that are currently deployed to the nightly bucket to the main bucket
# WARNING: don't use this script if you don't know exactly what you're doing. To deploy a binary:
# - Run the script with ./extension-upload-from-nightly.sh <extension_name> <duckdb_version> (<nightly_commit>)
# - CHECK the output of the dry run thoroughly
# - If successful, set the DUCKDB_DEPLOY_SCRIPT_MODE env variable to the correct value
# - run the script again now deploying for real
# - check the output
# - unset the DUCKDB_DEPLOY_SCRIPT_MODE env var
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Usage: ./extension-upload-from-nightly.sh <extension_name> <duckdb_version> (<nightly_commit>)"
exit 1
fi
if [ -z "$3" ]; then
BASE_NIGHTLY_DIR="$2"
else
BASE_NIGHTLY_DIR="$1/$3/$2"
fi
# CONFIG
FROM_BUCKET=duckdb-extensions-nightly
TO_BUCKET=duckdb-core-extensions
CLOUDFRONT_DISTRIBUTION_ID=E2Z28NDMI4PVXP
### COPY THE FILES
## REAL_RUN is to be used to move non-Wasm extensions
REAL_RUN="aws s3 cp s3://$FROM_BUCKET/$BASE_NIGHTLY_DIR s3://$TO_BUCKET/$2 --recursive --exclude '*' --include '*/$1.duckdb_extension.gz' --acl public-read --region us-east-2"
DRY_RUN="$REAL_RUN --dryrun"
## REAL_RUN_WASM is to be used to move Wasm extensions to new style path (no extra duckdb-wasm)
REAL_RUN_WASM="aws s3 cp s3://$FROM_BUCKET/$BASE_NIGHTLY_DIR s3://$TO_BUCKET/$2 --recursive --exclude '*' --include '*/$1.duckdb_extension.wasm' --acl public-read --content-encoding br --content-type='application/wasm' --region us-east-2"
DRY_RUN_WASM="$REAL_RUN_WASM --dryrun"
if [ "$DUCKDB_DEPLOY_SCRIPT_MODE" == "for_real" ]; then
echo "DEPLOYING"
echo "> FROM: $FROM_BUCKET"
echo "> TO : $TO_BUCKET"
echo "> AWS CLI deploy: "
eval "$REAL_RUN"
eval "$REAL_RUN_WASM"
else
echo "DEPLOYING (DRY RUN)"
echo "> FROM: $FROM_BUCKET"
echo "> TO : $TO_BUCKET"
echo "> AWS CLI Dry run: "
eval "$DRY_RUN"
eval "$DRY_RUN_WASM"
fi
echo ""
### INVALIDATE THE CLOUDFRONT CACHE AND CLOUDFLARE
# For double checking we are invalidating the correct domain
CLOUDFRONT_ORIGINS=`aws cloudfront get-distribution --id $CLOUDFRONT_DISTRIBUTION_ID --query 'Distribution.DistributionConfig.Origins.Items[*].DomainName' --output text`
# Parse the dry run output
output=$(eval "$DRY_RUN" && eval "$DRY_RUN_WASM" && eval "$DRY_RUN_WASM_OLD_STYLE")
s3_paths=()
while IFS= read -r line; do
if [[ $line == *"copy:"* ]]; then
s3_path=$(echo $line | grep -o 's3://[^ ]*' | awk 'NR%2==0' | awk -F "s3://$TO_BUCKET" '{print $2}' | cut -d' ' -f1)
s3_paths+=("$s3_path")
fi
done <<< "$output"
if [ "$DUCKDB_DEPLOY_SCRIPT_MODE" == "for_real" ]; then
echo "CLOUDFRONT INVALIDATION"
echo "> Total files: ${#s3_paths[@]}"
echo "> Domain: $CLOUDFRONT_ORIGINS"
for path in "${s3_paths[@]}"; do
aws cloudfront create-invalidation --distribution-id "$CLOUDFRONT_DISTRIBUTION_ID" --paths "$path"
done
else
echo "INVALIDATION (DRY RUN)"
echo "> Total files: ${#s3_paths[@]}"
echo "> Domain: $CLOUDFRONT_ORIGINS"
echo "> Paths:"
for path in "${s3_paths[@]}"; do
echo " $path"
done
fi
echo ""
if [ ! -z "$CLOUDFLARE_CACHE_PURGE_TOKEN" ]; then
if [ "$DUCKDB_DEPLOY_SCRIPT_MODE" == "for_real" ]; then
echo "CLOUDFLARE INVALIDATION"
echo "> Total files: ${#s3_paths[@]}"
for path in "${s3_paths[@]}"; do
curl --request POST --url https://api.cloudflare.com/client/v4/zones/84f631c38b77d4631b561207f2477332/purge_cache --header 'Content-Type: application/json' --header "Authorization: Bearer $CLOUDFLARE_CACHE_PURGE_TOKEN" --data "{\"files\": [\"http://extensions.duckdb.org$path\"]}"
echo ""
done
else
echo "CLOUDFLARE INVALIDATION (DRY RUN)"
echo "> Total files: ${#s3_paths[@]}"
echo "> Domain: $CLOUDFRONT_ORIGINS"
echo "> Paths:"
for path in "${s3_paths[@]}"; do
echo " http://extensions.duckdb.org$path"
done
fi
else
echo "##########################################"
echo "WARNING! CLOUDFLARE INVALIDATION DISABLED!"
echo "##########################################"
fi

View File

@@ -0,0 +1,56 @@
#!/bin/bash
# Uploads all extensions found in <base_dir_glob> (default: build/release/extension/*)
# this script is used by DuckDB CI to upload all extensions at once
# Usage: ./extension-upload-all.sh <base_dir_glob>
# Expected directory structure: <base_dir_glob>/<duckdb_version>/<architecture>/
# The directory that the script lives in, thanks @Tishj
script_dir="$(dirname "$(readlink -f "$0")")"
if [ -z "$1" ]; then
BASE_DIR="build/release/repository/*"
else
BASE_DIR="$1"
fi
echo $BASE_DIR
set -e
# Ensure we do nothing on failed globs
shopt -s nullglob
if [ "$DUCKDB_DEPLOY_SCRIPT_MODE" == "for_real" ]; then
echo "Deploying extensions.."
else
echo "Deploying extensions.. (DRY RUN)"
fi
for version_dir in $BASE_DIR/*; do
duckdb_version=$(basename "$version_dir")
for arch_dir in "$version_dir"/*; do
architecture=$(basename "$arch_dir")
if [[ $architecture == wasm* ]]; then
FILES="$arch_dir/*.duckdb_extension.wasm"
else
FILES="$arch_dir/*.duckdb_extension"
fi
for f in $FILES; do
if [[ $architecture == wasm* ]]; then
ext_name=`basename $f .duckdb_extension.wasm`
else
ext_name=`basename $f .duckdb_extension`
fi
echo "Processing extension: $ext_name (architecture: $architecture, version: $duckdb_version, path: $f)"
# args: <name> <extension_version> <duckdb_version> <architecture> <s3_bucket> <copy_to_latest> <copy_to_versioned> [<path_to_ext>]
$script_dir/extension-upload-single.sh $ext_name "" "$duckdb_version" "$architecture" "duckdb-core-extensions" true false "$(dirname "$f")"
done
echo ""
done
done

View File

@@ -0,0 +1,94 @@
#!/bin/bash
# Main extension uploading script
# Note: use the DUCKDB_DEPLOY_SCRIPT_MODE variable to disable dryrun mode
# Usage: ./extension-upload-single.sh <name> <extension_version> <duckdb_version> <architecture> <s3_bucket> <copy_to_latest> <copy_to_versioned> [<path_to_ext>]
# <name> : Name of the extension
# <extension_version> : Version (commit / version tag) of the extension
# <duckdb_version> : Version (commit / version tag) of DuckDB
# <architecture> : Architecture target of the extension binary
# <s3_bucket> : S3 bucket to upload to
# <copy_to_latest> : Set this as the latest version ("true" / "false", default: "false")
# <copy_to_versioned> : Set this as a versioned version that will not be overwritten
# <path_to_ext> : (optional) Search this path for the extension
set -e
if [ -z "$8" ]; then
BASE_EXT_DIR="/tmp/extension"
else
BASE_EXT_DIR="$8"
fi
if [[ $4 == wasm* ]]; then
ext="$BASE_EXT_DIR/$1.duckdb_extension.wasm"
else
ext="$BASE_EXT_DIR/$1.duckdb_extension"
fi
script_dir="$(dirname "$(readlink -f "$0")")"
# calculate SHA256 hash of extension binary
cat $ext > $ext.append
( command -v truncate && truncate -s -256 $ext.append ) || ( command -v gtruncate && gtruncate -s -256 $ext.append ) || exit 1
# (Optionally) Sign binary
if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then
echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem
$script_dir/compute-extension-hash.sh $ext.append > $ext.hash
openssl pkeyutl -sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign
rm -f private.pem
else
# Default to 256 zeros
dd if=/dev/zero of=$ext.sign bs=256 count=1
fi
# append signature to extension binary
cat $ext.sign >> $ext.append
# compress extension binary
if [[ $4 == wasm_* ]]; then
brotli < $ext.append > "$ext.compressed"
else
gzip < $ext.append > "$ext.compressed"
fi
set -e
# Abort if AWS key is not set
if [ -z "$AWS_ACCESS_KEY_ID" ]; then
echo "No AWS key found, skipping.."
exit 0
fi
# Set dry run unless guard var is set
DRY_RUN_PARAM="--dryrun"
if [ "$DUCKDB_DEPLOY_SCRIPT_MODE" == "for_real" ]; then
DRY_RUN_PARAM=""
fi
# upload versioned version
if [[ $7 = 'true' ]]; then
if [ -z "$3" ]; then
echo "extension-upload-single.sh called with upload_versioned=true but no extension version was passed"
exit 1
fi
if [[ $4 == wasm* ]]; then
aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm $DRY_RUN_PARAM --acl public-read --content-encoding br --content-type="application/wasm"
else
aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz $DRY_RUN_PARAM --acl public-read
fi
fi
# upload to latest version
if [[ $6 = 'true' ]]; then
if [[ $4 == wasm* ]]; then
aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm $DRY_RUN_PARAM --acl public-read --content-encoding br --content-type="application/wasm"
else
aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz $DRY_RUN_PARAM --acl public-read
fi
fi

View File

@@ -0,0 +1,64 @@
#!/bin/bash
set -e
set -x
CMAKE_CONFIG=Release
EXT_BASE_PATH=build/release
if [ "${FORCE_32_BIT:0}" == "1" ]; then
FORCE_32_BIT_FLAG="-DFORCE_32_BIT=1"
else
FORCE_32_BIT_FLAG=""
fi
FILES="${EXT_BASE_PATH}/extension/*/*.duckdb_extension"
EXTENSION_LIST=""
for f in $FILES
do
ext=`basename $f .duckdb_extension`
EXTENSION_LIST="${EXTENSION_LIST}-$ext"
done
mkdir -p testext
cd testext
if [ "$2" = "oote" ]; then
CMAKE_ROOT="../duckdb"
else
CMAKE_ROOT=".."
fi
cmake -DCMAKE_BUILD_TYPE=${CMAKE_CONFIG} ${FORCE_32_BIT_FLAG} -DEXTENSION_TESTS_ONLY=1 -DDUCKDB_EXTENSION_CONFIGS=".github/config/in_tree_extensions.cmake;.github/config/out_of_tree_extensions.cmake" ${CMAKE_ROOT}
cmake --build . --config ${CMAKE_CONFIG}
cd ..
duckdb_path="testext/duckdb"
unittest_path="testext/test/unittest"
if [ ! -f "${duckdb_path}" ]; then
duckdb_path="testext/${CMAKE_CONFIG}/duckdb.exe"
unittest_path="testext/test/${CMAKE_CONFIG}/unittest.exe"
fi
${duckdb_path} -c "FROM duckdb_extensions()"
for f in $FILES
do
ext=`basename $f .duckdb_extension`
install_path=${ext}
unsigned_flag=
if [ "$1" = "local" ]
then
install_path=${f}
unsigned_flag=-unsigned
fi
echo ${install_path}
${duckdb_path} ${unsigned_flag} -c "FORCE INSTALL '${install_path}'"
${duckdb_path} ${unsigned_flag} -c "LOAD '${ext}'"
done
# Only run tests for non-local, we have tested in enough other ways
if [ "$1" != "local" ]
then
${unittest_path} --autoloading all --skip-compiled
fi

View File

@@ -0,0 +1,59 @@
#!/bin/bash
# Usage: ./extension-upload-wasm.sh <architecture> <commithash or version_tag>
set -e
# Ensure we do nothing on failed globs
shopt -s nullglob
if [[ -z "${DUCKDB_EXTENSION_SIGNING_PK}" ]]; then
# no private key provided, use the test private key (NOT SAFE)
# this is made so private.pem at the end of the block will be in
# a valid state, and the rest of the signing process can be tested
# even without providing the key
cp test/mbedtls/private.pem private.pem
else
# actual private key provided
echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem
fi
FILES="build/to_be_deployed/$2/$1/*.duckdb_extension.wasm"
for f in $FILES
do
ext=`basename $f .duckdb_extension.wasm`
echo $ext
# calculate SHA256 hash of extension binary
cat $f > $f.append
# 0 for custom section
# 113 in hex = 275 in decimal, total length of what follows (1 + 16 + 2 + 256)
# [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02]
echo -n -e '\x00' >> $f.append
echo -n -e '\x93\x02' >> $f.append
# 10 in hex = 16 in decimal, length of name, 1 byte
echo -n -e '\x10' >> $f.append
echo -n -e 'duckdb_signature' >> $f.append
# the name of the WebAssembly custom section, 16 bytes
# 100 in hex, 256 in decimal
# [1(continuation) + 0000000(payload) = \x80, 0(continuation) + 10(payload) = \x02],
# for a grand total of 2 bytes
echo -n -e '\x80\x02' >> $f.append
# the actual payload, 256 bytes, to be added later
scripts/compute-extension-hash.sh $f.append > $f.hash
# encrypt hash with extension signing private key to create signature
openssl pkeyutl -sign -in $f.hash -inkey private.pem -pkeyopt digest:sha256 -out $f.sign
# append signature to extension binary
cat $f.sign >> $f.append
# compress extension binary
brotli < $f.append > "$f.brotli"
# upload compressed extension binary to S3
if [[ -z "${AWS_SECRET_ACCESS_KEY}" ]]; then
#AWS_SECRET_ACCESS_KEY is empty -> dry run
aws s3 cp $f.brotli s3://duckdb-core-extensions/$2/$1/$ext.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" --dryrun
else
aws s3 cp $f.brotli s3://duckdb-core-extensions/$2/$1/$ext.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm"
fi
done
# remove private key
rm private.pem
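
The two-byte length prefixes in the comments above are unsigned LEB128 encodings; a small standalone check of those byte values:

def uleb128(n):
    # Unsigned LEB128: 7 payload bits per byte, high bit set on every byte except the last.
    out = bytearray()
    while True:
        byte = n & 0x7F
        n >>= 7
        if n:
            out.append(byte | 0x80)
        else:
            out.append(byte)
            return bytes(out)

assert uleb128(275) == b'\x93\x02'  # 1 + 16 + 2 + 256: total custom-section payload length
assert uleb128(16) == b'\x10'       # length of the section name "duckdb_signature"
assert uleb128(256) == b'\x80\x02'  # length of the 256-byte signature payload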

461
external/duckdb/scripts/format.py vendored Normal file
View File

@@ -0,0 +1,461 @@
#!/usr/bin/python
# this script is used to format the source directory
import os
import time
import sys
import inspect
import subprocess
import difflib
import re
import tempfile
import uuid
import concurrent.futures
import argparse
import shutil
import traceback
from python_helpers import open_utf8
try:
ver = subprocess.check_output(('black', '--version'), text=True)
if int(ver.split(' ')[1].split('.')[0]) < 24:
print('you need to run `pip install "black>=24"`', ver)
exit(-1)
except Exception as e:
print('you need to run `pip install "black>=24"`', e)
exit(-1)
try:
ver = subprocess.check_output(('clang-format', '--version'), text=True)
if '11.' not in ver:
print('you need to run `pip install clang_format==11.0.1 - `', ver)
exit(-1)
except Exception as e:
print('you need to run `pip install clang_format==11.0.1 - `', e)
exit(-1)
cpp_format_command = 'clang-format --sort-includes=0 -style=file'
cmake_format_command = 'cmake-format'
try:
subprocess.check_output(('cmake-format', '--version'), text=True)
except Exception as e:
print('you need to run `pip install cmake-format`', e)
exit(-1)
extensions = [
'.cpp',
'.ipp',
'.c',
'.hpp',
'.h',
'.cc',
'.hh',
'CMakeLists.txt',
'.test',
'.test_slow',
'.test_coverage',
'.benchmark',
'.py',
'.java',
]
formatted_directories = ['src', 'benchmark', 'test', 'tools', 'examples', 'extension', 'scripts']
ignored_files = [
'tpch_constants.hpp',
'tpcds_constants.hpp',
'_generated',
'tpce_flat_input.hpp',
'test_csv_header.hpp',
'duckdb.cpp',
'duckdb.hpp',
'json.hpp',
'sqlite3.h',
'shell.c',
'termcolor.hpp',
'test_insert_invalid.test',
'httplib.hpp',
'os_win.c',
'glob.c',
'printf.c',
'helper.hpp',
'single_thread_ptr.hpp',
'types.hpp',
'default_views.cpp',
'default_functions.cpp',
'release.h',
'genrand.cpp',
'address.cpp',
'visualizer_constants.hpp',
'icu-collate.cpp',
'icu-collate.hpp',
'yyjson.cpp',
'yyjson.hpp',
'duckdb_pdqsort.hpp',
'pdqsort.h',
'stubdata.cpp',
'nf_calendar.cpp',
'nf_calendar.h',
'nf_localedata.cpp',
'nf_localedata.h',
'nf_zformat.cpp',
'nf_zformat.h',
'expr.cc',
'function_list.cpp',
'inlined_grammar.hpp',
]
ignored_directories = [
'.eggs',
'__pycache__',
'dbgen',
os.path.join('tools', 'rpkg', 'src', 'duckdb'),
os.path.join('tools', 'rpkg', 'inst', 'include', 'cpp11'),
os.path.join('extension', 'tpcds', 'dsdgen'),
os.path.join('extension', 'jemalloc', 'jemalloc'),
os.path.join('extension', 'icu', 'third_party'),
os.path.join('tools', 'nodejs', 'src', 'duckdb'),
]
format_all = False
check_only = True
confirm = True
silent = False
force = False
parser = argparse.ArgumentParser(prog='python scripts/format.py', description='Format source directory files')
parser.add_argument(
'revision', nargs='?', default='HEAD', help='Revision number or --all to format all files (default: HEAD)'
)
parser.add_argument('--check', action='store_true', help='Only print differences (default)')
parser.add_argument('--fix', action='store_true', help='Fix the files')
parser.add_argument('-a', '--all', action='store_true', help='Format all files')
parser.add_argument('-d', '--directories', nargs='*', default=[], help='Format specified directories')
parser.add_argument('-y', '--noconfirm', action='store_true', help='Skip confirmation prompt')
parser.add_argument('-q', '--silent', action='store_true', help='Suppress output')
parser.add_argument('-f', '--force', action='store_true', help='Force formatting')
args = parser.parse_args()
revision = args.revision
if args.check and args.fix:
parser.print_usage()
exit(1)
check_only = not args.fix
confirm = not args.noconfirm
silent = args.silent
force = args.force
format_all = args.all
if args.directories:
formatted_directories = args.directories
def file_is_ignored(full_path):
if os.path.basename(full_path) in ignored_files:
return True
dirnames = os.path.sep.join(full_path.split(os.path.sep)[:-1])
for ignored_directory in ignored_directories:
if ignored_directory in dirnames:
return True
return False
def can_format_file(full_path):
global extensions, formatted_directories, ignored_files
if not os.path.isfile(full_path):
return False
fname = full_path.split(os.path.sep)[-1]
found = False
# check file extension
for ext in extensions:
if full_path.endswith(ext):
found = True
break
if not found:
return False
# check ignored files
if file_is_ignored(full_path):
return False
# now check file directory
for dname in formatted_directories:
if full_path.startswith(dname):
return True
return False
action = "Formatting"
if check_only:
action = "Checking"
def get_changed_files(revision):
proc = subprocess.Popen(['git', 'diff', '--name-only', revision], stdout=subprocess.PIPE)
files = proc.stdout.read().decode('utf8').split('\n')
changed_files = []
for f in files:
if not can_format_file(f):
continue
if file_is_ignored(f):
continue
changed_files.append(f)
return changed_files
if os.path.isfile(revision):
print(action + " individual file: " + revision)
changed_files = [revision]
elif os.path.isdir(revision):
print(action + " files in directory: " + revision)
changed_files = [os.path.join(revision, x) for x in os.listdir(revision)]
print("Changeset:")
for fname in changed_files:
print(fname)
elif not format_all:
if revision == 'main':
# fetch new changes when comparing to main
os.system("git fetch origin main:main")
print(action + " since branch or revision: " + revision)
changed_files = get_changed_files(revision)
if len(changed_files) == 0:
print("No changed files found!")
exit(0)
print("Changeset:")
for fname in changed_files:
print(fname)
else:
print(action + " all files")
if confirm and not check_only:
print("The files listed above will be reformatted.")
result = input("Continue with changes (y/n)?\n")
if result != 'y':
print("Aborting.")
exit(0)
format_commands = {
'.cpp': cpp_format_command,
'.ipp': cpp_format_command,
'.c': cpp_format_command,
'.hpp': cpp_format_command,
'.h': cpp_format_command,
'.hh': cpp_format_command,
'.cc': cpp_format_command,
'.txt': cmake_format_command,
'.py': 'black --quiet - --skip-string-normalization --line-length 120 --stdin-filename',
'.java': cpp_format_command,
}
difference_files = []
header_top = "//===----------------------------------------------------------------------===//\n"
header_top += "// DuckDB\n" + "//\n"
header_bottom = "//\n" + "//\n"
header_bottom += "//===----------------------------------------------------------------------===//\n\n"
base_dir = os.path.join(os.getcwd(), 'src/include')
def get_formatted_text(f, full_path, directory, ext):
if not can_format_file(full_path):
if not force:
print(
"File "
+ full_path
+ " is not normally formatted - but attempted to format anyway. Use --force if formatting is desirable"
)
exit(1)
if f == 'list.hpp':
# fill in list file
file_list = [
os.path.join(dp, f)
for dp, dn, filenames in os.walk(directory)
for f in filenames
if os.path.splitext(f)[1] == '.hpp' and not f.endswith("list.hpp")
]
file_list = [x.replace('src/include/', '') for x in file_list]
file_list.sort()
result = ""
for x in file_list:
result += '#include "%s"\n' % (x)
return result
if ext == ".hpp" and directory.startswith("src/include"):
with open_utf8(full_path, 'r') as f:
lines = f.readlines()
# format header in files
header_middle = "// " + os.path.relpath(full_path, base_dir) + "\n"
text = header_top + header_middle + header_bottom
is_old_header = True
for line in lines:
if not (line.startswith("//") or line.startswith("\n")) and is_old_header:
is_old_header = False
if not is_old_header:
text += line
if ext == '.test' or ext == '.test_slow' or ext == '.test_coverage' or ext == '.benchmark':
f = open_utf8(full_path, 'r')
lines = f.readlines()
f.close()
found_name = False
found_group = False
group_name = full_path.split('/')[-2]
new_path_line = '# name: ' + full_path + '\n'
new_group_line = '# group: [' + group_name + ']' + '\n'
found_diff = False
# Find description.
found_description = False
for line in lines:
if line.lower().startswith('# description:') or line.lower().startswith('#description:'):
if found_description:
print("Error formatting file " + full_path + ", multiple lines starting with # description found")
exit(1)
found_description = True
new_description_line = '# description: ' + line.split(':', 1)[1].strip() + '\n'
# Filter old meta.
meta = ['#name:', '# name:', '#description:', '# description:', '#group:', '# group:']
lines = [line for line in lines if not any(line.lower().startswith(m) for m in meta)]
# Clean up empty leading lines.
while lines and not lines[0].strip():
lines.pop(0)
# Ensure header is prepended.
header = [new_path_line]
if found_description:
header.append(new_description_line)
header.append(new_group_line)
header.append('\n')
return ''.join(header + lines)
proc_command = format_commands[ext].split(' ') + [full_path]
proc = subprocess.Popen(
proc_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=open(full_path) if ext == '.py' else None
)
new_text = proc.stdout.read().decode('utf8')
stderr = proc.stderr.read().decode('utf8')
if len(stderr) > 0:
print(os.getcwd())
print("Failed to format file " + full_path)
print(' '.join(proc_command))
print(stderr)
exit(1)
new_text = new_text.replace('\r', '')
new_text = re.sub(r'\n*$', '', new_text)
return new_text + '\n'
def file_is_generated(text):
if '// This file is automatically generated by scripts/' in text:
return True
return False
def format_file(f, full_path, directory, ext):
global difference_files
with open_utf8(full_path, 'r') as f:
old_text = f.read()
# do not format auto-generated files
if file_is_generated(old_text) and ext != '.py':
return
old_lines = old_text.split('\n')
new_text = get_formatted_text(f, full_path, directory, ext)
if ext in ('.cpp', '.hpp'):
new_text = new_text.replace('ARGS &&...args', 'ARGS &&... args')
if check_only:
new_lines = new_text.split('\n')
old_lines = [x for x in old_lines if '...' not in x]
new_lines = [x for x in new_lines if '...' not in x]
diff_result = difflib.unified_diff(old_lines, new_lines)
total_diff = ""
for diff_line in diff_result:
total_diff += diff_line + "\n"
total_diff = total_diff.strip()
if len(total_diff) > 0:
print("----------------------------------------")
print("----------------------------------------")
print("Found differences in file " + full_path)
print("----------------------------------------")
print("----------------------------------------")
print(total_diff)
difference_files.append(full_path)
else:
tmpfile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
with open_utf8(tmpfile, 'w+') as f:
f.write(new_text)
shutil.move(tmpfile, full_path)
class ToFormatFile:
def __init__(self, filename, full_path, directory):
self.filename = filename
self.full_path = full_path
self.directory = directory
self.ext = '.' + filename.split('.')[-1]
def format_directory(directory):
files = os.listdir(directory)
files.sort()
result = []
for f in files:
full_path = os.path.join(directory, f)
if os.path.isdir(full_path):
if f in ignored_directories or full_path in ignored_directories:
continue
result += format_directory(full_path)
elif can_format_file(full_path):
result += [ToFormatFile(f, full_path, directory)]
return result
files = []
if format_all:
try:
os.system(cmake_format_command.replace("${FILE}", "CMakeLists.txt"))
except:
pass
for direct in formatted_directories:
files += format_directory(direct)
else:
for full_path in changed_files:
splits = full_path.split(os.path.sep)
fname = splits[-1]
dirname = os.path.sep.join(splits[:-1])
files.append(ToFormatFile(fname, full_path, dirname))
def process_file(f):
if not silent:
print(f.full_path)
try:
format_file(f.filename, f.full_path, f.directory, f.ext)
except:
print(traceback.format_exc())
sys.exit(1)
# Create thread for each file
with concurrent.futures.ThreadPoolExecutor() as executor:
try:
threads = [executor.submit(process_file, f) for f in files]
# Wait for all tasks to complete
concurrent.futures.wait(threads)
except KeyboardInterrupt:
executor.shutdown(wait=True, cancel_futures=True)
raise
if check_only:
if len(difference_files) > 0:
print("")
print("")
print("")
print("Failed format-check: differences were found in the following files:")
for fname in difference_files:
print("- " + fname)
print('Run "make format-fix" to fix these differences automatically')
exit(1)
else:
print("Passed format-check")
exit(0)

View File

@@ -0,0 +1,81 @@
import os
import re
import json
header = '''//===----------------------------------------------------------------------===//
// DuckDB
//
// duckdb/catalog/default/builtin_types/types.hpp
//
//
//===----------------------------------------------------------------------===//
// This file is generated by scripts/generate_builtin_types.py
#pragma once
#include "duckdb/common/types.hpp"
#include "duckdb/common/array.hpp"
namespace duckdb {
'''
footer = '''} // namespace duckdb
'''
def normalize_path_separators(x):
return os.path.sep.join(x.split('/'))
def legal_struct_name(name):
return name.isalnum()
def get_struct_name(function_name):
return function_name.replace('_', ' ').title().replace(' ', '') + 'Fun'
def sanitize_string(text):
return text.replace('"', '\\"')
new_text = header
type_entries = []
json_path = normalize_path_separators(f'src/include/duckdb/catalog/default/builtin_types/types.json')
with open(json_path, 'r') as f:
parsed_json = json.load(f)
# Extract all the types from the json
for type in parsed_json:
names = type['names']
type_id = type['id']
type_entries += ['\t{' + f'''"{name}", LogicalTypeId::{type_id}''' + '}' for name in names]
TYPE_COUNT = len(type_entries)
new_text += '''
struct DefaultType {
const char *name;
LogicalTypeId type;
};
'''
new_text += f'''
using builtin_type_array = std::array<DefaultType, {TYPE_COUNT}>;
'''
new_text += '''
static constexpr const builtin_type_array BUILTIN_TYPES{{
'''
type_text = ",\n".join(type_entries)
new_text += type_text
new_text += '''
}};
'''
new_text += footer
with open('src/include/duckdb/catalog/default/builtin_types/types.hpp', 'w+') as f:
f.write(new_text)
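
For illustration, a minimal sketch of the JSON shape the script consumes and the entry text it produces; the "BIGINT"/"bigint"/"int8" values are placeholders rather than the actual contents of types.json:

import json

parsed_json = json.loads('[{"id": "BIGINT", "names": ["bigint", "int8"]}]')  # hypothetical input

type_entries = []
for entry in parsed_json:
    type_entries += ['\t{' + f'"{name}", LogicalTypeId::{entry["id"]}' + '}' for name in entry['names']]
print(",\n".join(type_entries))
# 	{"bigint", LogicalTypeId::BIGINT},
# 	{"int8", LogicalTypeId::BIGINT}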

1002
external/duckdb/scripts/generate_c_api.py vendored Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,98 @@
# this script generates data for the TPC-H dbgen
import os
from python_helpers import open_utf8
def get_csv_text(fpath, add_null_terminator=False):
with open(fpath, 'rb') as f:
text = bytearray(f.read())
result_text = ""
first = True
for byte in text:
if first:
result_text += str(byte)
else:
result_text += ", " + str(byte)
first = False
if add_null_terminator:
result_text += ", 0"
return result_text
def write_dir(dirname, varname):
files = os.listdir(dirname)
files.sort()
result = ""
aggregated_result = "const char *%s[] = {\n" % (varname,)
for fname in files:
file_varname = "%s_%s" % (varname, fname.split('.')[0])
result += "const uint8_t %s[] = {" % (file_varname,) + get_csv_text(os.path.join(dirname, fname), True) + "};\n"
aggregated_result += "\t(const char*) %s,\n" % (file_varname,)
aggregated_result = aggregated_result[:-2] + "\n};\n"
return result + aggregated_result
# ------------------------------------------- #
# ------------------------------------------- #
# ------------- TPC-H ------------ #
# ------------------------------------------- #
# ------------------------------------------- #
tpch_dir = 'extension/tpch/dbgen'
tpch_queries = os.path.join(tpch_dir, 'queries')
tpch_answers_sf001 = os.path.join(tpch_dir, 'answers', 'sf0.01')
tpch_answers_sf01 = os.path.join(tpch_dir, 'answers', 'sf0.1')
tpch_answers_sf1 = os.path.join(tpch_dir, 'answers', 'sf1')
tpch_header = os.path.join(tpch_dir, 'include', 'tpch_constants.hpp')
def create_tpch_header(tpch_dir):
result = """/* THIS FILE WAS AUTOMATICALLY GENERATED BY generate_csv_header.py */
#pragma once
const int TPCH_QUERIES_COUNT = 22;
"""
# write the queries
result += write_dir(tpch_queries, "TPCH_QUERIES")
result += write_dir(tpch_answers_sf001, "TPCH_ANSWERS_SF0_01")
result += write_dir(tpch_answers_sf01, "TPCH_ANSWERS_SF0_1")
result += write_dir(tpch_answers_sf1, "TPCH_ANSWERS_SF1")
with open_utf8(tpch_header, 'w+') as f:
f.write(result)
print(tpch_header)
create_tpch_header(tpch_dir)
# ------------------------------------------- #
# ------------------------------------------- #
# ------------- TPC-DS ------------ #
# ------------------------------------------- #
# ------------------------------------------- #
tpcds_dir = 'extension/tpcds/dsdgen'
tpcds_queries = os.path.join(tpcds_dir, 'queries')
tpcds_answers_sf001 = os.path.join(tpcds_dir, 'answers', 'sf0.01')
tpcds_answers_sf1 = os.path.join(tpcds_dir, 'answers', 'sf1')
tpcds_header = os.path.join(tpcds_dir, 'include', 'tpcds_constants.hpp')
def create_tpcds_header(tpch_dir):
result = """/* THIS FILE WAS AUTOMATICALLY GENERATED BY generate_csv_header.py */
#pragma once
const int TPCDS_QUERIES_COUNT = 99;
const int TPCDS_TABLE_COUNT = 24;
"""
# write the queries
result += write_dir(tpcds_queries, "TPCDS_QUERIES")
result += write_dir(tpcds_answers_sf001, "TPCDS_ANSWERS_SF0_01")
result += write_dir(tpcds_answers_sf1, "TPCDS_ANSWERS_SF1")
with open_utf8(tpcds_header, 'w+') as f:
f.write(result)
print(tpcds_header)
create_tpcds_header(tpcds_dir)
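
As a small illustration of the embedding format used by get_csv_text (the query text is invented), each file becomes a comma-separated list of its bytes, optionally followed by a NUL terminator:

# Hypothetical example of what get_csv_text produces for a tiny query file.
data = bytearray(b"SELECT 1;")
print(", ".join(str(b) for b in data) + ", 0")
# 83, 69, 76, 69, 67, 84, 32, 49, 59, 0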

View File

@@ -0,0 +1,245 @@
import os
import csv
import re
import argparse
import glob
os.chdir(os.path.dirname(__file__))
# Don't generate serialization for these enums
blacklist = [
"RegexOptions",
"Flags",
"ContainerType",
"Type",
"DictionaryAppendState",
"DictFSSTMode",
"ComplexJSONType",
]
enum_util_header_file = os.path.join("..", "src", "include", "duckdb", "common", "enum_util.hpp")
enum_util_source_file = os.path.join("..", "src", "common", "enum_util.cpp")
# Overrides conversions for the following enums:
overrides = {
"LogicalTypeId": {
"SQLNULL": "NULL",
"TIMESTAMP_TZ": "TIMESTAMP WITH TIME ZONE",
"TIME_TZ": "TIME WITH TIME ZONE",
"TIMESTAMP_SEC": "TIMESTAMP_S",
},
"JoinType": {"OUTER": "FULL"},
"OrderType": {
"ORDER_DEFAULT": ["ORDER_DEFAULT", "DEFAULT"],
"DESCENDING": ["DESCENDING", "DESC"],
"ASCENDING": ["ASCENDING", "ASC"],
},
"OrderByNullType": {
"ORDER_DEFAULT": ["ORDER_DEFAULT", "DEFAULT"],
"NULLS_FIRST": ["NULLS FIRST", "NULLS_FIRST"],
"NULLS_LAST": ["NULLS LAST", "NULLS_LAST"],
},
"CheckpointAbort": {
"NO_ABORT": "NONE",
"DEBUG_ABORT_BEFORE_TRUNCATE": "BEFORE_TRUNCATE",
"DEBUG_ABORT_BEFORE_HEADER": "BEFORE_HEADER",
"DEBUG_ABORT_AFTER_FREE_LIST_WRITE": "AFTER_FREE_LIST_WRITE",
},
"SampleMethod": {"SYSTEM_SAMPLE": "System", "BERNOULLI_SAMPLE": "Bernoulli", "RESERVOIR_SAMPLE": "Reservoir"},
"TableReferenceType": {"EMPTY_FROM": "EMPTY"},
"LogLevel": {
"LOG_TRACE": "TRACE",
"LOG_DEBUG": "DEBUG",
"LOG_INFO": "INFO",
"LOG_WARN": "WARN",
"LOG_ERROR": "ERROR",
"LOG_FATAL": "FATAL",
},
"RequestType": {
"GET_REQUEST": "GET",
"PUT_REQUEST": "PUT",
"HEAD_REQUEST": "HEAD",
"DELETE_REQUEST": "DELETE",
"POST_REQUEST": "POST",
},
"ArrowFormatVersion": {"V1_0": "1.0", "V1_1": "1.1", "V1_2": "1.2", "V1_3": "1.3", "V1_4": "1.4", "V1_5": "1.5"},
}
# get all the headers
hpp_files = []
for root, dirs, files in os.walk(os.path.join("..", "src")):
for file in files:
# Don't include the generated header itself recursively
if file == "enum_util.hpp":
continue
if 'amalgamation' in root:
continue
if file.endswith(".hpp"):
hpp_files.append(os.path.join(root, file))
def remove_prefix(str, prefix):
if str.startswith(prefix):
return str[len(prefix) :]
return str
# get all the enum classes
enums = []
enum_paths = []
enum_path_set = set()
for hpp_file in hpp_files:
with open(hpp_file, "r") as f:
text = f.read()
for res in re.finditer(r"enum class (\w*)\s*:\s*(\w*)\s*{((?:\s*[^}])*)}", text, re.MULTILINE):
file_path = remove_prefix(os.path.relpath(hpp_file, os.path.join("..", "src")), "include/")
enum_name = res.group(1)
if enum_name in blacklist:
print(f"Skipping {enum_name} because it is blacklisted")
continue
enum_type = res.group(2)
enum_members = []
# Capture All members: \w+(\s*\=\s*-?\w*)?
# group one is the member name
# group two is the member value
# First clean group from comments
s = res.group(3)
s = re.sub(r"\/\/.*", "", s)
s = re.sub(r"\/\*.*\*\/", "", s)
enum_values = {}
for member in re.finditer(r"(\w+)(\s*\=\s*-?\w*)?", s):
key = member.group(1)
strings = [key]
if enum_name in overrides and key in overrides[enum_name]:
override = overrides[enum_name][key]
if isinstance(override, list):
print(f"Overriding {enum_name}::{key} to one of {override}")
strings = override
else:
print(f"Overriding {enum_name}::{key} to {override}")
strings = [override]
if member.group(2):
# If the member has a value, make sure it isn't already covered by another member
# If it is, we can't do anything other than ignore it
value = remove_prefix(member.group(2).strip(), "=").strip()
if value not in enum_values and value not in dict(enum_members):
enum_members.append((key, strings))
else:
print(f"Skipping {enum_name}::{key} because it has a duplicate value {value}")
else:
enum_members.append((key, strings))
if not file_path in enum_path_set:
enum_path_set.add(file_path)
enum_paths.append(file_path)
enums.append((enum_name, enum_type, enum_members))
enum_paths.sort()
enums.sort(key=lambda x: x[0])
header = """//-------------------------------------------------------------------------
// This file is automatically generated by scripts/generate_enum_util.py
// Do not edit this file manually, your changes will be overwritten
// If you want to exclude an enum from serialization, add it to the blacklist in the script
//
// Note: The generated code will only work properly if the enum is a top level item in the duckdb namespace
// If the enum is nested in a class, or in another namespace, the generated code will not compile.
// You should move the enum to the duckdb namespace, manually write a specialization or add it to the blacklist
//-------------------------------------------------------------------------\n\n
"""
# Write the enum util header
with open(enum_util_header_file, "w") as f:
f.write(header)
f.write('#pragma once\n\n')
f.write('#include <stdint.h>\n')
f.write('#include "duckdb/common/string.hpp"\n\n')
f.write("namespace duckdb {\n\n")
f.write(
"""struct EnumUtil {
// String -> Enum
template <class T>
static T FromString(const char *value) = delete;
template <class T>
static T FromString(const string &value) { return FromString<T>(value.c_str()); }
// Enum -> String
template <class T>
static const char *ToChars(T value) = delete;
template <class T>
static string ToString(T value) { return string(ToChars<T>(value)); }
};\n\n"""
)
# Forward declare all enums
for enum_name, enum_type, _ in enums:
f.write(f"enum class {enum_name} : {enum_type};\n\n")
f.write("\n")
# Forward declare all enum serialization functions
for enum_name, enum_type, _ in enums:
f.write(f"template<>\nconst char* EnumUtil::ToChars<{enum_name}>({enum_name} value);\n\n")
f.write("\n")
# Forward declare all enum deserialization functions
for enum_name, enum_type, _ in enums:
f.write(f"template<>\n{enum_name} EnumUtil::FromString<{enum_name}>(const char *value);\n\n")
f.write("\n")
f.write("}\n")
with open(enum_util_source_file, "w") as f:
f.write(header)
f.write('#include "duckdb/common/enum_util.hpp"\n')
# Write the includes
for enum_path in enum_paths:
f.write(f'#include "{enum_path}"\n')
f.write("\n")
f.write("namespace duckdb {\n\n")
for enum_name, enum_type, enum_members in enums:
enum_string_array = "Get" + enum_name + "Values()"
# Write the enum from string
f.write(f"const StringUtil::EnumStringLiteral *{enum_string_array} {{\n")
f.write(f"\tstatic constexpr StringUtil::EnumStringLiteral values[] {{\n")
member_count = 0
for key, strings in enum_members:
for str_val in strings:
if member_count != 0:
f.write(",\n")
f.write(f"\t\t{{ static_cast<uint32_t>({enum_name}::{key}), \"{str_val}\" }}")
member_count += 1
f.write("\n\t};")
f.write("\n\treturn values;")
f.write("\n}\n\n")
f.write(f"template<>\nconst char* EnumUtil::ToChars<{enum_name}>({enum_name} value) {{\n")
f.write(
f"\treturn StringUtil::EnumToString({enum_string_array}, {member_count}, \"{enum_name}\", static_cast<uint32_t>(value));\n"
)
f.write("}\n\n")
# Write the string to enum
f.write(f"template<>\n{enum_name} EnumUtil::FromString<{enum_name}>(const char *value) {{\n")
f.write(
f"\treturn static_cast<{enum_name}>(StringUtil::StringToEnum({enum_string_array}, {member_count}, \"{enum_name}\", value));"
)
f.write("\n}\n\n")
f.write("}\n\n")

View File

@@ -0,0 +1,161 @@
import os
import json
import re
targets = [{'source': 'extension/json/include/', 'target': 'extension/json'}]
file_list = []
for target in targets:
source_base = os.path.sep.join(target['source'].split('/'))
target_base = os.path.sep.join(target['target'].split('/'))
for fname in os.listdir(source_base):
if '_enums.json' not in fname:
continue
file_list.append(
{
'source': os.path.join(source_base, fname),
'include_path': fname.replace('.json', '.hpp'),
'target_hpp': os.path.join(source_base, fname.replace('.json', '.hpp')),
'target_cpp': os.path.join(target_base, fname.replace('.json', '.cpp')),
}
)
header = '''//===----------------------------------------------------------------------===//
// This file is automatically generated by scripts/generate_enums.py
// Do not edit this file manually, your changes will be overwritten
//===----------------------------------------------------------------------===//
${INCLUDE_LIST}
namespace duckdb {
'''
footer = '''
} // namespace duckdb
'''
include_base = '#include "${FILENAME}"\n'
enum_header = '\nenum class ${ENUM_NAME} : ${ENUM_TYPE} {\n'
enum_footer = '};'
enum_value = '\t${ENUM_MEMBER} = ${ENUM_VALUE},\n'
enum_util_header = '''
template<>
const char* EnumUtil::ToChars<${ENUM_NAME}>(${ENUM_NAME} value);
template<>
${ENUM_NAME} EnumUtil::FromString<${ENUM_NAME}>(const char *value);
'''
enum_util_conversion_begin = '''
template<>
const char* EnumUtil::ToChars<${ENUM_NAME}>(${ENUM_NAME} value) {
switch(value) {
'''
enum_util_switch = '\tcase ${ENUM_NAME}::${ENUM_MEMBER}:\n\t\treturn "${ENUM_MEMBER}";\n'
enum_util_conversion_end = ''' default:
throw NotImplementedException(StringUtil::Format("Enum value of type ${ENUM_NAME}: '%d' not implemented", value));
}
}
'''
from_string_begin = '''
template<>
${ENUM_NAME} EnumUtil::FromString<${ENUM_NAME}>(const char *value) {
'''
from_string_comparison = ''' if (StringUtil::Equals(value, "${ENUM_MEMBER}")) {
return ${ENUM_NAME}::${ENUM_MEMBER};
}
'''
from_string_end = ''' throw NotImplementedException(StringUtil::Format("Enum value of type ${ENUM_NAME}: '%s' not implemented", value));
}
'''
class EnumMember:
def __init__(self, entry, index):
self.comment = None
self.index = index
if type(entry) == str:
self.name = entry
else:
self.name = entry['name']
if 'comment' in entry:
self.comment = entry['comment']
if 'index' in entry:
self.index = int(entry['index'])
class EnumClass:
def __init__(self, entry):
self.name = entry['name']
self.type = 'uint8_t'
self.values = []
index = 0
for value_entry in entry['values']:
self.values.append(EnumMember(value_entry, index))
index += 1
for entry in file_list:
source_path = entry['source']
target_header = entry['target_hpp']
target_source = entry['target_cpp']
include_path = entry['include_path']
with open(source_path, 'r') as f:
json_data = json.load(f)
include_list = ['duckdb/common/constants.hpp', 'duckdb/common/enum_util.hpp']
enums = []
for entry in json_data:
if 'includes' in entry:
include_list += entry['includes']
enums.append(EnumClass(entry))
with open(target_header, 'w+') as f:
include_text = '#pragma once\n\n'
include_text += ''.join([include_base.replace('${FILENAME}', x) for x in include_list])
f.write(header.replace('${INCLUDE_LIST}', include_text))
for enum in enums:
f.write(enum_header.replace('${ENUM_NAME}', enum.name).replace('${ENUM_TYPE}', enum.type))
for value in enum.values:
if value.comment is not None:
f.write('\t//! ' + value.comment + '\n')
f.write(enum_value.replace('${ENUM_MEMBER}', value.name).replace('${ENUM_VALUE}', str(value.index)))
f.write(enum_footer)
f.write('\n')
for enum in enums:
f.write(enum_util_header.replace('${ENUM_NAME}', enum.name))
f.write(footer)
with open(target_source, 'w+') as f:
source_include_list = [include_path, 'duckdb/common/string_util.hpp']
f.write(
header.replace(
'${INCLUDE_LIST}', ''.join([include_base.replace('${FILENAME}', x) for x in source_include_list])
)
)
for enum in enums:
f.write(enum_util_conversion_begin.replace('${ENUM_NAME}', enum.name))
for value in enum.values:
f.write(enum_util_switch.replace('${ENUM_MEMBER}', value.name).replace('${ENUM_NAME}', enum.name))
f.write(enum_util_conversion_end.replace('${ENUM_NAME}', enum.name))
f.write(from_string_begin.replace('${ENUM_NAME}', enum.name))
for value in enum.values:
f.write(from_string_comparison.replace('${ENUM_MEMBER}', value.name).replace('${ENUM_NAME}', enum.name))
f.write(from_string_end.replace('${ENUM_NAME}', enum.name))
f.write(footer)

View File

@@ -0,0 +1,972 @@
import os
import csv
import re
import argparse
import glob
from typing import Set, Tuple, cast
import pathlib
from typing import NamedTuple
from typing import List, Dict
import json
os.chdir(os.path.join(os.path.dirname(__file__), '..'))
# Example usage:
parser = argparse.ArgumentParser(description='Generates/Validates extension_functions.hpp file')
parser.add_argument(
'--validate',
action=argparse.BooleanOptionalAction,
help='If set will validate that extension_entries.hpp is up to date, otherwise it generates the extension_functions.hpp file.',
)
parser.add_argument(
'--extension_repository',
action='store',
help="The repository to look for the '**/<extension>.duckdb_extension' files",
default='build/release/repository',
)
parser.add_argument(
'--shell',
action='store',
help="Path to the DuckDB shell",
default='build/release/duckdb',
)
parser.add_argument(
'--extensions',
action='store',
help="Comma separated list of extensions - if not provided this is read from the extension configuration",
default='',
)
args = parser.parse_args()
EXTENSIONS_PATH = os.path.join("build", "extension_configuration", "extensions.csv")
DUCKDB_PATH = os.path.join(*args.shell.split('/'))
HEADER_PATH = os.path.join("src", "include", "duckdb", "main", "extension_entries.hpp")
EXTENSION_DEPENDENCIES = {
'iceberg': [
'avro',
'parquet',
]
}
from enum import Enum
class CatalogType(str, Enum):
SCALAR = "CatalogType::SCALAR_FUNCTION_ENTRY"
TABLE = "CatalogType::TABLE_FUNCTION_ENTRY"
AGGREGATE = "CatalogType::AGGREGATE_FUNCTION_ENTRY"
PRAGMA = "CatalogType::PRAGMA_FUNCTION_ENTRY"
MACRO = "CatalogType::MACRO_ENTRY"
TABLE_MACRO = "CatalogType::TABLE_MACRO_ENTRY"
parameter_type_map = {"TIMESTAMP WITH TIME ZONE": "TIMESTAMPTZ", "TIME WITH TIME ZONE": "TIMETZ"}
def catalog_type_from_type(catalog_type: str) -> CatalogType:
TYPE_MAP = {
CatalogType.SCALAR.value: CatalogType.SCALAR,
CatalogType.TABLE.value: CatalogType.TABLE,
CatalogType.AGGREGATE.value: CatalogType.AGGREGATE,
CatalogType.PRAGMA.value: CatalogType.PRAGMA,
CatalogType.MACRO.value: CatalogType.MACRO,
CatalogType.TABLE_MACRO.value: CatalogType.TABLE_MACRO,
}
if catalog_type not in TYPE_MAP:
raise Exception(f"Unrecognized function type: '{catalog_type}'")
return TYPE_MAP[catalog_type]
def catalog_type_from_string(catalog_type: str) -> CatalogType:
TYPE_MAP = {
CatalogType.SCALAR.name.lower(): CatalogType.SCALAR,
CatalogType.TABLE.name.lower(): CatalogType.TABLE,
CatalogType.AGGREGATE.name.lower(): CatalogType.AGGREGATE,
CatalogType.PRAGMA.name.lower(): CatalogType.PRAGMA,
CatalogType.MACRO.name.lower(): CatalogType.MACRO,
CatalogType.TABLE_MACRO.name.lower(): CatalogType.TABLE_MACRO,
}
if catalog_type not in TYPE_MAP:
raise Exception(f"Unrecognized function type: '{catalog_type}'")
return TYPE_MAP[catalog_type]
def parse_records(text):
records = [] # Will hold all parsed records
current_record = [] # Holds items for the current record
current_item = [] # Accumulates characters for the current item
in_quote = False # True if we're inside a double-quoted string
inside_braces = False # True if we're inside a { ... } block
for char in text:
if char == '"':
# Toggle the quote state and always include the quote.
in_quote = not in_quote
elif char == '{' and not in_quote:
# Start of a new record.
inside_braces = True
# Reset any previous record state.
current_record = []
current_item = []
elif char == '}' and not in_quote and inside_braces:
# End of the current record.
token = ''.join(current_item).strip()
if token:
current_record.append(token)
records.append(current_record)
# Reset state for subsequent records.
current_record = []
current_item = []
inside_braces = False
elif char == ',' and not in_quote and inside_braces:
# A comma outside quotes indicates the end of the current item.
token = ''.join(current_item).strip()
if token:
current_record.append(token)
current_item = []
else:
# Otherwise, just add the character if we're inside braces.
if inside_braces:
current_item.append(char)
return records
class LogicalType(NamedTuple):
type: str
class Function(NamedTuple):
name: str
type: CatalogType
class FunctionOverload(NamedTuple):
name: str
type: CatalogType
parameters: Tuple
return_type: LogicalType
class ExtensionFunctionOverload(NamedTuple):
extension: str
name: str
type: CatalogType
parameters: Tuple
return_type: LogicalType
@staticmethod
def create_map(input: List[Tuple[str, str, str, str]]) -> Dict[Function, List["ExtensionFunctionOverload"]]:
output: Dict[Function, List["ExtensionFunctionOverload"]] = {}
for x in input:
function = Function(x[0], catalog_type_from_type(x[2]))
# parse the signature
signature = x[3]
splits = signature.split('>')
return_type = LogicalType(splits[1])
parameters = [LogicalType(param) for param in splits[0][1:-1].split(',')]
extension_function = ExtensionFunctionOverload(x[1], function.name, function.type, parameters, return_type)
if function not in output:
output[function] = []
output[function].append(extension_function)
return output
class ExtensionFunction(NamedTuple):
extension: str
name: str
type: CatalogType
@staticmethod
def create_map(input: List[Tuple[str, str, str]]) -> Dict[Function, "ExtensionFunction"]:
output: Dict[Function, "ExtensionFunction"] = {}
for x in input:
key = Function(x[0], catalog_type_from_type(x[2]))
output[key] = ExtensionFunction(x[1], key.name, key.type)
return output
class ExtensionSetting(NamedTuple):
extension: str
name: str
@staticmethod
def create_map(input: List[Tuple[str, str]]) -> Dict[str, "ExtensionSetting"]:
output: Dict[str, "ExtensionSetting"] = {}
for x in input:
output[x[0]] = ExtensionSetting(x[1], x[0])
return output
class ExtensionSecretType(NamedTuple):
extension: str
name: str
@staticmethod
def create_map(input: List[Tuple[str, str]]) -> Dict[str, "ExtensionSecretType"]:
output: Dict[str, "ExtensionSecretType"] = {}
for x in input:
output[x[0]] = ExtensionSecretType(x[1], x[0])
return output
class ExtensionCopyFunction(NamedTuple):
extension: str
name: str
@staticmethod
def create_map(input: List[Tuple[str, str]]) -> Dict[str, "ExtensionCopyFunction"]:
output: Dict[str, "ExtensionCopyFunction"] = {}
for x in input:
output[x[0]] = ExtensionCopyFunction(x[1], x[0])
return output
class ExtensionType(NamedTuple):
extension: str
name: str
@staticmethod
def create_map(input: List[Tuple[str, str]]) -> Dict[str, "ExtensionType"]:
output: Dict[str, "ExtensionType"] = {}
for x in input:
output[x[0]] = ExtensionType(x[1], x[0])
return output
class ParsedEntries:
def __init__(self, file_path):
self.path = file_path
self.functions = {}
self.function_overloads = {}
self.settings = {}
self.secret_types = {}
self.types = {}
self.copy_functions = {}
file = open(file_path, 'r')
file_blob = file.read()
# Get the extension functions
ext_functions_file_blob = get_slice_of_file("EXTENSION_FUNCTIONS", file_blob)
res = parse_records(ext_functions_file_blob)
res = [(x[0], x[1], x[2]) for x in res]
self.functions = ExtensionFunction.create_map(res)
# Get the extension function overloads
ext_function_overloads_file_blob = get_slice_of_file("EXTENSION_FUNCTION_OVERLOADS", file_blob)
res = parse_records(ext_function_overloads_file_blob)
res = [(x[0], x[1], x[2], x[3]) for x in res]
self.function_overloads = ExtensionFunctionOverload.create_map(res)
# Get the extension settings
ext_settings_file_blob = get_slice_of_file("EXTENSION_SETTINGS", file_blob)
res = parse_records(ext_settings_file_blob)
res = [(x[0], x[1]) for x in res]
self.settings = ExtensionSetting.create_map(res)
# Get the extension secret types
ext_secret_types_file_blob = get_slice_of_file("EXTENSION_SECRET_TYPES", file_blob)
res = parse_records(ext_secret_types_file_blob)
res = [(x[0], x[1]) for x in res]
self.secret_types = ExtensionSecretType.create_map(res)
# Get the extension types
ext_copy_functions_blob = get_slice_of_file("EXTENSION_COPY_FUNCTIONS", file_blob)
res = parse_records(ext_copy_functions_blob)
res = [(x[0], x[1]) for x in res]
self.copy_functions = ExtensionCopyFunction.create_map(res)
# Get the extension types
ext_types_file_blob = get_slice_of_file("EXTENSION_TYPES", file_blob)
res = parse_records(ext_types_file_blob)
res = [(x[0], x[1]) for x in res]
self.types = ExtensionType.create_map(res)
def strip_unloaded_extensions(self, extensions: List[str], functions):
return [x for x in functions if x.extension not in extensions]
def filter_entries(self, extensions: List[str]):
self.functions = {k: v for k, v in self.functions.items() if v.extension not in extensions}
self.function_overloads = {
k: self.strip_unloaded_extensions(extensions, v)
for k, v in self.function_overloads.items()
if len(self.strip_unloaded_extensions(extensions, v)) > 0
}
self.copy_functions = {k: v for k, v in self.copy_functions.items() if v.extension not in extensions}
self.settings = {k: v for k, v in self.settings.items() if v.extension not in extensions}
self.secret_types = {k: v for k, v in self.secret_types.items() if v.extension not in extensions}
self.types = {k: v for k, v in self.types.items() if v.extension not in extensions}
def check_prerequisites():
if not os.path.isfile(DUCKDB_PATH):
print(f"{DUCKDB_PATH} not found")
print(
"please run 'GENERATE_EXTENSION_ENTRIES=1 BUILD_ALL_EXT=1 make release', you might have to manually add DONT_LINK to all extension_configs"
)
exit(1)
if len(args.extensions) == 0 and not os.path.isfile(EXTENSIONS_PATH):
print(f"{EXTENSIONS_PATH} not found and --extensions it not set")
print("Either:")
print(
"* run 'GENERATE_EXTENSION_ENTRIES=1 BUILD_ALL_EXT=1 make release', you might have to manually add DONT_LINK to all extension_configs"
)
print("* Specify a comma separated list of extensions using --extensions")
exit(1)
if not os.path.isdir(args.extension_repository):
print(f"provided --extension_repository '{args.extension_repository}' is not a valid directory")
exit(1)
# Parses the extension config files for which extension names there are to be expected
def get_extension_names() -> List[str]:
if len(args.extensions) > 0:
return args.extensions.split(',')
extension_names = []
with open(EXTENSIONS_PATH) as f:
# Skip the csv header
next(f)
for line in f:
extension_name = line.split(',')[0].rstrip()
if "jemalloc" in extension_name:
# We skip jemalloc as it doesn't produce a loadable extension but is in the config
continue
extension_names.append(extension_name)
return extension_names
def get_query(sql_query, load_query) -> list:
# Optionally perform a LOAD of an extension
# Then perform a SQL query, fetch the output
query = f'{DUCKDB_PATH} -json -unsigned -c "{load_query}{sql_query}" '
query_result = os.popen(query).read()
result = [x for x in query_result[1:-2].split("\n") if x != '']
return result
def transform_parameter(parameter) -> LogicalType:
parameter = parameter.upper()
if parameter.endswith('[]'):
return LogicalType(transform_parameter(parameter[0 : len(parameter) - 2]).type + '[]')
if parameter in parameter_type_map:
return LogicalType(parameter_type_map[parameter])
return LogicalType(parameter)
def transform_parameters(parameters) -> FunctionOverload:
parameters = parameters[1:-1].split(', ')
return tuple(transform_parameter(param) for param in parameters)
def get_functions(load="") -> (Set[Function], Dict[Function, List[FunctionOverload]]):
GET_FUNCTIONS_QUERY = """
select distinct
function_name,
function_type,
parameter_types,
return_type
from duckdb_functions()
ORDER BY function_name, function_type;
"""
# ['name_1,type_1', ..., 'name_n,type_n']
results = set(get_query(GET_FUNCTIONS_QUERY, load))
functions = set()
function_overloads = {}
for x in results:
if x[-1] == ',':
# Remove the trailing comma
x = x[:-1]
function_name, function_type, parameter_types, return_type = [
x.lower() if x else "null" for x in json.loads(x).values()
]
function_parameters = transform_parameters(parameter_types)
function_return = transform_parameter(return_type)
function = Function(function_name, catalog_type_from_string(function_type))
function_overload = FunctionOverload(
function_name, catalog_type_from_string(function_type), function_parameters, function_return
)
if function not in functions:
functions.add(function)
function_overloads[function] = [function_overload]
else:
function_overloads[function].append(function_overload)
return (functions, function_overloads)
def get_settings(load="") -> Set[str]:
GET_SETTINGS_QUERY = """
select distinct
name
from duckdb_settings();
"""
settings = set(get_query(GET_SETTINGS_QUERY, load))
res = set()
for x in settings:
if x[-1] == ',':
# Remove the trailing comma
x = x[:-1]
name = json.loads(x)['name']
res.add(name)
return res
def get_secret_types(load="") -> Set[str]:
GET_SECRET_TYPES_QUERY = """
select distinct
type
from duckdb_secret_types();
"""
secret_types = set(get_query(GET_SECRET_TYPES_QUERY, load))
res = set()
for x in secret_types:
if x[-1] == ',':
# Remove the trailing comma
x = x[:-1]
type = json.loads(x)['type']
res.add(type)
return res
class ExtensionData:
def __init__(self):
# Map of function -> ExtensionFunction
self.function_map: Dict[Function, ExtensionFunction] = {}
# Map of setting name -> ExtensionSetting
self.settings_map: Dict[str, ExtensionSetting] = {}
# Map of secret type name -> ExtensionSecretType
self.secret_types_map: Dict[str, ExtensionSecretType] = {}
# Map of function -> extension function overloads
self.function_overloads: Dict[Function, List[ExtensionFunctionOverload]] = {}
# All function overloads (also ones that will not be written to the file)
self.all_function_overloads: Dict[Function, List[ExtensionFunctionOverload]] = {}
self.base_settings: Set[str] = set()
self.base_secret_types: Set[str] = set()
self.base_functions: Set[Function] = set()
self.extension_settings: Dict[str, Set[str]] = {}
self.extension_secret_types: Dict[str, Set[str]] = {}
self.extension_functions: Dict[str, Set[Function]] = {}
self.added_extensions: Set[str] = set()
# Map of extension -> extension_path
self.extensions: Dict[str, str] = get_extension_path_map()
self.stored_functions: Dict[str, List[Function]] = {
'arrow': [Function("scan_arrow_ipc", CatalogType.TABLE), Function("to_arrow_ipc", CatalogType.TABLE)],
'spatial': [],
}
self.stored_settings: Dict[str, List[str]] = {'arrow': [], 'spatial': []}
self.stored_secret_types: Dict[str, List[str]] = {'arrow': [], 'spatial': []}
def set_base(self):
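# Snapshot the functions, settings, and secret types that exist before any extension is loaded;
# add_functions/add_settings/add_secret_types subtract this baseline to find what an extension adds.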
(functions, function_overloads) = get_functions()
self.base_functions: Set[Function] = functions
self.base_settings: Set[str] = get_settings()
self.base_secret_types: Set[str] = get_secret_types()
def add_entries(self, entries: ParsedEntries):
self.function_map.update(entries.functions)
self.function_overloads.update(entries.function_overloads)
self.settings_map.update(entries.settings)
self.secret_types_map.update(entries.secret_types)
def load_dependencies(self, extension_name: str) -> str:
if extension_name not in EXTENSION_DEPENDENCIES:
return ''
res = ''
dependencies = EXTENSION_DEPENDENCIES[extension_name]
for item in dependencies:
if item not in self.extensions:
print(f"Could not load extension '{extension_name}', dependency '{item}' is missing")
exit(1)
extension_path = self.extensions[item]
print(f"Load {item} at {extension_path}")
res += f"LOAD '{extension_path}';"
return res
def add_extension(self, extension_name: str):
if extension_name in EXTENSION_DEPENDENCIES:
for item in EXTENSION_DEPENDENCIES[extension_name]:
if item not in self.added_extensions:
self.add_extension(item)
if extension_name in self.extensions:
# Perform a LOAD and add the added settings/functions/secret_types
extension_path = self.extensions[extension_name]
print(f"Load {extension_name} at {extension_path}")
load = self.load_dependencies(extension_name)
load += f"LOAD '{extension_path}';"
(functions, function_overloads) = get_functions(load)
extension_functions = list(functions)
extension_settings = list(get_settings(load))
extension_secret_types = list(get_secret_types(load))
self.add_settings(extension_name, extension_settings)
self.add_secret_types(extension_name, extension_secret_types)
self.add_functions(extension_name, extension_functions, function_overloads)
elif extension_name in self.stored_functions or extension_name in self.stored_settings:
# Retrieve the list of settings/functions from our hardcoded list
extension_functions = self.stored_functions[extension_name]
extension_settings = self.stored_settings[extension_name]
extension_secret_types = self.stored_secret_types[extension_name]
print(f"Loading {extension_name} from stored functions: {extension_functions}")
self.add_settings(extension_name, extension_settings)
self.add_secret_types(extension_name, extension_secret_types)
self.add_functions(extension_name, extension_functions, {})
else:
error = f"""Missing extension {extension_name} and not found in stored_functions/stored_settings/stored_secret_types
Please double check if '{args.extension_repository}' is the right location to look for ./**/*.duckdb_extension files"""
print(error)
exit(1)
self.added_extensions.add(extension_name)
def add_settings(self, extension_name: str, settings_list: List[str]):
extension_name = extension_name.lower()
base_settings = set()
base_settings.update(self.base_settings)
if extension_name in EXTENSION_DEPENDENCIES:
dependencies = EXTENSION_DEPENDENCIES[extension_name]
for item in dependencies:
assert item in self.extension_settings
base_settings.update(self.extension_settings[item])
added_settings: Set[str] = set(settings_list) - base_settings
self.extension_settings[extension_name] = added_settings
settings_to_add: Dict[str, ExtensionSetting] = {}
for setting in added_settings:
setting_name = setting.lower()
settings_to_add[setting_name] = ExtensionSetting(extension_name, setting_name)
self.settings_map.update(settings_to_add)
def add_secret_types(self, extension_name: str, secret_types_list: List[str]):
extension_name = extension_name.lower()
base_secret_types = set()
base_secret_types.update(self.base_secret_types)
if extension_name in EXTENSION_DEPENDENCIES:
dependencies = EXTENSION_DEPENDENCIES[extension_name]
for item in dependencies:
assert item in self.extension_secret_types
base_secret_types.update(self.extension_secret_types[item])
added_secret_types: Set[str] = set(secret_types_list) - base_secret_types
self.extension_secret_types[extension_name] = added_secret_types
secret_types_to_add: Dict[str, ExtensionSecretType] = {}
for secret_type in added_secret_types:
secret_type_name = secret_type.lower()
secret_types_to_add[secret_type_name] = ExtensionSecretType(extension_name, secret_type_name)
self.secret_types_map.update(secret_types_to_add)
def get_extension_overloads(
self, extension_name: str, overloads: Dict[Function, List[FunctionOverload]]
) -> Dict[Function, List[ExtensionFunctionOverload]]:
result = {}
for function, function_overloads in overloads.items():
extension_overloads = []
for overload in function_overloads:
extension_overloads.append(
ExtensionFunctionOverload(
extension_name, overload.name, overload.type, overload.parameters, overload.return_type
)
)
result[function] = extension_overloads
return result
def add_functions(
self, extension_name: str, function_list: List[Function], overloads: Dict[Function, List[FunctionOverload]]
):
extension_name = extension_name.lower()
base_functions = set()
base_functions.update(self.base_functions)
if extension_name in EXTENSION_DEPENDENCIES:
dependencies = EXTENSION_DEPENDENCIES[extension_name]
for item in dependencies:
assert item in self.extension_functions
base_functions.update(self.extension_functions[item])
overloads = self.get_extension_overloads(extension_name, overloads)
added_functions: Set[Function] = set(function_list) - base_functions
self.extension_functions[extension_name] = added_functions
functions_to_add: Dict[Function, ExtensionFunction] = {}
for function in added_functions:
if function in self.function_overloads:
# function is in overload map - add overloads
self.function_overloads[function] += overloads[function]
elif function in self.function_map:
# function is in function map and we are trying to add it again
# this means the function is present in multiple extensions
# remove from function map, and add to overload map
self.function_overloads[function] = self.all_function_overloads[function] + overloads[function]
del self.function_map[function]
else:
functions_to_add[function] = ExtensionFunction(extension_name, function.name, function.type)
self.all_function_overloads.update(overloads)
self.function_map.update(functions_to_add)
def validate(self):
parsed_entries = ParsedEntries(HEADER_PATH)
if self.function_map != parsed_entries.functions:
print("Function map mismatches:")
print_map_diff(self.function_map, parsed_entries.functions)
exit(1)
if self.settings_map != parsed_entries.settings:
print("Settings map mismatches:")
print_map_diff(self.settings_map, parsed_entries.settings)
exit(1)
if self.secret_types_map != parsed_entries.secret_types:
print("SecretTypes map mismatches:")
print_map_diff(self.secret_types_map, parsed_entries.secret_types)
exit(1)
print("All entries found: ")
print(" > functions: " + str(len(parsed_entries.functions)))
print(" > settings: " + str(len(parsed_entries.settings)))
print(" > secret_types: " + str(len(parsed_entries.secret_types)))
def verify_export(self):
if len(self.function_map) == 0 or len(self.settings_map) == 0 or len(self.secret_types_map) == 0:
print(
"""
The provided configuration produced an empty function map or empty settings map or empty secret types map
This is likely caused by building DuckDB with extensions linked in
"""
)
exit(1)
def export_functions(self) -> str:
result = """
static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = {\n"""
sorted_function = sorted(self.function_map)
for func in sorted_function:
function: ExtensionFunction = self.function_map[func]
result += "\t{"
result += f'"{function.name}", "{function.extension}", {function.type.value}'
result += "},\n"
result += "}; // END_OF_EXTENSION_FUNCTIONS\n"
return result
def export_function_overloads(self) -> str:
result = """
static constexpr ExtensionFunctionOverloadEntry EXTENSION_FUNCTION_OVERLOADS[] = {\n"""
sorted_function = sorted(self.function_overloads)
for func in sorted_function:
overloads: List[ExtensionFunctionOverload] = sorted(self.function_overloads[func])
for overload in overloads:
result += "\t{"
result += f'"{overload.name}", "{overload.extension}", {overload.type.value}, "'
signature = "["
signature += ",".join([parameter.type for parameter in overload.parameters])
signature += "]>" + overload.return_type.type
result += signature
result += '"},\n'
result += "}; // END_OF_EXTENSION_FUNCTION_OVERLOADS\n"
return result
def export_settings(self) -> str:
result = """
static constexpr ExtensionEntry EXTENSION_SETTINGS[] = {\n"""
sorted_settings = sorted(self.settings_map)
for settings_name in sorted_settings:
setting: ExtensionSetting = self.settings_map[settings_name]
result += "\t{"
result += f'"{settings_name.lower()}", "{setting.extension}"'
result += "},\n"
result += "}; // END_OF_EXTENSION_SETTINGS\n"
return result
def export_secret_types(self) -> str:
result = """
static constexpr ExtensionEntry EXTENSION_SECRET_TYPES[] = {\n"""
sorted_secret_types = sorted(self.secret_types_map)
for secret_types_name in sorted_secret_types:
secret_type: ExtensionSecretType = self.secret_types_map[secret_types_name]
result += "\t{"
result += f'"{secret_types_name.lower()}", "{secret_type.extension}"'
result += "},\n"
result += "}; // END_OF_EXTENSION_SECRET_TYPES\n"
return result
# Get the slice of the file containing the var (assumes // END_OF_<varname> comment after var)
def get_slice_of_file(var_name, file_str):
begin = file_str.find(var_name)
end = file_str.find("END_OF_" + var_name)
return file_str[begin:end]
def print_map_diff(d1, d2):
s1 = sorted(set(d1.items()))
s2 = sorted(set(d2.items()))
diff1 = str(set(s1) - set(s2))
diff2 = str(set(s2) - set(s1))
print("Diff between maps: " + diff1 + "\n")
print("Diff between maps: " + diff2 + "\n")
def get_extension_path_map() -> Dict[str, str]:
extension_paths: Dict[str, str] = {}
# extension_repository = pathlib.Path('../build/release/repository')
extension_repository = args.extension_repository
for location in glob.iglob(extension_repository + '/**/*.duckdb_extension', recursive=True):
name, _ = os.path.splitext(os.path.basename(location))
print(f"Located extension: {name} in path: '{location}'")
extension_paths[name] = location
return extension_paths
def write_header(data: ExtensionData):
INCLUDE_HEADER = """//===----------------------------------------------------------------------===//
// DuckDB
//
// duckdb/main/extension_entries.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include \"duckdb/common/unordered_map.hpp\"
#include \"duckdb/common/enums/catalog_type.hpp\"
// NOTE: this file is generated by scripts/generate_extensions_function.py.
// Example usage to refresh one extension (replace "icu" with the desired extension):
// GENERATE_EXTENSION_ENTRIES=1 make debug
// python3 scripts/generate_extensions_function.py --extensions icu --shell build/debug/duckdb --extension_repository build/debug/repository
// Check out the check-load-install-extensions job in .github/workflows/LinuxRelease.yml for more details
namespace duckdb {
struct ExtensionEntry {
char name[48];
char extension[48];
};
struct ExtensionFunctionEntry {
char name[48];
char extension[48];
CatalogType type;
};
struct ExtensionFunctionOverloadEntry {
char name[48];
char extension[48];
CatalogType type;
char signature[96];
};
"""
INCLUDE_FOOTER = """
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_COPY_FUNCTIONS[] = {
{"parquet", "parquet"},
{"json", "json"},
{"avro", "avro"}
}; // END_OF_EXTENSION_COPY_FUNCTIONS
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_TYPES[] = {
{"json", "json"},
{"inet", "inet"},
{"geometry", "spatial"}
}; // END_OF_EXTENSION_TYPES
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_COLLATIONS[] = {
{"af", "icu"}, {"am", "icu"}, {"ar", "icu"}, {"ar_sa", "icu"}, {"as", "icu"}, {"az", "icu"},
{"be", "icu"}, {"bg", "icu"}, {"bn", "icu"}, {"bo", "icu"}, {"br", "icu"}, {"bs", "icu"},
{"ca", "icu"}, {"ceb", "icu"}, {"chr", "icu"}, {"cs", "icu"}, {"cy", "icu"}, {"da", "icu"},
{"de", "icu"}, {"de_at", "icu"}, {"dsb", "icu"}, {"dz", "icu"}, {"ee", "icu"}, {"el", "icu"},
{"en", "icu"}, {"en_us", "icu"}, {"eo", "icu"}, {"es", "icu"}, {"et", "icu"}, {"fa", "icu"},
{"fa_af", "icu"}, {"ff", "icu"}, {"fi", "icu"}, {"fil", "icu"}, {"fo", "icu"}, {"fr", "icu"},
{"fr_ca", "icu"}, {"fy", "icu"}, {"ga", "icu"}, {"gl", "icu"}, {"gu", "icu"}, {"ha", "icu"},
{"haw", "icu"}, {"he", "icu"}, {"he_il", "icu"}, {"hi", "icu"}, {"hr", "icu"}, {"hsb", "icu"},
{"hu", "icu"}, {"hy", "icu"}, {"id", "icu"}, {"id_id", "icu"}, {"ig", "icu"}, {"is", "icu"},
{"it", "icu"}, {"ja", "icu"}, {"ka", "icu"}, {"kk", "icu"}, {"kl", "icu"}, {"km", "icu"},
{"kn", "icu"}, {"ko", "icu"}, {"kok", "icu"}, {"ku", "icu"}, {"ky", "icu"}, {"lb", "icu"},
{"lkt", "icu"}, {"ln", "icu"}, {"lo", "icu"}, {"lt", "icu"}, {"lv", "icu"}, {"mk", "icu"},
{"ml", "icu"}, {"mn", "icu"}, {"mr", "icu"}, {"ms", "icu"}, {"mt", "icu"}, {"my", "icu"},
{"nb", "icu"}, {"nb_no", "icu"}, {"ne", "icu"}, {"nl", "icu"}, {"nn", "icu"}, {"om", "icu"},
{"or", "icu"}, {"pa", "icu"}, {"pa_in", "icu"}, {"pl", "icu"}, {"ps", "icu"}, {"pt", "icu"},
{"ro", "icu"}, {"ru", "icu"}, {"sa", "icu"}, {"se", "icu"}, {"si", "icu"}, {"sk", "icu"},
{"sl", "icu"}, {"smn", "icu"}, {"sq", "icu"}, {"sr", "icu"}, {"sr_ba", "icu"}, {"sr_me", "icu"},
{"sr_rs", "icu"}, {"sv", "icu"}, {"sw", "icu"}, {"ta", "icu"}, {"te", "icu"}, {"th", "icu"},
{"tk", "icu"}, {"to", "icu"}, {"tr", "icu"}, {"ug", "icu"}, {"uk", "icu"}, {"ur", "icu"},
{"uz", "icu"}, {"vi", "icu"}, {"wae", "icu"}, {"wo", "icu"}, {"xh", "icu"}, {"yi", "icu"},
{"yo", "icu"}, {"yue", "icu"}, {"yue_cn", "icu"}, {"zh", "icu"}, {"zh_cn", "icu"}, {"zh_hk", "icu"},
{"zh_mo", "icu"}, {"zh_sg", "icu"}, {"zh_tw", "icu"}, {"zu", "icu"}}; // END_OF_EXTENSION_COLLATIONS
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_FILE_PREFIXES[] = {
{"http://", "httpfs"}, {"https://", "httpfs"}, {"s3://", "httpfs"}, {"s3a://", "httpfs"}, {"s3n://", "httpfs"},
{"gcs://", "httpfs"}, {"gs://", "httpfs"}, {"r2://", "httpfs"}, {"azure://", "azure"}, {"az://", "azure"},
{"abfss://", "azure"}, {"hf://", "httpfs"}
}; // END_OF_EXTENSION_FILE_PREFIXES
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_FILE_POSTFIXES[] = {
{".parquet", "parquet"},
{".json", "json"},
{".jsonl", "json"},
{".ndjson", "json"},
{".shp", "spatial"},
{".gpkg", "spatial"},
{".fgb", "spatial"},
{".xlsx", "excel"},
{".avro", "avro"},
}; // END_OF_EXTENSION_FILE_POSTFIXES
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_FILE_CONTAINS[] = {
{".parquet?", "parquet"},
{".json?", "json"},
{".ndjson?", ".jsonl?"},
{".jsonl?", ".ndjson?"}
}; // EXTENSION_FILE_CONTAINS
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_SECRET_PROVIDERS[] = {{"s3/config", "httpfs"},
{"gcs/config", "httpfs"},
{"r2/config", "httpfs"},
{"s3/credential_chain", "aws"},
{"gcs/credential_chain", "aws"},
{"r2/credential_chain", "aws"},
{"aws/credential_chain", "aws"},
{"azure/access_token", "azure"},
{"azure/config", "azure"},
{"azure/credential_chain", "azure"},
{"azure/service_principal", "azure"},
{"huggingface/config", "httfps"},
{"huggingface/credential_chain", "httpfs"},
{"bearer/config", "httpfs"},
{"mysql/config", "mysql_scanner"},
{"postgres/config", "postgres_scanner"}
}; // EXTENSION_SECRET_PROVIDERS
static constexpr const char *AUTOLOADABLE_EXTENSIONS[] = {
"avro",
"aws",
"azure",
"autocomplete",
"core_functions",
"delta",
"ducklake",
"encodings",
"excel",
"fts",
"httpfs",
"iceberg",
"inet",
"icu",
"json",
"motherduck",
"mysql_scanner",
"parquet",
"sqlite_scanner",
"sqlsmith",
"postgres_scanner",
"tpcds",
"tpch",
"uc_catalog",
"ui"
}; // END_OF_AUTOLOADABLE_EXTENSIONS
} // namespace duckdb"""
data.verify_export()
file = open(HEADER_PATH, 'w')
file.write(INCLUDE_HEADER)
exported_functions = data.export_functions()
file.write(exported_functions)
exported_overloads = data.export_function_overloads()
file.write(exported_overloads)
exported_settings = data.export_settings()
file.write(exported_settings)
exported_secret_types = data.export_secret_types()
file.write(exported_secret_types)
file.write(INCLUDE_FOOTER)
file.close()
# Extensions that can be autoloaded, but are not buildable by DuckDB CI
HARDCODED_EXTENSION_FUNCTIONS = ExtensionFunction.create_map(
[
("delta_scan", "delta", "CatalogType::TABLE_FUNCTION_ENTRY"),
]
)
def main():
check_prerequisites()
extension_names: List[str] = get_extension_names()
extension_data = ExtensionData()
# Collect the list of functions/settings without any extensions loaded
extension_data.set_base()
# TODO: add 'purge' option to ignore existing entries ??
parsed_entries = ParsedEntries(HEADER_PATH)
parsed_entries.filter_entries(extension_names)
# Add the entries we parsed from the HEADER_PATH
extension_data.add_entries(parsed_entries)
for extension_name in extension_names:
print(extension_name)
# For every extension, add the functions/settings added by the extension
extension_data.add_extension(extension_name)
# Add hardcoded extension entries
for key, value in HARDCODED_EXTENSION_FUNCTIONS.items():
extension_data.function_map[key] = value
if args.validate:
extension_data.validate()
return
write_header(extension_data)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,93 @@
# use flex to generate the scanner file for the parser
# the following version of flex is used:
# flex 2.5.35 Apple(flex-32)
import os
import subprocess
import re
from sys import platform
import sys
from python_helpers import open_utf8
flex_bin = 'flex'
pg_path = os.path.join('third_party', 'libpg_query')
namespace = 'duckdb_libpgquery'
for arg in sys.argv[1:]:
if arg.startswith("--flex="):
flex_bin = arg.replace("--flex=", "")
elif arg.startswith("--custom_dir_prefix"):
pg_path = arg.split("=")[1] + pg_path
elif arg.startswith("--namespace"):
namespace = arg.split("=")[1]
else:
raise Exception("Unrecognized argument: " + arg + ", expected --flex, --custom_dir_prefix, --namespace")
flex_file_path = os.path.join(pg_path, 'scan.l')
target_file = os.path.join(pg_path, 'src_backend_parser_scan.cpp')
proc = subprocess.Popen(
[flex_bin, '--nounistd', '-o', target_file, flex_file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
stdout = proc.stdout.read().decode('utf8')
stderr = proc.stderr.read().decode('utf8')
proc.wait()
if proc.returncode != 0 or len(stderr) > 0:
print("Flex failed")
print("stdout: ", stdout)
print("stderr: ", stderr)
exit(1)
with open_utf8(target_file, 'r') as f:
text = f.read()
# convert this from 'int' to 'yy_size_t' to avoid triggering a warning
text = text.replace('int yy_buf_size;\n', 'yy_size_t yy_buf_size;\n')
# add the libpg_query namespace
text = text.replace(
'''
#ifndef FLEXINT_H
#define FLEXINT_H
''',
'''
#ifndef FLEXINT_H
#define FLEXINT_H
namespace '''
+ namespace
+ ''' {
''',
)
text = text.replace('register ', '')
text = text + "\n} /* " + namespace + " */\n"
text = re.sub('(?:[(]void[)][ ]*)?fprintf', '//', text)
text = re.sub('exit[(]', 'throw std::runtime_error(msg); //', text)
text = re.sub(r'\n\s*if\s*[(]\s*!\s*yyin\s*[)]\s*\n\s*yyin\s*=\s*stdin;\s*\n', '\n', text)
text = re.sub(r'\n\s*if\s*[(]\s*!\s*yyout\s*[)]\s*\n\s*yyout\s*=\s*stdout;\s*\n', '\n', text)
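# flex's generated initializer points yyin/yyout at stdin/stdout; rewrite those defaults to null
# FILE pointers so the embedded scanner never touches the process's standard streams.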
file_null = 'NULL' if platform == 'linux' else '[(]FILE [*][)] 0'
text = re.sub(
rf'[#]ifdef\s*YY_STDINIT\n\s*yyin = stdin;\n\s*yyout = stdout;\n[#]else\n\s*yyin = {file_null};\n\s*yyout = {file_null};\n[#]endif',
' yyin = (FILE *) 0;\n yyout = (FILE *) 0;',
text,
)
if 'stdin;' in text:
print("STDIN not removed!")
# exit(1)
if 'stdout' in text:
print("STDOUT not removed!")
# exit(1)
if 'fprintf(' in text:
print("PRINTF not removed!")
# exit(1)
if 'exit(' in text:
print("EXIT not removed!")
# exit(1)
with open_utf8(target_file, 'w+') as f:
f.write(text)

View File

@@ -0,0 +1,259 @@
import os
import json
from pathlib import Path
function_groups = {
('src', 'include/duckdb', 'function'): ['scalar', 'aggregate'],
('extension', 'core_functions/include', 'core_functions'): ['scalar', 'aggregate'],
}
def get_header():
return '''//===----------------------------------------------------------------------===//
// DuckDB
//
// {HEADER}_functions.hpp
//
//
//===----------------------------------------------------------------------===//
// This file is automatically generated by scripts/generate_functions.py
// Do not edit this file manually, your changes will be overwritten
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb/function/function_set.hpp"
namespace duckdb {
'''
def get_footer():
return '''} // namespace duckdb
'''
def main():
function_type_set = {}
for (root, include_dir, group), function_types in sorted(function_groups.items()):
all_functions_group = []
group_dir = Path(group)
for function_type in function_types:
type_dir = Path(root).joinpath(group_dir.joinpath(function_type))
relative_function_paths = sorted(
[f'{group}/{function_type}/{f.name}' for f in type_dir.iterdir() if f.is_dir()]
)
for function_path in relative_function_paths:
if Path(normalize_path_separators(f'{root}/{function_path}/functions.json')).exists():
create_header_file(root, include_dir, function_path, all_functions_group, function_type_set)
create_function_list_file(root, group, all_functions_group)
def normalize_path_separators(x):
return os.path.sep.join(x.split('/'))
def legal_struct_name(name):
return name.isalnum()
def get_struct_name(function_name):
return function_name.replace('_', ' ').title().replace(' ', '') + 'Fun'
def get_parameter_line(variants):
if not all(
isinstance(variant['parameters'], list)
and all(isinstance(param, dict) for param in variant['parameters'])
and all('name' in param.keys() for param in variant['parameters'])
for variant in variants
):
raise ValueError(
f"invalid parameters for variants {variants}\nParameters should have format: \"parameters\": [{{\"name\": <param_name>, \"type\": <param_type>}}, ...]"
)
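# Every variant's parameter list is flattened into one string: parameters within a variant are joined
# with ',' and variants are separated by the \001 control character (\002 separates multiple examples below).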
return "\\001".join(
",".join(
param['name'] + "::" + param['type'] if ('type' in param) else param['name']
for param in variant['parameters']
)
for variant in variants
)
def get_description_line(variants):
return "\\001".join([variant['description'] for variant in variants])
def get_example_line(variants):
return "\\001".join([example_from_json(variant) for variant in variants])
def example_from_json(json_record):
if 'example' in json_record:
example_line = sanitize_string(json_record['example'])
elif 'examples' in json_record:
example_line = examples_to_line(json_record['examples'])
else:
example_line = ''
return example_line
def examples_to_line(example_list):
return "\\002".join([sanitize_string(example) for example in example_list])
def get_category_line(variants):
return "\\001".join([categories_from_json(variant) for variant in variants])
def categories_from_json(json_record):
if 'categories' in json_record:
category_line = ','.join([category.strip() for category in json_record['categories']])
else:
category_line = ''
return category_line
def sanitize_string(text):
return text.replace('\\', '\\\\').replace('"', '\\"')
def create_header_file(root, include_dir, path, all_function_list, function_type_set):
header_path = normalize_path_separators(f'{root}/{include_dir}/{path}_functions.hpp')
json_path = normalize_path_separators(f'{root}/{path}/functions.json')
with open(json_path, 'r') as f:
parsed_json = json.load(f)
new_text = get_header().replace('{HEADER}', path)
for entry in parsed_json:
function_text = ''
if 'struct' in entry:
struct_name = entry['struct']
else:
struct_name = get_struct_name(entry['name'])
if not legal_struct_name(struct_name):
print(f'Struct name {struct_name} is not a valid struct name!')
exit(1)
if struct_name in function_type_set:
raise Exception("Duplicate entry " + struct_name)
function_type_set[struct_name] = entry['type']
if entry['type'] == 'scalar_function':
function_text = 'static ScalarFunction GetFunction();'
all_function_list.append([entry['name'], f"DUCKDB_SCALAR_FUNCTION({struct_name})"])
elif entry['type'] == 'scalar_function_set':
function_text = 'static ScalarFunctionSet GetFunctions();'
all_function_list.append([entry['name'], f"DUCKDB_SCALAR_FUNCTION_SET({struct_name})"])
elif entry['type'] == 'aggregate_function':
function_text = 'static AggregateFunction GetFunction();'
all_function_list.append([entry['name'], f"DUCKDB_AGGREGATE_FUNCTION({struct_name})"])
elif entry['type'] == 'aggregate_function_set':
function_text = 'static AggregateFunctionSet GetFunctions();'
all_function_list.append([entry['name'], f"DUCKDB_AGGREGATE_FUNCTION_SET({struct_name})"])
else:
print("Unknown entry type " + entry['type'] + ' for entry ' + struct_name)
exit(1)
if 'variants' in entry:
parameter_line = get_parameter_line(entry['variants'])
description_line = get_description_line(entry['variants'])
example_line = get_example_line(entry['variants'])
category_line = get_category_line(entry['variants'])
else:
parameter_line = entry['parameters'].replace(' ', '') if 'parameters' in entry else ''
description_line = sanitize_string(entry['description'])
example_line = example_from_json(entry)
category_line = categories_from_json(entry)
if 'extra_functions' in entry:
for func_text in entry['extra_functions']:
function_text += '\n ' + func_text
new_text += (
'''struct {STRUCT} {
static constexpr const char *Name = "{NAME}";
static constexpr const char *Parameters = "{PARAMETERS}";
static constexpr const char *Description = "{DESCRIPTION}";
static constexpr const char *Example = "{EXAMPLE}";
static constexpr const char *Categories = "{CATEGORIES}";
{FUNCTION}
};
'''.replace(
'{STRUCT}', struct_name
)
.replace('{NAME}', entry['name'])
.replace('{PARAMETERS}', parameter_line)
.replace('{DESCRIPTION}', description_line)
.replace('{EXAMPLE}', example_line)
.replace('{CATEGORIES}', category_line)
.replace('{FUNCTION}', function_text)
)
alias_count = 1
if 'aliases' in entry:
for alias in entry['aliases']:
alias_struct_name = get_struct_name(alias)
if not legal_struct_name(alias_struct_name):
alias_struct_name = struct_name + 'Alias'
if alias_count > 1:
alias_struct_name += str(alias_count)
alias_count += 1
aliased_type = entry['type']
if aliased_type == 'scalar_function':
all_function_list.append([alias, f"DUCKDB_SCALAR_FUNCTION_ALIAS({alias_struct_name})"])
elif aliased_type == 'scalar_function_set':
all_function_list.append([alias, f"DUCKDB_SCALAR_FUNCTION_SET_ALIAS({alias_struct_name})"])
elif aliased_type == 'aggregate_function':
all_function_list.append([alias, f"DUCKDB_AGGREGATE_FUNCTION_ALIAS({alias_struct_name})"])
elif aliased_type == 'aggregate_function_set':
all_function_list.append([alias, f"DUCKDB_AGGREGATE_FUNCTION_SET_ALIAS({alias_struct_name})"])
else:
print("Unknown entry type " + aliased_type + ' for entry ' + struct_name)
exit(1)
function_type_set[alias_struct_name] = aliased_type
new_text += (
'''struct {STRUCT} {
using ALIAS = {ALIAS};
static constexpr const char *Name = "{NAME}";
};
'''.replace(
'{STRUCT}', alias_struct_name
)
.replace('{NAME}', alias)
.replace('{ALIAS}', struct_name)
)
new_text += get_footer()
with open(header_path, 'w+') as f:
f.write(new_text)
def create_function_list_file(root, group, all_function_list):
function_list_file = normalize_path_separators(f'{root}/{group}/function_list.cpp')
with open(function_list_file, 'r') as f:
text = f.read()
static_function = f'static const StaticFunctionDefinition {group}[]' ' = {'
pos = text.find(static_function)
header = text[:pos]
footer_lines = text[pos:].split('\n')
footer = ''
for i in range(len(footer_lines)):
if len(footer_lines[i]) == 0:
footer = '\n'.join(footer_lines[i:])
break
new_text = header
new_text += static_function + '\n'
all_function_list = sorted(all_function_list, key=lambda x: x[0])
for entry in all_function_list:
new_text += '\t' + entry[1] + ',\n'
new_text += '\tFINAL_FUNCTION\n};\n'
new_text += footer
with open(function_list_file, 'w+') as f:
f.write(new_text)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,309 @@
# use bison to generate the parser files
# the following version of bison is used:
# bison (GNU Bison) 2.3
import os
import subprocess
import re
import sys
from python_helpers import open_utf8
bison_location = "bison"
base_dir = 'third_party/libpg_query/grammar'
pg_dir = 'third_party/libpg_query'
namespace = 'duckdb_libpgquery'
counterexamples = False
run_update = False
verbose = False
for arg in sys.argv[1:]:
if arg.startswith("--bison="):
bison_location = arg.replace("--bison=", "")
elif arg.startswith("--counterexamples"):
counterexamples = True
elif arg.startswith("--update"):
run_update = True
# allow a prefix to the source and target directories
elif arg.startswith("--custom_dir_prefix"):
base_dir = arg.split("=")[1] + base_dir
pg_dir = arg.split("=")[1] + pg_dir
elif arg.startswith("--namespace"):
namespace = arg.split("=")[1]
elif arg.startswith("--verbose"):
verbose = True
else:
raise Exception(
"Unrecognized argument: "
+ arg
+ ", expected --counterexamples, --bison=/loc/to/bison, --custom_dir_prefix, --namespace, --verbose"
)
template_file = os.path.join(base_dir, 'grammar.y')
target_file = os.path.join(base_dir, 'grammar.y.tmp')
header_file = os.path.join(base_dir, 'grammar.hpp')
source_file = os.path.join(base_dir, 'grammar.cpp')
type_dir = os.path.join(base_dir, 'types')
rule_dir = os.path.join(base_dir, 'statements')
result_source = os.path.join(base_dir, 'grammar_out.cpp')
result_header = os.path.join(base_dir, 'grammar_out.hpp')
target_source_loc = os.path.join(pg_dir, 'src_backend_parser_gram.cpp')
target_header_loc = os.path.join(pg_dir, 'include/parser/gram.hpp')
kwlist_header = os.path.join(pg_dir, 'include/parser/kwlist.hpp')
# parse the keyword lists
def read_list_from_file(fname):
with open_utf8(fname, 'r') as f:
return [x.strip() for x in f.read().split('\n') if len(x.strip()) > 0]
kwdir = os.path.join(base_dir, 'keywords')
unreserved_keywords = read_list_from_file(os.path.join(kwdir, 'unreserved_keywords.list'))
colname_keywords = read_list_from_file(os.path.join(kwdir, 'column_name_keywords.list'))
func_name_keywords = read_list_from_file(os.path.join(kwdir, 'func_name_keywords.list'))
type_name_keywords = read_list_from_file(os.path.join(kwdir, 'type_name_keywords.list'))
reserved_keywords = read_list_from_file(os.path.join(kwdir, 'reserved_keywords.list'))
def strip_p(x):
if x.endswith("_P"):
return x[:-2]
else:
return x
unreserved_keywords.sort(key=lambda x: strip_p(x))
colname_keywords.sort(key=lambda x: strip_p(x))
func_name_keywords.sort(key=lambda x: strip_p(x))
type_name_keywords.sort(key=lambda x: strip_p(x))
reserved_keywords.sort(key=lambda x: strip_p(x))
statements = read_list_from_file(os.path.join(base_dir, 'statements.list'))
statements.sort()
if len(statements) == 0:
print("Need at least one statement")
exit(1)
# verify there are no duplicate keywords and create big sorted list of keywords
kwdict = {}
for kw in unreserved_keywords:
kwdict[kw] = 'UNRESERVED_KEYWORD'
for kw in colname_keywords:
kwdict[kw] = 'COL_NAME_KEYWORD'
for kw in func_name_keywords:
kwdict[kw] = 'TYPE_FUNC_NAME_KEYWORD'
for kw in type_name_keywords:
kwdict[kw] = 'TYPE_FUNC_NAME_KEYWORD'
for kw in reserved_keywords:
kwdict[kw] = 'RESERVED_KEYWORD'
kwlist = [(x, kwdict[x]) for x in kwdict.keys()]
kwlist.sort(key=lambda x: strip_p(x[0]))
# now generate kwlist.h
# PG_KEYWORD("abort", ABORT_P, UNRESERVED_KEYWORD)
kwtext = (
"""
namespace """
+ namespace
+ """ {
#define PG_KEYWORD(a,b,c) {a,b,c},
const PGScanKeyword ScanKeywords[] = {
"""
)
for tpl in kwlist:
kwtext += 'PG_KEYWORD("%s", %s, %s)\n' % (strip_p(tpl[0]).lower(), tpl[0], tpl[1])
kwtext += (
"""
};
const int NumScanKeywords = lengthof(ScanKeywords);
} // namespace """
+ namespace
+ """
"""
)
with open_utf8(kwlist_header, 'w+') as f:
f.write(kwtext)
# generate the final main.y.tmp file
# first read the template file
with open_utf8(template_file, 'r') as f:
text = f.read()
# now perform a series of replacements in the file to construct the final yacc file
def get_file_contents(fpath, add_line_numbers=False):
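# When requested, prefix the fragment with a '#line 1 "<path>"' directive so that compiler and bison
# diagnostics in the generated parser point back at the original grammar fragment.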
with open_utf8(fpath, 'r') as f:
result = f.read()
if add_line_numbers:
return '#line 1 "%s"\n' % (fpath,) + result
else:
return result
# grammar.hpp
text = text.replace("{{{ GRAMMAR_HEADER }}}", get_file_contents(header_file, True))
# grammar.cpp
text = text.replace("{{{ GRAMMAR_SOURCE }}}", get_file_contents(source_file, True))
# keyword list
kw_token_list = "%token <keyword> " + " ".join([x[0] for x in kwlist])
text = text.replace("{{{ KEYWORDS }}}", kw_token_list)
# statements
stmt_list = "stmt: " + "\n\t| ".join(statements) + "\n\t| /*EMPTY*/\n\t{ $$ = NULL; }\n"
text = text.replace("{{{ STATEMENTS }}}", stmt_list)
# keywords
# keywords can EITHER be reserved, unreserved, or some combination of (col_name, type_name, func_name)
# that means duplicates are ONLY allowed between (col_name, type_name and func_name)
# having a keyword be both reserved and unreserved is an error
# as is having a keyword both reserved and col_name, for example
# verify that this is the case
reserved_dict = {}
unreserved_dict = {}
other_dict = {}
for r in reserved_keywords:
if r in reserved_dict:
print("Duplicate keyword " + r + " in reserved keywords")
exit(1)
reserved_dict[r] = True
for ur in unreserved_keywords:
if ur in unreserved_dict:
print("Duplicate keyword " + ur + " in unreserved keywords")
exit(1)
if ur in reserved_dict:
print("Keyword " + ur + " is marked as both unreserved and reserved")
exit(1)
unreserved_dict[ur] = True
def add_to_other_keywords(kw, list_name):
global unreserved_dict
global reserved_dict
global other_dict
if kw in unreserved_dict:
print("Keyword " + kw + " is marked as both unreserved and " + list_name)
exit(1)
if kw in reserved_dict:
print("Keyword " + kw + " is marked as both reserved and " + list_name)
exit(1)
other_dict[kw] = True
for cr in colname_keywords:
add_to_other_keywords(cr, "colname")
type_func_name_dict = {}
for tr in type_name_keywords:
add_to_other_keywords(tr, "typename")
type_func_name_dict[tr] = True
for fr in func_name_keywords:
add_to_other_keywords(fr, "funcname")
type_func_name_dict[fr] = True
type_func_name_keywords = list(type_func_name_dict.keys())
type_func_name_keywords.sort()
all_keywords = list(reserved_dict.keys()) + list(unreserved_dict.keys()) + list(other_dict.keys())
all_keywords.sort()
other_keyword = list(other_dict.keys())
other_keyword.sort()
kw_definitions = "unreserved_keyword: " + " | ".join(unreserved_keywords) + "\n"
kw_definitions += "col_name_keyword: " + " | ".join(colname_keywords) + "\n"
kw_definitions += "func_name_keyword: " + " | ".join(func_name_keywords) + "\n"
kw_definitions += "type_name_keyword: " + " | ".join(type_name_keywords) + "\n"
kw_definitions += "other_keyword: " + " | ".join(other_keyword) + "\n"
kw_definitions += "type_func_name_keyword: " + " | ".join(type_func_name_keywords) + "\n"
kw_definitions += "reserved_keyword: " + " | ".join(reserved_keywords) + "\n"
text = text.replace("{{{ KEYWORD_DEFINITIONS }}}", kw_definitions)
# types
def concat_dir(dname, extension, add_line_numbers=False):
result = ""
for fname in os.listdir(dname):
fpath = os.path.join(dname, fname)
if os.path.isdir(fpath):
result += concat_dir(fpath, extension)
else:
if not fname.endswith(extension):
continue
result += get_file_contents(fpath, add_line_numbers)
return result
type_definitions = concat_dir(type_dir, ".yh")
# add statement types as well
for stmt in statements:
type_definitions += "%type <node> " + stmt + "\n"
text = text.replace("{{{ TYPES }}}", type_definitions)
# grammar rules
grammar_rules = concat_dir(rule_dir, ".y", True)
text = text.replace("{{{ GRAMMAR RULES }}}", grammar_rules)
# finally write the yacc file into the target file
with open_utf8(target_file, 'w+') as f:
f.write(text)
# generate the bison
cmd = [bison_location]
if counterexamples:
print("Attempting to print counterexamples (-Wcounterexamples)")
cmd += ["-Wcounterexamples"]
if run_update:
cmd += ["--update"]
if verbose:
cmd += ["--verbose"]
cmd += ["-o", result_source, "-d", target_file]
print(' '.join(cmd))
proc = subprocess.Popen(cmd, stderr=subprocess.PIPE)
res = proc.wait(timeout=10) # ensure CI does not hang as was seen when running with Bison 3.x release.
if res != 0:
text = proc.stderr.read().decode('utf8')
print(text)
if 'shift/reduce' in text and not counterexamples:
print("---------------------------------------------------------------------")
print("In case of shift/reduce conflicts, try re-running with --counterexamples")
print("Note: this requires a more recent version of Bison (e.g. version 3.8)")
print("On a Macbook you can obtain this using \"brew install bison\"")
if counterexamples and 'time limit exceeded' in text:
print("---------------------------------------------------------------------")
print(
"The counterexamples time limit was exceeded. This likely means that no useful counterexample was generated."
)
print("")
print("The counterexamples time limit can be increased by setting the TIME_LIMIT environment variable, e.g.:")
print("export TIME_LIMIT=100")
exit(1)
os.rename(result_source, target_source_loc)
os.rename(result_header, target_header_loc)
with open_utf8(target_source_loc, 'r') as f:
text = f.read()
text = text.replace('#include "grammar_out.hpp"', '#include "include/parser/gram.hpp"')
text = text.replace('yynerrs = 0;', 'yynerrs = 0; (void)yynerrs;')
with open_utf8(target_source_loc, 'w+') as f:
f.write(text)

View File

@@ -0,0 +1,399 @@
# Script that takes src/include/duckdb/common/enums/optimizer_type.hpp, extracts the optimizer types
# and adds them to the metrics types.
# Then it creates a new file src/include/duckdb/common/enums/metric_type.hpp with the new metrics types as enums.
# and generates both test/sql/pragma/profiling/test_default_profiling_settings.test
# and test/sql/pragma/profiling/test_custom_profiling_optimizer.test
import re
import os
os.chdir(os.path.dirname(__file__))
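# Run from the scripts/ directory so the relative '..' paths below resolve regardless of the caller's working directory.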
metrics_header_file = os.path.join("..", "src", "include", "duckdb", "common", "enums", "metric_type.hpp")
metrics_cpp_file = os.path.join("..", "src", "common", "enums", "metric_type.cpp")
optimizer_file = os.path.join("..", "src", "include", "duckdb", "common", "enums", "optimizer_type.hpp")
metrics = [
"ATTACH_LOAD_STORAGE_LATENCY",
"ATTACH_REPLAY_WAL_LATENCY",
"BLOCKED_THREAD_TIME",
"CHECKPOINT_LATENCY",
"CPU_TIME",
"CUMULATIVE_CARDINALITY",
"CUMULATIVE_ROWS_SCANNED",
"EXTRA_INFO",
"LATENCY",
"OPERATOR_CARDINALITY",
"OPERATOR_NAME",
"OPERATOR_ROWS_SCANNED",
"OPERATOR_TIMING",
"OPERATOR_TYPE",
"QUERY_NAME",
"RESULT_SET_SIZE",
"ROWS_RETURNED",
"SYSTEM_PEAK_BUFFER_MEMORY",
"SYSTEM_PEAK_TEMP_DIR_SIZE",
"TOTAL_BYTES_READ",
"TOTAL_BYTES_WRITTEN",
"WAITING_TO_ATTACH_LATENCY",
]
phase_timing_metrics = [
"ALL_OPTIMIZERS",
"CUMULATIVE_OPTIMIZER_TIMING",
"PHYSICAL_PLANNER",
"PHYSICAL_PLANNER_COLUMN_BINDING",
"PHYSICAL_PLANNER_CREATE_PLAN",
"PHYSICAL_PLANNER_RESOLVE_TYPES",
"PLANNER",
"PLANNER_BINDING",
]
query_global_metrics = [
"ATTACH_LOAD_STORAGE_LATENCY",
"ATTACH_REPLAY_WAL_LATENCY",
"BLOCKED_THREAD_TIME",
"CHECKPOINT_LATENCY",
"SYSTEM_PEAK_BUFFER_MEMORY",
"SYSTEM_PEAK_TEMP_DIR_SIZE",
"WAITING_TO_ATTACH_LATENCY",
]
optimizer_types = []
# Regular expression to match the enum values
enum_pattern = r'\s*([A-Z_]+)\s*=\s*\d+,?|\s*([A-Z_]+),?'
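# The pattern accepts enumerator lines both with and without an explicit value,
# e.g. "JOIN_ORDER," as well as "JOIN_ORDER = 3," (the assigned value itself is ignored).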
inside_enum = False
# open the optimizer file and extract the optimizer types
with open(optimizer_file, "r") as f:
for line in f:
line = line.strip()
if line.startswith("enum class OptimizerType"):
inside_enum = True
continue
if inside_enum and line.startswith("};"):
break
if inside_enum:
match = re.match(enum_pattern, line)
if match:
optimizer_type = match[1] if match[1] else match[2]
if optimizer_type == "INVALID":
continue
optimizer_types.append(optimizer_type)
header = """//-------------------------------------------------------------------------
// DuckDB
//
//
// duckdb/common/enums/metric_type.hpp
//
// This file is automatically generated by scripts/generate_metric_enums.py
// Do not edit this file manually, your changes will be overwritten
//-------------------------------------------------------------------------\n
"""
typedefs = """struct MetricsTypeHashFunction {
uint64_t operator()(const MetricsType &index) const {
return std::hash<uint8_t>()(static_cast<uint8_t>(index));
}
};
typedef unordered_set<MetricsType, MetricsTypeHashFunction> profiler_settings_t;
typedef unordered_map<MetricsType, Value, MetricsTypeHashFunction> profiler_metrics_t;
"""
get_optimizer_metric_fun = 'GetOptimizerMetrics()'
get_phase_timing_metric_fun = 'GetPhaseTimingMetrics()'
get_optimizer_metric_by_type_fun = 'GetOptimizerMetricByType(OptimizerType type)'
get_optimizer_type_by_metric_fun = 'GetOptimizerTypeByMetric(MetricsType type)'
is_optimizer_metric_fun = 'IsOptimizerMetric(MetricsType type)'
is_phase_timing_metric_fun = 'IsPhaseTimingMetric(MetricsType type)'
is_query_global_metric_fun = 'IsQueryGlobalMetric(MetricsType type)'
metrics_class = 'MetricsUtils'
# Write the metric type header file
with open(metrics_header_file, "w") as f:
f.write(header)
f.write('#pragma once\n\n')
f.write('#include "duckdb/common/types/value.hpp"\n')
f.write('#include "duckdb/common/unordered_set.hpp"\n')
f.write('#include "duckdb/common/unordered_map.hpp"\n')
f.write('#include "duckdb/common/constants.hpp"\n')
f.write('#include "duckdb/common/enum_util.hpp"\n')
f.write('#include "duckdb/common/enums/optimizer_type.hpp"\n\n')
f.write("namespace duckdb {\n\n")
f.write("enum class MetricsType : uint8_t {\n")
for metric in metrics:
f.write(f" {metric},\n")
for metric in phase_timing_metrics:
f.write(f" {metric},\n")
for metric in optimizer_types:
f.write(f" OPTIMIZER_{metric},\n")
f.write("};\n\n")
f.write(typedefs)
f.write('class MetricsUtils {\n')
f.write('public:\n')
f.write(f' static profiler_settings_t {get_optimizer_metric_fun};\n')
f.write(f' static profiler_settings_t {get_phase_timing_metric_fun};\n\n')
f.write(f' static MetricsType {get_optimizer_metric_by_type_fun};\n')
f.write(f' static OptimizerType {get_optimizer_type_by_metric_fun};\n\n')
f.write(f' static bool {is_optimizer_metric_fun};\n')
f.write(f' static bool {is_phase_timing_metric_fun};\n')
f.write(f' static bool {is_query_global_metric_fun};\n')
f.write('};\n\n')
f.write("} // namespace duckdb\n")
# Write the metric_type.cpp file
with open(metrics_cpp_file, "w") as f:
f.write(header)
f.write('#include "duckdb/common/enums/metric_type.hpp"\n')
f.write("namespace duckdb {\n\n")
f.write(f'profiler_settings_t {metrics_class}::{get_optimizer_metric_fun} {{\n')
f.write(f" return {{\n")
for metric in optimizer_types:
f.write(f" MetricsType::OPTIMIZER_{metric},\n")
f.write(" };\n")
f.write("}\n\n")
f.write(f'profiler_settings_t {metrics_class}::{get_phase_timing_metric_fun} {{\n')
f.write(f" return {{\n")
for metric in phase_timing_metrics:
f.write(f" MetricsType::{metric},\n")
f.write(" };\n")
f.write("}\n\n")
f.write(f'MetricsType {metrics_class}::{get_optimizer_metric_by_type_fun} {{\n')
f.write(' switch(type) {\n')
for metric in optimizer_types:
f.write(f" case OptimizerType::{metric}:\n")
f.write(f" return MetricsType::OPTIMIZER_{metric};\n")
f.write(' default:\n')
f.write(
' throw InternalException("OptimizerType %s cannot be converted to a MetricsType", '
'EnumUtil::ToString(type));\n'
)
f.write(' };\n')
f.write('}\n\n')
f.write(f'OptimizerType {metrics_class}::{get_optimizer_type_by_metric_fun} {{\n')
f.write(' switch(type) {\n')
for metric in optimizer_types:
f.write(f" case MetricsType::OPTIMIZER_{metric}:\n")
f.write(f" return OptimizerType::{metric};\n")
f.write(' default:\n')
f.write(' return OptimizerType::INVALID;\n')
f.write(' };\n')
f.write('}\n\n')
f.write(f'bool {metrics_class}::{is_optimizer_metric_fun} {{\n')
f.write(' switch(type) {\n')
for metric in optimizer_types:
f.write(f" case MetricsType::OPTIMIZER_{metric}:\n")
f.write(' return true;\n')
f.write(' default:\n')
f.write(' return false;\n')
f.write(' };\n')
f.write('}\n\n')
f.write(f'bool {metrics_class}::{is_phase_timing_metric_fun} {{\n')
f.write(' switch(type) {\n')
for metric in phase_timing_metrics:
f.write(f" case MetricsType::{metric}:\n")
f.write(' return true;\n')
f.write(' default:\n')
f.write(' return false;\n')
f.write(' };\n')
f.write('}\n\n')
f.write(f'bool {metrics_class}::{is_query_global_metric_fun} {{\n')
f.write(' switch(type) {\n')
for metric in query_global_metrics:
f.write(f" case MetricsType::{metric}:\n")
f.write(' return true;\n')
f.write(' default:\n')
f.write(' return false;\n')
f.write(' };\n')
f.write('}\n\n')
f.write("} // namespace duckdb\n")
# Generate the test files
test_names = ["test_default_profiling_settings", "test_custom_profiling_optimizer"]
test_descriptions = ["default", "custom optimizer"]
test_files = [os.path.join("..", "test", "sql", "pragma", "profiling", f"{name}.test") for name in test_names]
def write_statement(f, statement_type, statement):
f.write(f"statement {statement_type}\n")
f.write(statement + "\n\n")
def write_query(f, options, query):
f.write(f"query {options}\n")
f.write(query + "\n")
f.write("----\n")
def write_default_query(f):
query = "SELECT unnest(['Maia', 'Thijs', 'Mark', 'Hannes', 'Tom', 'Max', 'Carlo', 'Sam', 'Tania']) AS names ORDER BY random();"
write_statement(f, "ok", query)
write_statement(f, "ok", "PRAGMA disable_profiling;")
def write_get_custom_profiling_settings(f):
query = """
SELECT unnest(res) FROM (
SELECT current_setting('custom_profiling_settings') AS raw_setting,
raw_setting.trim('{}') AS setting,
string_split(setting, ', ') AS res
) ORDER BY ALL;
""".strip()
write_query(f, "I", query)
def write_custom_profiling_optimizer(f):
write_statement(f, "ok", "PRAGMA custom_profiling_settings='{\"ALL_OPTIMIZERS\": \"true\"}';")
write_default_query(f)
query = """
SELECT * FROM (
SELECT unnest(res) str FROM (
SELECT current_setting('custom_profiling_settings') as raw_setting,
raw_setting.trim('{}') AS setting,
string_split(setting, ', ') AS res
)
) WHERE '"true"' NOT in str
ORDER BY ALL \
""".strip()
write_query(f, "I", query)
f.write("\n")
write_statement(f, "ok", "PRAGMA custom_profiling_settings='{}'")
write_default_query(f)
write_get_custom_profiling_settings(f)
f.write("(empty)\n\n")
write_statement(f, "ok", "PRAGMA custom_profiling_settings='{\"OPTIMIZER_JOIN_ORDER\": \"true\"}'")
write_default_query(f)
write_get_custom_profiling_settings(f)
f.write("\"OPTIMIZER_JOIN_ORDER\": \"true\"\n\n")
write_statement(
f, "ok", "CREATE OR REPLACE TABLE metrics_output AS SELECT * FROM '__TEST_DIR__/profiling_output.json';"
)
query = """
SELECT
CASE WHEN optimizer_join_order > 0 THEN 'true'
ELSE 'false' END
FROM metrics_output;
""".strip()
write_query(f, "I", query)
f.write("true\n\n")
write_statement(f, "ok", "SET disabled_optimizers = 'JOIN_ORDER';")
write_statement(f, "ok", "PRAGMA custom_profiling_settings='{\"OPTIMIZER_JOIN_ORDER\": \"true\"}'")
write_default_query(f)
write_get_custom_profiling_settings(f)
f.write("(empty)\n\n")
write_statement(f, "ok", "PRAGMA custom_profiling_settings='{\"CUMULATIVE_OPTIMIZER_TIMING\": \"true\"}';")
write_default_query(f)
write_statement(
f, "ok", "CREATE OR REPLACE TABLE metrics_output AS SELECT * FROM '__TEST_DIR__/profiling_output.json';"
)
query = """
SELECT
CASE WHEN cumulative_optimizer_timing > 0 THEN 'true'
ELSE 'false' END
FROM metrics_output;
""".strip()
write_query(f, "I", query)
f.write("true\n\n")
f.write("# All phase timings must be collected when using detailed profiling mode.\n\n")
write_statement(f, "ok", "RESET custom_profiling_settings;")
write_statement(f, "ok", "SET profiling_mode = 'detailed';")
write_default_query(f)
query = """
SELECT * FROM (
SELECT unnest(res) str FROM (
SELECT current_setting('custom_profiling_settings') AS raw_setting,
raw_setting.trim('{}') AS setting,
string_split(setting, ', ') AS res
)
)
WHERE '"true"' NOT IN str
ORDER BY ALL
""".strip()
write_query(f, "I", query)
f.write("\n")
write_statement(f, "ok", "RESET custom_profiling_settings;")
write_statement(f, "ok", "SET profiling_mode = 'standard';")
# Create the test files
for test_file, name, description in zip(test_files, test_names, test_descriptions):
with open(test_file, "w") as f:
display_name = test_file.replace("../", "")
f.write(f"# name: {display_name}\n")
f.write(f"# description: Test {description} profiling settings.\n")
f.write("# group: [profiling]\n\n")
f.write("# This file is automatically generated by scripts/generate_metric_enums.py\n")
f.write("# Do not edit this file manually, your changes will be overwritten\n\n")
f.write("require json\n\n")
write_statement(f, "ok", "PRAGMA enable_verification;")
write_statement(f, "ok", "PRAGMA enable_profiling = 'json';")
write_statement(f, "ok", "PRAGMA profiling_output = '__TEST_DIR__/profiling_output.json';")
if name == "test_custom_profiling_optimizer":
write_custom_profiling_optimizer(f)
write_default_query(f)
write_get_custom_profiling_settings(f)
metrics.sort()
for metric in metrics:
f.write(f'"{metric}": "true"\n')
f.write("\n")
write_statement(
f, "ok", "CREATE OR REPLACE TABLE metrics_output AS SELECT * FROM '__TEST_DIR__/profiling_output.json';"
)
write_statement(f, "ok", "SELECT cpu_time, extra_info, rows_returned, latency FROM metrics_output;")

View File

@@ -0,0 +1,39 @@
# this script re-generates the binary file used for Test deserialized plans from file
# before running this script, increment the version number in src/planner/logical_operator.cpp and
# recompile (make debug)
# Note that the test is not linked unless you BUILD_TPCH=1
import os
import subprocess
from python_helpers import open_utf8
shell_proc = os.path.join('build', 'debug', 'test', 'unittest')
gen_binary_file = os.path.join('test', 'api', 'serialized_plans', 'serialized_plans.binary')
def try_remove_file(fname):
try:
os.remove(fname)
except:
pass
try_remove_file(gen_binary_file)
def run_test(test):
print(test)
env = os.environ.copy()
env["GEN_PLAN_STORAGE"] = "1"
res = subprocess.run([shell_proc, test], capture_output=True, env=env)
stdout = res.stdout.decode('utf8').strip()
stderr = res.stderr.decode('utf8').strip()
if res.returncode != 0:
print("Failed to create binary file!")
print("----STDOUT----")
print(stdout)
print("----STDERR----")
print(stderr)
run_test("Generate serialized plans file")

View File

@@ -0,0 +1,30 @@
#!/usr/bin/env bash
#Note: DONT run as root
set -e
DUCKDB_PATH=duckdb
if test -f build/release/duckdb; then
DUCKDB_PATH=build/release/duckdb
elif test -f build/reldebug/duckdb; then
DUCKDB_PATH=build/reldebug/duckdb
elif test -f build/debug/duckdb; then
DUCKDB_PATH=build/debug/duckdb
fi
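# Prefer a locally built shell (release, then reldebug, then debug); otherwise fall back to a 'duckdb' on the PATH.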
mkdir -p data/parquet-testing/presigned
generate_large_parquet_query=$(cat <<EOF
CALL DBGEN(sf=1);
COPY lineitem TO 'data/parquet-testing/presigned/presigned-url-lineitem.parquet' (FORMAT 'parquet');
EOF
)
$DUCKDB_PATH -c "$generate_large_parquet_query"
mkdir -p data/attach_test/
# Generate Storage Version
$DUCKDB_PATH data/attach_test/attach.db < test/sql/storage_version/generate_storage_version.sql
$DUCKDB_PATH data/attach_test/lineitem_sf1.db -c "CALL dbgen(sf=1)"

View File

@@ -0,0 +1,858 @@
import os
import json
import re
import argparse
from enum import Enum
from typing import Dict, Optional, Tuple, List
parser = argparse.ArgumentParser(description='Generate serialization code')
parser.add_argument('--source', type=str, help='Source directory')
parser.add_argument('--target', type=str, help='Target directory')
args = parser.parse_args()
class MemberVariableStatus(Enum):
# Both serialized and deserialized
EXISTING = 1
# Not serialized, but is deserialized
READ_ONLY = 2
# Not serialized, not deserialized
DELETED = 3
def get_file_list():
if args.source is None:
targets = [
{'source': 'src/include/duckdb/storage/serialization', 'target': 'src/storage/serialization'},
{'source': 'extension/parquet/include/', 'target': 'extension/parquet'},
{'source': 'extension/json/include/', 'target': 'extension/json'},
]
else:
targets = [
{'source': args.source, 'target': args.target},
]
file_list = []
for target in targets:
source_base = os.path.sep.join(target['source'].split('/'))
target_base = os.path.sep.join(target['target'].split('/'))
for fname in os.listdir(source_base):
if '.json' not in fname:
continue
if '_enums.json' in fname:
continue
file_list.append(
{
'source': os.path.join(source_base, fname),
'target': os.path.join(target_base, 'serialize_' + fname.replace('.json', '.cpp')),
}
)
return file_list
scripts_dir = os.path.dirname(os.path.abspath(__file__))
version_map_path = os.path.join(scripts_dir, '..', 'src', 'storage', 'version_map.json')
version_map_file = open(version_map_path)
version_map = json.load(version_map_file)
def verify_serialization_versions(version_map):
serialization = version_map['serialization']['values']
if list(serialization.keys())[-1] != 'latest':
print(f"The version map ({version_map_path}) for serialization versions must end in 'latest'!")
exit(1)
verify_serialization_versions(version_map)
def lookup_serialization_version(version: str):
if version.lower() == "latest":
print(
f"'latest' is not an allowed 'version' to use in serialization JSON files, please provide a duckdb version"
)
exit(1)
versions = version_map['serialization']['values']
if version not in versions:
from packaging.version import Version
current_version = Version(version)
# This version does not exist in the version map
# Which is allowed for unreleased versions, they will get mapped to 'latest' instead
last_registered_version = Version(list(versions.keys())[-2])
if current_version < last_registered_version:
# The version was lower than the last defined version, which is not allowed
print(
f"Specified version ({current_version}) could not be found in the version_map.json, and it is lower than the last defined version ({last_registered_version})!"
)
exit(1)
if hasattr(lookup_serialization_version, 'latest'):
# We have already mapped a version to 'latest', check that the versions match
latest_version = getattr(lookup_serialization_version, 'latest')
if current_version != latest_version:
print(
f"Found more than one version that is not present in the version_map.json!: Current: {current_version}, Latest: {latest_version}"
)
exit(1)
else:
setattr(lookup_serialization_version, 'latest', current_version)
return versions['latest']
return versions[version]
INCLUDE_FORMAT = '#include "{filename}"\n'
HEADER = '''//===----------------------------------------------------------------------===//
// This file is automatically generated by scripts/generate_serialization.py
// Do not edit this file manually, your changes will be overwritten
//===----------------------------------------------------------------------===//
{include_list}
namespace duckdb {{
'''
FOOTER = '''
} // namespace duckdb
'''
TEMPLATED_BASE_FORMAT = '''
template <typename {template_name}>'''
SERIALIZE_BASE_FORMAT = '''
void {class_name}::Serialize(Serializer &serializer) const {{
{members}}}
'''
SERIALIZE_ELEMENT_FORMAT = (
'\tserializer.WriteProperty<{property_type}>({property_id}, "{property_key}", {property_name}{property_default});\n'
)
BASE_SERIALIZE_FORMAT = '\t{base_class_name}::Serialize(serializer);\n'
POINTER_RETURN_FORMAT = '{pointer}<{class_name}>'
DESERIALIZE_BASE_FORMAT = '''
{deserialize_return} {class_name}::Deserialize(Deserializer &deserializer) {{
{members}
}}
'''
SWITCH_CODE_FORMAT = '''\tswitch ({switch_variable}) {{
{case_statements}\tdefault:
\t\tthrow SerializationException("Unsupported type for deserialization of {base_class}!");
\t}}
'''
SET_DESERIALIZE_PARAMETER_FORMAT = '\tdeserializer.Set<{property_type}>({property_name});\n'
UNSET_DESERIALIZE_PARAMETER_FORMAT = '\tdeserializer.Unset<{property_type}>();\n'
GET_DESERIALIZE_PARAMETER_FORMAT = 'deserializer.Get<{property_type}>()'
TRY_GET_DESERIALIZE_PARAMETER_FORMAT = 'deserializer.TryGet<{property_type}>()'
SWITCH_HEADER_FORMAT = '\tcase {enum_type}::{enum_value}:\n'
SWITCH_STATEMENT_FORMAT = (
SWITCH_HEADER_FORMAT
+ '''\t\tresult = {class_deserialize}::Deserialize(deserializer);
\t\tbreak;
'''
)
DESERIALIZE_ELEMENT_FORMAT = '\tauto {property_name} = deserializer.ReadProperty<{property_type}>({property_id}, "{property_key}"{property_default});\n'
DESERIALIZE_ELEMENT_BASE_FORMAT = '\tauto {property_name} = deserializer.ReadProperty<unique_ptr<{base_property}>>({property_id}, "{property_key}"{property_default});\n'
DESERIALIZE_ELEMENT_CLASS_FORMAT = '\tdeserializer.ReadProperty<{property_type}>({property_id}, "{property_key}", result{assignment}{property_name}{property_default});\n'
DESERIALIZE_ELEMENT_CLASS_BASE_FORMAT = '\tauto {property_name} = deserializer.ReadProperty<unique_ptr<{base_property}>>({property_id}, "{property_key}"{property_default});\n\tresult{assignment}{property_name} = unique_ptr_cast<{base_property}, {derived_property}>(std::move({property_name}));\n'
MOVE_LIST = [
'string',
'ParsedExpression*',
'CommonTableExpressionMap',
'LogicalType',
'ColumnDefinition',
'BaseStatistics',
'BoundLimitNode',
]
REFERENCE_LIST = ['ClientContext', 'bound_parameter_map_t', 'Catalog']
def is_container(type):
return '<' in type and 'CSVOption' not in type
def is_pointer(type):
return type.endswith('*') or type.startswith('shared_ptr<')
def is_zeroable(type):
return type in [
'bool',
'int8_t',
'int16_t',
'int32_t',
'int64_t',
'uint8_t',
'uint16_t',
'uint32_t',
'uint64_t',
'idx_t',
'size_t',
'int',
]
def requires_move(type):
return is_container(type) or is_pointer(type) or type in MOVE_LIST
def replace_pointer(type):
return re.sub('([a-zA-Z0-9]+)[*]', 'unique_ptr<\\1>', type)
def get_default_argument(default_value):
return f'{default_value}'.lower() if type(default_value) == bool else f'{default_value}'
def get_deserialize_element_template(
template,
property_name,
property_key,
property_id,
property_type,
has_default,
default_value,
status: MemberVariableStatus,
pointer_type,
):
if status == MemberVariableStatus.READ_ONLY and not has_default:
print("'read_only' status is not allowed without a default value")
exit(1)
# read_method = 'ReadProperty'
assignment = '.' if pointer_type == 'none' else '->'
default_argument = '' if default_value is None else f', {get_default_argument(default_value)}'
if status == MemberVariableStatus.DELETED:
template = template.replace(', result{assignment}{property_name}', '').replace(
'ReadProperty', 'ReadDeletedProperty'
)
elif has_default and default_value is None:
template = template.replace('ReadProperty', 'ReadPropertyWithDefault')
elif has_default and default_value is not None:
template = template.replace('ReadProperty', 'ReadPropertyWithExplicitDefault')
template = template.format(
property_name=property_name,
property_key=property_key,
property_id=str(property_id),
property_default=default_argument,
property_type=property_type,
assignment=assignment,
)
if status == MemberVariableStatus.DELETED:
template = template.replace(f'auto {property_name} = ', '')
return template
def get_deserialize_assignment(property_name, property_type, pointer_type):
assignment = '.' if pointer_type == 'none' else '->'
property = property_name.replace('.', '_')
if requires_move(property_type):
property = f'std::move({property})'
return f'\tresult{assignment}{property_name} = {property};\n'
def get_return_value(pointer_type, class_name):
if pointer_type == 'none':
return class_name
return POINTER_RETURN_FORMAT.format(pointer=pointer_type, class_name=class_name)
def generate_return(class_entry):
if class_entry.base is None or class_entry.constructor_method is not None:
return '\treturn result;'
else:
return '\treturn std::move(result);'
def parse_status(status: str):
if status == 'deleted':
return MemberVariableStatus.DELETED
if status == 'read_only':
return MemberVariableStatus.READ_ONLY
if status == 'existing':
return MemberVariableStatus.EXISTING
valid_options = ['deleted', 'read_only', 'existing']
valid_options_string = ", ".join(valid_options)
print(f"Invalid 'status' ('{status}') encountered, valid options are: {valid_options_string}")
exit(1)
# FIXME: python has __slots__ for this, so it's enforced by Python itself
# see: https://wiki.python.org/moin/UsingSlots
supported_member_entries = [
'id',
'name',
'type',
'property',
'serialize_property',
'deserialize_property',
'base',
'default',
'status',
'version',
]
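# A __slots__-based sketch of the idea in the FIXME above (not used here), where assigning any
# unknown attribute would raise AttributeError automatically:
#   class MemberVariable:
#       __slots__ = ('id', 'name', 'type', 'property', 'serialize_property', ...)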
def has_default_by_default(type):
if is_pointer(type):
return True
if is_container(type):
if 'IndexVector' in type:
return False
if 'CSVOption' in type:
return False
return True
if type == 'string':
return True
if is_zeroable(type):
return True
return False
class MemberVariable:
def __init__(self, entry):
self.id = entry['id']
self.name = entry['name']
self.type = entry['type']
self.base = None
self.has_default = False
self.default = None
self.status: MemberVariableStatus = MemberVariableStatus.EXISTING
self.version: str = 'v0.10.2'
if 'property' in entry:
self.serialize_property = entry['property']
self.deserialize_property = entry['property']
else:
self.serialize_property = self.name
self.deserialize_property = self.name
if 'version' in entry:
self.version = entry['version']
if 'serialize_property' in entry:
self.serialize_property = entry['serialize_property']
if 'deserialize_property' in entry:
self.deserialize_property = entry['deserialize_property']
if 'default' in entry:
self.has_default = True
self.default = entry['default']
if 'status' in entry:
self.status = parse_status(entry['status'])
if self.default is None:
# default default
self.has_default = has_default_by_default(self.type)
if 'base' in entry:
self.base = entry['base']
for key in entry.keys():
if key not in supported_member_entries:
print(
f"Unsupported key \"{key}\" in member variable, key should be in set {str(supported_member_entries)}"
)
supported_serialize_entries = [
'class',
'class_type',
'pointer_type',
'base',
'enum',
'constructor',
'constructor_method',
'custom_implementation',
'custom_switch_code',
'members',
'return_type',
'set_parameters',
'includes',
'finalize_deserialization',
]
class SerializableClass:
def __init__(self, entry):
self.name = entry['class']
self.is_base_class = 'class_type' in entry
self.base = None
self.base_object = None
self.enum_value = None
self.enum_entries = []
self.set_parameter_names = []
self.set_parameters = []
self.pointer_type = 'unique_ptr'
self.constructor: Optional[List[str]] = None
self.constructor_method = None
self.members: Optional[List[MemberVariable]] = None
self.custom_implementation = False
self.custom_switch_code = None
self.children: Dict[str, SerializableClass] = {}
self.return_type = self.name
self.return_class = self.name
self.finalize_deserialization = None
if 'finalize_deserialization' in entry:
self.finalize_deserialization = entry['finalize_deserialization']
if self.is_base_class:
self.enum_value = entry['class_type']
if 'pointer_type' in entry:
self.pointer_type = entry['pointer_type']
if 'base' in entry:
self.base = entry['base']
self.enum_entries = entry['enum']
if type(self.enum_entries) is str:
self.enum_entries = [self.enum_entries]
self.return_type = self.base
if 'constructor' in entry:
self.constructor = entry['constructor']
if not isinstance(self.constructor, list):
print(f"constructor for {self.name} must be a list, but is of type {str(type(self.constructor))}")
exit(1)
if 'constructor_method' in entry:
self.constructor_method = entry['constructor_method']
if self.constructor is not None:
print(
"Not allowed to mix 'constructor_method' and 'constructor', 'constructor_method' will implicitly receive all parameters"
)
exit(1)
if 'custom_implementation' in entry and entry['custom_implementation']:
self.custom_implementation = True
if 'custom_switch_code' in entry:
self.custom_switch_code = entry['custom_switch_code']
if 'members' in entry:
self.members = [MemberVariable(x) for x in entry['members']]
if 'return_type' in entry:
self.return_type = entry['return_type']
self.return_class = self.return_type
if 'set_parameters' in entry:
self.set_parameter_names = entry['set_parameters']
for set_parameter_name in self.set_parameter_names:
found = False
assert self.members is not None
for member in self.members:
if member.name == set_parameter_name:
self.set_parameters.append(member)
found = True
break
if not found:
raise Exception(f'Set parameter {set_parameter_name} not found in member list')
for key in entry.keys():
if key not in supported_serialize_entries:
print(
f"Unsupported key \"{key}\" in serializable class entry, key should be in set {str(supported_serialize_entries)}"
)
def inherit(self, base_class):
self.base_object = base_class
self.pointer_type = base_class.pointer_type
def get_deserialize_element(
self, entry: MemberVariable, *, base: Optional[str] = None, pointer_type: Optional[str] = None
):
property_name = entry.deserialize_property
property_id = entry.id
property_key = entry.name
property_type = replace_pointer(entry.type)
if not pointer_type:
pointer_type = self.pointer_type
property_name = property_name.replace('.', '_')
template = DESERIALIZE_ELEMENT_FORMAT
if base:
template = DESERIALIZE_ELEMENT_BASE_FORMAT.replace('{base_property}', base.replace('*', ''))
return get_deserialize_element_template(
template,
property_name,
property_key,
property_id,
property_type,
entry.has_default,
entry.default,
entry.status,
pointer_type,
)
def get_serialize_element(self, entry: MemberVariable):
property_name = entry.serialize_property
property_id = entry.id
property_key = entry.name
property_type = replace_pointer(entry.type)
default_value = entry.default
assignment = '.' if self.pointer_type == 'none' else '->'
default_argument = '' if default_value is None else f', {get_default_argument(default_value)}'
storage_version = lookup_serialization_version(entry.version)
conditional_serialization = storage_version != 1
template = SERIALIZE_ELEMENT_FORMAT
if entry.status != MemberVariableStatus.EXISTING and not conditional_serialization:
template = "\t/* [Deleted] ({property_type}) \"{property_name}\" */\n"
elif entry.has_default:
template = template.replace('WriteProperty', 'WritePropertyWithDefault')
serialization_code = template.format(
property_name=property_name,
property_type=property_type,
property_id=str(property_id),
property_key=property_key,
property_default=default_argument,
assignment=assignment,
)
if conditional_serialization:
code = []
if entry.status != MemberVariableStatus.EXISTING:
# conditional delete
code.append(f'\tif (!serializer.ShouldSerialize({storage_version})) {{')
else:
# conditional serialization
code.append(f'\tif (serializer.ShouldSerialize({storage_version})) {{')
code.append('\t' + serialization_code)
result = '\n'.join(code) + '\t}\n'
return result
return serialization_code
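# For a hypothetical member {"id": 200, "name": "children", "type": "ParsedExpression*"} whose
# version maps to the baseline (so no ShouldSerialize guard is added), this emits roughly:
#   serializer.WritePropertyWithDefault<unique_ptr<ParsedExpression>>(200, "children", children);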
def generate_constructor(self, constructor_parameters: List[str]):
parameters = ", ".join(constructor_parameters)
if self.constructor_method is not None:
return f'\tauto result = {self.constructor_method}({parameters});\n'
if self.pointer_type == 'none':
if parameters != '':
parameters = f'({parameters})'
return f'\t{self.return_class} result{parameters};\n'
return f'\tauto result = duckdb::{self.pointer_type}<{self.return_class}>(new {self.return_class}({parameters}));\n'
def generate_base_class_code(base_class: SerializableClass):
base_class_serialize = ''
base_class_deserialize = ''
# properties
enum_type = ''
for entry in base_class.members:
if entry.serialize_property == base_class.enum_value:
enum_type = entry.type
base_class_serialize += base_class.get_serialize_element(entry)
type_name = replace_pointer(entry.type)
base_class_deserialize += base_class.get_deserialize_element(entry)
expressions = [x for x in base_class.children.items()]
expressions = sorted(expressions, key=lambda x: x[0])
# set parameters
for entry in base_class.set_parameters:
base_class_deserialize += SET_DESERIALIZE_PARAMETER_FORMAT.format(
property_type=entry.type, property_name=entry.name
)
base_class_deserialize += f'\t{base_class.pointer_type}<{base_class.name}> result;\n'
switch_cases = ''
for expr in expressions:
enum_value = expr[0]
child_data = expr[1]
if child_data.custom_switch_code is not None:
switch_cases += SWITCH_HEADER_FORMAT.format(
enum_type=enum_type, enum_value=enum_value, class_deserialize=child_data.name
)
switch_cases += '\n'.join(
['\t\t' + x for x in child_data.custom_switch_code.replace('\\n', '\n').split('\n')]
)
switch_cases += '\n'
continue
switch_cases += SWITCH_STATEMENT_FORMAT.format(
enum_type=enum_type, enum_value=enum_value, class_deserialize=child_data.name
)
assign_entries = []
for entry in base_class.members:
skip = False
for check_entry in [entry.name, entry.serialize_property]:
if check_entry in base_class.set_parameter_names:
skip = True
if check_entry == base_class.enum_value:
skip = True
if skip:
continue
assign_entries.append(entry)
# class switch statement
base_class_deserialize += SWITCH_CODE_FORMAT.format(
switch_variable=base_class.enum_value, case_statements=switch_cases, base_class=base_class.name
)
deserialize_return = get_return_value(base_class.pointer_type, base_class.return_type)
for entry in base_class.set_parameters:
base_class_deserialize += UNSET_DESERIALIZE_PARAMETER_FORMAT.format(property_type=entry.type)
for entry in assign_entries:
if entry.status != MemberVariableStatus.EXISTING:
continue
move = False
if entry.type in MOVE_LIST or is_container(entry.type) or is_pointer(entry.type):
move = True
if move:
base_class_deserialize += (
f'\tresult->{entry.deserialize_property} = std::move({entry.deserialize_property});\n'
)
else:
base_class_deserialize += f'\tresult->{entry.deserialize_property} = {entry.deserialize_property};\n'
if base_class.finalize_deserialization is not None:
for line in base_class.finalize_deserialization:
base_class_deserialize += "\t" + line + "\n"
base_class_deserialize += generate_return(base_class)
base_class_generation = ''
serialization = ''
if base_class.base is not None:
serialization += BASE_SERIALIZE_FORMAT.format(base_class_name=base_class.base)
base_class_generation += SERIALIZE_BASE_FORMAT.format(
class_name=base_class.name, members=serialization + base_class_serialize
)
base_class_generation += DESERIALIZE_BASE_FORMAT.format(
deserialize_return=deserialize_return, class_name=base_class.name, members=base_class_deserialize
)
return base_class_generation
def generate_class_code(class_entry: SerializableClass):
if class_entry.custom_implementation:
return None
class_serialize = ''
class_deserialize = ''
constructor_parameters: List[str] = []
constructor_entries = set()
last_constructor_index = -1
if class_entry.constructor is not None:
for constructor_entry_ in class_entry.constructor:
if constructor_entry_.endswith('&'):
constructor_entry = constructor_entry_[:-1]
is_reference = True
else:
constructor_entry = constructor_entry_
is_reference = False
constructor_entries.add(constructor_entry)
found = False
for entry_idx, entry in enumerate(class_entry.members):
if entry.name == constructor_entry:
if entry_idx > last_constructor_index:
last_constructor_index = entry_idx
type_name = replace_pointer(entry.type)
entry.deserialize_property = entry.deserialize_property.replace('.', '_')
if requires_move(type_name) and not is_reference:
constructor_parameters.append(f'std::move({entry.deserialize_property})')
else:
constructor_parameters.append(entry.deserialize_property)
found = True
break
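# Constructor entries prefixed with '$' are pulled from deserializer state via Get<T>(), and
# '?'-prefixed ones via TryGet<T>(); types in REFERENCE_LIST are passed as references.
# A hypothetical constructor list using this: ["$ClientContext", "name"]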
if constructor_entry.startswith('$') or constructor_entry.startswith('?'):
is_optional = constructor_entry.startswith('?')
if is_optional:
param_type = constructor_entry.replace('?', '')
get_format = TRY_GET_DESERIALIZE_PARAMETER_FORMAT
else:
param_type = constructor_entry.replace('$', '')
get_format = GET_DESERIALIZE_PARAMETER_FORMAT
if param_type in REFERENCE_LIST:
param_type += ' &'
constructor_parameters.append(get_format.format(property_type=param_type))
found = True
if class_entry.base_object is not None:
for entry in class_entry.base_object.set_parameters:
if entry.name == constructor_entry:
constructor_parameters.append(GET_DESERIALIZE_PARAMETER_FORMAT.format(property_type=entry.type))
found = True
break
if not found:
print(f"Constructor member \"{constructor_entry}\" was not found in members list")
exit(1)
elif class_entry.constructor_method is not None:
for entry_idx, entry in enumerate(class_entry.members):
if entry_idx > last_constructor_index:
last_constructor_index = entry_idx
constructor_entries.add(entry.name)
type_name = replace_pointer(entry.type)
entry.deserialize_property = entry.deserialize_property.replace('.', '_')
if requires_move(type_name):
constructor_parameters.append(f'std::move({entry.deserialize_property})')
else:
constructor_parameters.append(entry.deserialize_property)
if class_entry.base is not None:
class_serialize += BASE_SERIALIZE_FORMAT.format(base_class_name=class_entry.base)
for entry_idx in range(last_constructor_index + 1):
entry = class_entry.members[entry_idx]
class_deserialize += class_entry.get_deserialize_element(entry, base=entry.base, pointer_type='unique_ptr')
class_deserialize += class_entry.generate_constructor(constructor_parameters)
if class_entry.members is None:
return None
for entry_idx, entry in enumerate(class_entry.members):
write_property_name = entry.serialize_property
deserialize_template_str = DESERIALIZE_ELEMENT_CLASS_FORMAT
if entry.base:
deserialize_template_str = DESERIALIZE_ELEMENT_CLASS_BASE_FORMAT.replace(
'{base_property}', entry.base.replace('*', '')
).replace('{derived_property}', entry.type.replace('*', ''))
class_serialize += class_entry.get_serialize_element(entry)
type_name = replace_pointer(entry.type)
if entry_idx > last_constructor_index:
class_deserialize += get_deserialize_element_template(
deserialize_template_str,
entry.deserialize_property,
entry.name,
entry.id,
type_name,
entry.has_default,
entry.default,
entry.status,
class_entry.pointer_type,
)
elif entry.name not in constructor_entries and entry.status == MemberVariableStatus.EXISTING:
class_deserialize += get_deserialize_assignment(
entry.deserialize_property, entry.type, class_entry.pointer_type
)
if entry.name in class_entry.set_parameter_names and entry.status == MemberVariableStatus.EXISTING:
class_deserialize += SET_DESERIALIZE_PARAMETER_FORMAT.format(
property_type=entry.type, property_name=entry.name
)
for entry in class_entry.set_parameters:
class_deserialize += UNSET_DESERIALIZE_PARAMETER_FORMAT.format(
property_type=entry.type, property_name=entry.name
)
if class_entry.finalize_deserialization is not None:
for line in class_entry.finalize_deserialization:
class_deserialize += "\t" + line + "\n"
class_deserialize += generate_return(class_entry)
deserialize_return = get_return_value(class_entry.pointer_type, class_entry.return_type)
class_generation = ''
pattern = re.compile(r'<\w+>')
templated_type = ''
# Check if is a templated class
is_templated = pattern.search(class_entry.name)
if is_templated:
templated_type = TEMPLATED_BASE_FORMAT.format(template_name=is_templated.group()[1:-1])
class_generation += templated_type + SERIALIZE_BASE_FORMAT.format(
class_name=class_entry.name, members=class_serialize
)
class_generation += templated_type + DESERIALIZE_BASE_FORMAT.format(
deserialize_return=deserialize_return,
class_name=class_entry.name,
members=class_deserialize,
)
return class_generation
def check_children_for_duplicate_members(node: SerializableClass, parents: list, seen_names: set, seen_ids: set):
# Check for duplicate names
if node.members is not None:
for member in node.members:
if member.name in seen_names:
# Print the inheritance tree
exit(
f"Error: Duplicate member name \"{member.name}\" in class \"{node.name}\" ({' -> '.join(map(lambda x: x.name, parents))} -> {node.name})"
)
seen_names.add(member.name)
if member.id in seen_ids:
exit(
f"Error: Duplicate member id \"{member.id}\" in class \"{node.name}\" ({' -> '.join(map(lambda x: x.name, parents))} -> {node.name})"
)
seen_ids.add(member.id)
# Recurse
for child in node.children.values():
check_children_for_duplicate_members(child, parents + [node], seen_names.copy(), seen_ids.copy())
file_list = get_file_list()
for entry in file_list:
source_path = entry['source']
target_path = entry['target']
with open(source_path, 'r') as f:
try:
json_data = json.load(f)
except Exception as e:
print(f"Failed to parse {source_path}: {str(e)}")
exit(1)
include_list = [
'duckdb/common/serializer/serializer.hpp',
'duckdb/common/serializer/deserializer.hpp',
]
base_classes: List[SerializableClass] = []
classes: List[SerializableClass] = []
base_class_data: Dict[str, SerializableClass] = {}
for entry in json_data:
if 'includes' in entry:
if type(entry['includes']) != type([]):
print(f"Include list must be a list, found {type(entry['includes'])} (in {str(entry)})")
exit(1)
for include_entry in entry['includes']:
if include_entry not in include_list:
include_list.append(include_entry)
new_class = SerializableClass(entry)
if new_class.is_base_class:
# this class is a base class itself - construct the base class list
if new_class.name in base_class_data:
raise Exception(f"Duplicate base class \"{new_class.name}\"")
base_class_data[new_class.name] = new_class
base_classes.append(new_class)
else:
classes.append(new_class)
if new_class.base is not None:
# this class inherits from a base class - add the enum value
if new_class.base not in base_class_data:
raise Exception(f"Unknown base class \"{new_class.base}\" for entry \"{new_class.name}\"")
base_class_object = base_class_data[new_class.base]
new_class.inherit(base_class_object)
for enum_entry in new_class.enum_entries:
if enum_entry in base_class_object.children:
raise Exception(f"Duplicate enum entry \"{enum_entry}\"")
base_class_object.children[enum_entry] = new_class
# Ensure that there are no duplicate names in the inheritance tree
for base_class in base_classes:
if base_class.base is None:
# Root base class, now traverse the children
check_children_for_duplicate_members(base_class, [], set(), set())
with open(target_path, 'w+') as f:
include_list = ''.join([INCLUDE_FORMAT.format(filename=x) for x in include_list])
header = HEADER.format(include_list=include_list)
f.write(header)
# generate the base class serialization
for base_class in base_classes:
base_class_generation = generate_base_class_code(base_class)
f.write(base_class_generation)
# generate the class serialization
classes = sorted(classes, key=lambda x: x.name)
for class_entry in classes:
class_generation = generate_class_code(class_entry)
if class_generation is None:
continue
f.write(class_generation)
f.write(FOOTER)

View File

@@ -0,0 +1,10 @@
from settings_scripts import parse_and_sort_json_file, update_header_file, update_scopes, update_src_code
from settings_scripts.config import SettingsList, make_format
if __name__ == '__main__':
parse_and_sort_json_file()
update_header_file()
update_scopes()
update_src_code()
make_format()
print(f"- Successfully parsed and included {len(SettingsList)} setting(s)!")

View File

@@ -0,0 +1,77 @@
import json
import os
scripts_dir = os.path.dirname(os.path.abspath(__file__))
VERSION_MAP_PATH = scripts_dir + "/../src/storage/version_map.json"
STORAGE_INFO_PATH = scripts_dir + "/../src/storage/storage_info.cpp"
START_MARKER = "// START OF {type} VERSION INFO"
END_MARKER = "// END OF {type} VERSION INFO"
def generate_version_info_array(storage_versions, type, name, default):
result = []
name_upper = name.upper()
if 'latest' in storage_versions:
latest_value = storage_versions['latest']
result.append(f"const uint64_t LATEST_{name_upper} = {latest_value};")
result.append(f"const uint64_t DEFAULT_{name_upper} = {default};")
result.append(f"static const {type} {name}[] = {{")
for version_name, storage_version in storage_versions.items():
result.append(f'\t{{"{version_name}", {storage_version}}},')
result.append("\t{nullptr, 0}")
result.append("};\n")
return "\n".join(result)
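# Example of the emitted block for type='StorageVersionInfo', name='storage_version_info'
# (version names and numbers are placeholders):
#   const uint64_t LATEST_STORAGE_VERSION_INFO = 65;
#   const uint64_t DEFAULT_STORAGE_VERSION_INFO = 64;
#   static const StorageVersionInfo storage_version_info[] = {
#       {"v1.0.0", 64},
#       {nullptr, 0}
#   };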
def main():
with open(VERSION_MAP_PATH, 'r') as json_file:
version_map = json.load(json_file)
with open(STORAGE_INFO_PATH, "r") as cpp_file:
content = cpp_file.read()
for key in version_map['serialization']['values'].keys():
if key in ['latest']:
continue
if key not in version_map['storage']['values'].keys():
print(f'Key {key} found in serialization version but not in storage version')
exit(1)
types = ['storage', 'serialization']
for type in version_map:
if type not in types:
print(f"Unexpected key {type}")
exit(1)
capitalized_type = type.capitalize()
upper_type = type.upper()
array_code = generate_version_info_array(
version_map[type]['values'],
f'{capitalized_type}VersionInfo',
f'{type}_version_info',
version_map[type]['default'],
)
start_marker = START_MARKER.format(type=upper_type)
start_index = content.find(start_marker)
if start_index == -1:
print(f"storage_info.cpp is corrupted, could not find the START_MARKER for {type}")
exit(1)
end_marker = END_MARKER.format(type=upper_type)
end_index = content.find(end_marker)
if end_index == -1:
print(f"storage_info.cpp is corrupted, could not find the END_MARKER for {type}")
exit(1)
content = content[: start_index + len(start_marker)] + "\n" + array_code + content[end_index:]
with open(STORAGE_INFO_PATH, "w") as cpp_file:
cpp_file.write(content)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,49 @@
# this script re-generates the storage used for storage_version.test_slow
# before running this script, increment the version number in src/storage/storage_info.cpp and recompile (`make`)
import os
import subprocess
from python_helpers import open_utf8
shell_proc = os.path.join('build', 'release', 'duckdb')
gen_storage_script = os.path.join('test', 'sql', 'storage_version', 'generate_storage_version.sql')
gen_storage_target = os.path.join('test', 'sql', 'storage_version', 'storage_version.db')
def try_remove_file(fname):
try:
os.remove(fname)
except:
pass
try_remove_file(gen_storage_target)
try_remove_file(gen_storage_target + '.wal')
def run_command_in_shell(cmd):
print(cmd)
res = subprocess.run(
[shell_proc, '--batch', '-init', '/dev/null', gen_storage_target],
capture_output=True,
input=bytearray(cmd, 'utf8'),
)
stdout = res.stdout.decode('utf8').strip()
stderr = res.stderr.decode('utf8').strip()
if res.returncode != 0:
print("Failed to create database file!")
print("----STDOUT----")
print(stdout)
print("----STDERR----")
print(stderr)
with open_utf8(gen_storage_script, 'r') as f:
cmd = f.read()
run_command_in_shell(cmd)
run_command_in_shell('select * from integral_values')
run_command_in_shell('select * from integral_values')
try_remove_file(gen_storage_target + '.wal')

View File

@@ -0,0 +1,137 @@
import psycopg2
import argparse
import os
import platform
import shutil
import sys
import subprocess
import multiprocessing.pool
parser = argparse.ArgumentParser(description='Generate TPC-DS reference results from Postgres.')
parser.add_argument(
'--sf', dest='sf', action='store', help='The TPC-DS scale factor reference results to generate', default=1
)
parser.add_argument(
'--query-dir',
dest='query_dir',
action='store',
help='The directory with queries to run',
default='extension/tpcds/dsdgen/queries',
)
parser.add_argument(
'--answer-dir',
dest='answer_dir',
action='store',
help='The directory where to store the answers',
default='extension/tpcds/dsdgen/answers/sf${SF}',
)
parser.add_argument(
'--duckdb-path',
dest='duckdb_path',
action='store',
help='The path to the DuckDB executable',
default='build/reldebug/duckdb',
)
parser.add_argument(
'--skip-load',
dest='skip_load',
action='store_const',
const=True,
help='Whether or not to skip loading',
default=False,
)
parser.add_argument(
'--query-list', dest='query_list', action='store', help='The list of queries to run (default = all)', default=''
)
parser.add_argument('--nthreads', dest='nthreads', action='store', type=int, help='The number of threads', default=0)
args = parser.parse_args()
con = psycopg2.connect(database='postgres')
c = con.cursor()
if not args.skip_load:
tpcds_dir = f'tpcds_sf{args.sf}'
q = f"""
CALL dsdgen(sf={args.sf});
EXPORT DATABASE '{tpcds_dir}' (DELIMITER '|');
"""
proc = subprocess.Popen([args.duckdb_path, "-c", q])
proc.wait()
if proc.returncode != 0:
exit(1)
# drop the previous tables
tables = [
'name',
'web_site',
'web_sales',
'web_returns',
'web_page',
'warehouse',
'time_dim',
'store_sales',
'store_returns',
'store',
'ship_mode',
'reason',
'promotion',
'item',
'inventory',
'income_band',
'household_demographics',
'date_dim',
'customer_demographics',
'customer_address',
'customer',
'catalog_sales',
'catalog_returns',
'catalog_page',
'call_center',
]
for table in tables:
c.execute(f'DROP TABLE IF EXISTS {table};')
with open(os.path.join(tpcds_dir, 'schema.sql'), 'r') as f:
schema = f.read()
c.execute(schema)
with open(os.path.join(tpcds_dir, 'load.sql'), 'r') as f:
load = f.read()
load = load.replace(f'{tpcds_dir}/', f'{os.getcwd()}/{tpcds_dir}/')
c.execute(load)
con.commit()
# get a list of all queries
queries = os.listdir(args.query_dir)
queries.sort()
answer_dir = args.answer_dir.replace('${SF}', str(args.sf))
if len(args.query_list) > 0:
passing_queries = [x + '.sql' for x in args.query_list.split(',')]
queries = [x for x in queries if x in passing_queries]
queries.sort()
def run_query(q):
print(q)
with open(os.path.join(args.query_dir, q), 'r') as f:
sql_query = f.read()
answer_path = os.path.join(os.getcwd(), answer_dir, q.replace('.sql', '.csv'))
c.execute(f'DROP TABLE IF EXISTS "query_result{q}"')
c.execute(f'CREATE TABLE "query_result{q}" AS ' + sql_query)
c.execute(f"COPY \"query_result{q}\" TO '{answer_path}' (FORMAT CSV, DELIMITER '|', HEADER, NULL 'NULL')")
if args.nthreads == 0:
for q in queries:
run_query(q)
else:
pool = multiprocessing.pool.ThreadPool(processes=args.nthreads)
pool.map(run_query, queries)

View File

@@ -0,0 +1,116 @@
import os
import subprocess
duckdb_program = '/Users/myth/Programs/duckdb-bugfix/build/release/duckdb'
struct_def = '''struct $STRUCT_NAME {
static constexpr char *Name = "$NAME";
static const char *Columns[];
static constexpr idx_t ColumnCount = $COLUMN_COUNT;
static const LogicalType Types[];
static constexpr idx_t PrimaryKeyCount = $PK_COLUMN_COUNT;
static const char *PrimaryKeyColumns[];
};
'''
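# For a hypothetical table named 'call_center' this template expands to roughly (counts are placeholders):
#   struct CallCenterInfo {
#       static constexpr char *Name = "call_center";
#       static const char *Columns[];
#       static constexpr idx_t ColumnCount = 31;
#       static const LogicalType Types[];
#       static constexpr idx_t PrimaryKeyCount = 1;
#       static const char *PrimaryKeyColumns[];
#   };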
initcode = '''
call dsdgen(sf=0);
.mode csv
.header 0
'''
column_count_query = '''
select count(*) from pragma_table_info('$NAME');
'''
pk_column_count_query = '''
select count(*) from pragma_table_info('$NAME') where pk=true;
'''
gen_names = '''
select concat('const char *', '$STRUCT_NAME', '::Columns[] = {', STRING_AGG('"' || name || '"', ', ') || '};') from pragma_table_info('$NAME');
'''
gen_types = '''
select concat('const LogicalType ', '$STRUCT_NAME', '::Types[] = {', STRING_AGG('LogicalType::' || type, ', ') || '};') from pragma_table_info('$NAME');
'''
pk_columns = '''
select concat('const char *', '$STRUCT_NAME', '::PrimaryKeyColumns[] = {', STRING_AGG('"' || name || '"', ', ') || '};') from pragma_table_info('$NAME') where pk=true;
'''
def run_query(sql):
input_sql = initcode + '\n' + sql
res = subprocess.run(duckdb_program, input=input_sql.encode('utf8'), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout = res.stdout.decode('utf8').strip()
stderr = res.stderr.decode('utf8').strip()
if res.returncode != 0:
print("FAILED TO RUN QUERY")
print(stderr)
exit(1)
return stdout
def prepare_query(sql, table_name, struct_name):
return sql.replace('$NAME', table_name).replace('$STRUCT_NAME', struct_name)
header = '''
#pragma once
#include "duckdb.hpp"
#ifndef DUCKDB_AMALGAMATION
#include "duckdb/common/exception.hpp"
#include "duckdb/common/types/date.hpp"
#include "duckdb/parser/column_definition.hpp"
#include "duckdb/storage/data_table.hpp"
#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
#include "duckdb/planner/parsed_data/bound_create_table_info.hpp"
#include "duckdb/parser/parsed_data/create_table_info.hpp"
#include "duckdb/parser/constraints/not_null_constraint.hpp"
#include "duckdb/catalog/catalog.hpp"
#include "duckdb/planner/binder.hpp"
#endif
namespace tpcds {
using duckdb::LogicalType;
using duckdb::idx_t;
'''
footer = '''
}
'''
print(header)
table_list = run_query('show tables')
for table_name in table_list.split('\n'):
table_name = table_name.strip()
print(
'''
//===--------------------------------------------------------------------===//
// $NAME
//===--------------------------------------------------------------------===//'''.replace(
'$NAME', table_name
)
)
struct_name = str(table_name.title().replace('_', '')) + 'Info'
column_count = int(run_query(prepare_query(column_count_query, table_name, struct_name)).strip())
pk_column_count = int(run_query(prepare_query(pk_column_count_query, table_name, struct_name)).strip())
print(
prepare_query(struct_def, table_name, struct_name)
.replace('$COLUMN_COUNT', str(column_count))
.replace('$PK_COLUMN_COUNT', str(pk_column_count))
)
print(run_query(prepare_query(gen_names, table_name, struct_name)).replace('""', '"').strip('"'))
print("")
print(run_query(prepare_query(gen_types, table_name, struct_name)).strip('"'))
print("")
print(run_query(prepare_query(pk_columns, table_name, struct_name)).replace('""', '"').strip('"'))
print(footer)

View File

@@ -0,0 +1,21 @@
supported_vector_sizes = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
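# The printed output is a chain of preprocessor branches, one per supported size, e.g. for size 2:
#   #if STANDARD_VECTOR_SIZE == 2
#   const sel_t FlatVector::incremental_vector[] = {0, 1};
#   #elif STANDARD_VECTOR_SIZE == 4
#   ...
#   #else
#   #error Unsupported VECTOR_SIZE!
#   #endif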
result = ""
for i in range(len(supported_vector_sizes)):
vsize = supported_vector_sizes[i]
if i == 0:
result += "#if"
else:
result += "#elif"
result += " STANDARD_VECTOR_SIZE == " + str(vsize) + "\n"
result += "const sel_t FlatVector::incremental_vector[] = {"
for idx in range(vsize):
if idx != 0:
result += ", "
result += str(idx)
result += "};\n"
result += """#else
#error Unsupported VECTOR_SIZE!
#endif"""
print(result)

327
external/duckdb/scripts/gentpcecode.py vendored Normal file
View File

@@ -0,0 +1,327 @@
import os
from python_helpers import open_utf8
GENERATED_HEADER = 'include/tpce_generated.hpp'
GENERATED_SOURCE = 'tpce_generated.cpp'
TPCE_DIR = os.path.join('third_party', 'tpce-tool')
GENERATED_HEADER = os.path.join(TPCE_DIR, GENERATED_HEADER)
GENERATED_SOURCE = os.path.join(TPCE_DIR, GENERATED_SOURCE)
current_table = None
tables = {}
print(GENERATED_HEADER)
print(GENERATED_SOURCE)
header = open_utf8(GENERATED_HEADER, 'w+')
source = open_utf8(GENERATED_SOURCE, 'w+')
for fp in [header, source]:
fp.write(
"""
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
// THIS FILE IS GENERATED BY gentpcecode.py, DO NOT EDIT MANUALLY //
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
"""
)
header.write(
"""
#include "duckdb/catalog/catalog.hpp"
#include "duckdb/main/appender.hpp"
#include "duckdb/main/connection.hpp"
#include "duckdb/main/database.hpp"
#include "main/BaseLoader.h"
#include "main/BaseLoaderFactory.h"
#include "main/NullLoader.h"
#include "main/TableRows.h"
namespace TPCE {
class DuckDBLoaderFactory : public CBaseLoaderFactory {
duckdb::Connection &con;
std::string schema;
std::string suffix;
public:
DuckDBLoaderFactory(duckdb::Connection &con, std::string schema,
std::string suffix)
: con(con), schema(schema), suffix(suffix) {
}
// Functions to create loader classes for individual tables.
virtual CBaseLoader<ACCOUNT_PERMISSION_ROW> *
CreateAccountPermissionLoader();
virtual CBaseLoader<ADDRESS_ROW> *CreateAddressLoader();
virtual CBaseLoader<BROKER_ROW> *CreateBrokerLoader();
virtual CBaseLoader<CASH_TRANSACTION_ROW> *
CreateCashTransactionLoader();
virtual CBaseLoader<CHARGE_ROW> *CreateChargeLoader();
virtual CBaseLoader<COMMISSION_RATE_ROW> *CreateCommissionRateLoader();
virtual CBaseLoader<COMPANY_COMPETITOR_ROW> *
CreateCompanyCompetitorLoader();
virtual CBaseLoader<COMPANY_ROW> *CreateCompanyLoader();
virtual CBaseLoader<CUSTOMER_ACCOUNT_ROW> *
CreateCustomerAccountLoader();
virtual CBaseLoader<CUSTOMER_ROW> *CreateCustomerLoader();
virtual CBaseLoader<CUSTOMER_TAXRATE_ROW> *
CreateCustomerTaxrateLoader();
virtual CBaseLoader<DAILY_MARKET_ROW> *CreateDailyMarketLoader();
virtual CBaseLoader<EXCHANGE_ROW> *CreateExchangeLoader();
virtual CBaseLoader<FINANCIAL_ROW> *CreateFinancialLoader();
virtual CBaseLoader<HOLDING_ROW> *CreateHoldingLoader();
virtual CBaseLoader<HOLDING_HISTORY_ROW> *CreateHoldingHistoryLoader();
virtual CBaseLoader<HOLDING_SUMMARY_ROW> *CreateHoldingSummaryLoader();
virtual CBaseLoader<INDUSTRY_ROW> *CreateIndustryLoader();
virtual CBaseLoader<LAST_TRADE_ROW> *CreateLastTradeLoader();
virtual CBaseLoader<NEWS_ITEM_ROW> *CreateNewsItemLoader();
virtual CBaseLoader<NEWS_XREF_ROW> *CreateNewsXRefLoader();
virtual CBaseLoader<SECTOR_ROW> *CreateSectorLoader();
virtual CBaseLoader<SECURITY_ROW> *CreateSecurityLoader();
virtual CBaseLoader<SETTLEMENT_ROW> *CreateSettlementLoader();
virtual CBaseLoader<STATUS_TYPE_ROW> *CreateStatusTypeLoader();
virtual CBaseLoader<TAX_RATE_ROW> *CreateTaxRateLoader();
virtual CBaseLoader<TRADE_HISTORY_ROW> *CreateTradeHistoryLoader();
virtual CBaseLoader<TRADE_ROW> *CreateTradeLoader();
virtual CBaseLoader<TRADE_REQUEST_ROW> *CreateTradeRequestLoader();
virtual CBaseLoader<TRADE_TYPE_ROW> *CreateTradeTypeLoader();
virtual CBaseLoader<WATCH_ITEM_ROW> *CreateWatchItemLoader();
virtual CBaseLoader<WATCH_LIST_ROW> *CreateWatchListLoader();
virtual CBaseLoader<ZIP_CODE_ROW> *CreateZipCodeLoader();
};
"""
)
source.write(
"""
#include "tpce_generated.hpp"
using namespace duckdb;
using namespace std;
namespace TPCE {
struct tpce_append_information {
tpce_append_information(Connection &con, string schema, string table) :
appender(con, schema, table) {}
Appender appender;
};
static void append_value(tpce_append_information &info, int32_t value) {
info.appender.Append<int32_t>(value);
}
static void append_bigint(tpce_append_information &info, int64_t value) {
info.appender.Append<int64_t>(value);
}
static void append_string(tpce_append_information &info, const char *value) {
info.appender.Append<Value>(Value(value));
}
static void append_double(tpce_append_information &info, double value) {
info.appender.Append<double>(value);
}
static void append_bool(tpce_append_information &info, bool value) {
info.appender.Append<bool>(value);
}
static void append_timestamp(tpce_append_information &info, CDateTime time) {
int32_t year = 0, month = 0, day = 0, hour = 0, minute = 0, second = 0, msec = 0;
time.GetYMDHMS(&year, &month, &day, &hour, &minute, &second, &msec);
info.appender.Append<Value>(Value::TIMESTAMP(year, month, day, hour, minute, second, msec * 1000));
}
void append_char(tpce_append_information &info, char value) {
char val[2];
val[0] = value;
val[1] = '\\0';
append_string(info, val);
}
template <typename T> class DuckDBBaseLoader : public CBaseLoader<T> {
protected:
tpce_append_information info;
public:
DuckDBBaseLoader(Connection &con, string schema, string table) :
info(con, schema, table) {
}
void FinishLoad() {
}
};
"""
)
with open(os.path.join(TPCE_DIR, 'include/main/TableRows.h'), 'r') as f:
for line in f:
line = line.strip()
if line.startswith('typedef struct '):
line = line.replace('typedef struct ', '')
current_table = line.split(' ')[0].replace('_ROW', ' ').replace('_', ' ').lower().strip()
tables[current_table] = []
elif line.startswith('}'):
current_table = None
elif current_table is not None:
# row
# get type
splits = line.strip().split(' ')
if len(splits) < 2:
continue
line = splits[0]
name = splits[1].split(';')[0].split('[')[0].lower()
is_single_char = False
if 'TIdent' in line or 'INT64' in line or 'TTrade' in line:
tpe = "TypeId::BIGINT"
sqltpe = "BIGINT"
elif 'double' in line or 'float' in line:
tpe = "TypeId::DECIMAL"
sqltpe = "DECIMAL"
elif 'int' in line:
tpe = "TypeId::INTEGER"
sqltpe = "INTEGER"
elif 'CDateTime' in line:
tpe = "TypeId::TIMESTAMP"
sqltpe = "TIMESTAMP"
elif 'bool' in line:
tpe = 'TypeId::BOOLEAN'
sqltpe = "BOOLEAN"
elif 'char' in line:
if '[' not in splits[1]:
is_single_char = True
tpe = "TypeId::VARCHAR"
sqltpe = "VARCHAR"
else:
continue
tables[current_table].append([name, tpe, is_single_char, sqltpe])
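# Example: a TableRows.h field such as "TIdent CA_AD_ID;" (hypothetical) would be recorded as
# ['ca_ad_id', 'TypeId::BIGINT', False, 'BIGINT'] under the enclosing *_ROW table.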
def get_tablename(name):
name = name.title().replace(' ', '')
if name == 'NewsXref':
return 'NewsXRef'
return name
for table in tables.keys():
source.write(
"""
class DuckDB${TABLENAME}Load : public DuckDBBaseLoader<${ROW_TYPE}> {
public:
DuckDB${TABLENAME}Load(Connection &con, string schema, string table) :
DuckDBBaseLoader(con, schema, table) {
}
void WriteNextRecord(const ${ROW_TYPE} &next_record) {
info.appender.BeginRow();""".replace(
"${TABLENAME}", get_tablename(table)
).replace(
"${ROW_TYPE}", table.upper().replace(' ', '_') + '_ROW'
)
)
source.write("\n")
collist = tables[table]
for i in range(len(collist)):
entry = collist[i]
name = entry[0].upper()
tpe = entry[1]
if tpe == "TypeId::BIGINT":
funcname = "bigint"
elif tpe == "TypeId::DECIMAL":
funcname = "double"
elif tpe == "TypeId::INTEGER":
funcname = "value"
elif tpe == "TypeId::TIMESTAMP":
funcname = "timestamp"
elif tpe == 'TypeId::BOOLEAN':
funcname = "bool"
elif tpe == "TypeId::VARCHAR":
if entry[2]:
funcname = "char"
else:
funcname = "string"
else:
print("Unknown type " + tpe)
exit(1)
source.write("\t\tappend_%s(info, next_record.%s);" % (funcname, name))
if i != len(collist) - 1:
source.write("\n")
source.write(
"""
info.appender.EndRow();
}
};"""
)
for table in tables.keys():
source.write(
"""
CBaseLoader<${ROW_TYPE}> *
DuckDBLoaderFactory::Create${TABLENAME}Loader() {
return new DuckDB${TABLENAME}Load(con, schema, "${TABLEINDB}" + suffix);
}
""".replace(
"${TABLENAME}", get_tablename(table)
)
.replace("${ROW_TYPE}", table.upper().replace(' ', '_') + '_ROW')
.replace("${TABLEINDB}", table.replace(' ', '_'))
)
source.write("\n")
# static string RegionSchema(string schema, string suffix) {
# return "CREATE TABLE " + schema + ".region" + suffix + " ("
# "r_regionkey INT NOT NULL,"
# "r_name VARCHAR(25) NOT NULL,"
# "r_comment VARCHAR(152) NOT NULL);";
# }
for table in tables.keys():
tname = table.replace(' ', '_')
str = 'static string ' + table.title().replace(' ', '') + 'Schema(string schema, string suffix) {\n'
str += '\treturn "CREATE TABLE " + schema + ".%s" + suffix + " ("\n' % (tname,)
columns = tables[table]
for i in range(len(columns)):
column = columns[i]
str += '\t "' + column[0] + " " + column[3]
if i == len(columns) - 1:
str += ')";'
else:
str += ',"'
str += "\n"
str += "}\n\n"
source.write(str)
func = 'void CreateTPCESchema(duckdb::DuckDB &db, duckdb::Connection &con, std::string &schema, std::string &suffix)'
header.write(func + ';\n\n')
source.write(func + ' {\n')
# con.Query(RegionSchema(schema, suffix));
for table in tables.keys():
tname = table.replace(' ', '_')
source.write('\tcon.Query(%sSchema(schema, suffix));\n' % (table.title().replace(' ', '')))
source.write('}\n\n')
for fp in [header, source]:
fp.write("} /* namespace TPCE */\n")
fp.close()

View File

@@ -0,0 +1,61 @@
import argparse
import sys
import subprocess
import re
import os
DEFAULT_UNITTEST_PATH = 'build/release/test/unittest'
parser = argparse.ArgumentParser(description='Print a list of tests to run.')
parser.add_argument(
'--file-contains',
dest='file_contains',
action='store',
help='Filter based on a string contained in the text',
default=None,
)
parser.add_argument(
'--unittest',
dest='unittest',
action='store',
help='The path to the unittest program',
default=DEFAULT_UNITTEST_PATH,
)
parser.add_argument('--list', dest='filter', action='store', help='The unittest filter to apply', default='')
args = parser.parse_args()
file_contains = args.file_contains
extra_args = [args.filter]
unittest_program = args.unittest
# Override default for windows
if os.name == 'nt' and unittest_program == DEFAULT_UNITTEST_PATH:
unittest_program = 'build/release/test/Release/unittest.exe'
proc = subprocess.Popen([unittest_program, '-l'] + extra_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout = proc.stdout.read().decode('utf8')
stderr = proc.stderr.read().decode('utf8')
if proc.returncode is not None and proc.returncode != 0:
print("Failed to run program " + unittest_program)
print(proc.returncode)
print(stdout)
print(stderr)
exit(1)
test_cases = []
for line in stdout.splitlines()[1:]:
if not line.strip():
continue
splits = line.rsplit('\t', 1)
if file_contains is not None:
if not os.path.isfile(splits[0]):
continue
try:
with open(splits[0], 'r') as f:
text = f.read()
except UnicodeDecodeError:
continue
if file_contains not in text:
continue
print(splits[0])

View File

@@ -0,0 +1,78 @@
import amalgamation
import os
import re
import sys
import shutil
from python_helpers import open_utf8
include_counts = {}
include_chains = {}
cached_includes = {}
def analyze_include_file(fpath, already_included_files, prev_include=""):
if fpath in already_included_files:
return
if fpath in amalgamation.always_excluded:
return
if fpath not in cached_includes:
# print(fpath)
with open_utf8(fpath, 'r') as f:
text = f.read()
(statements, includes) = amalgamation.get_includes(fpath, text)
cached_includes[fpath] = includes
else:
includes = cached_includes[fpath]
if fpath in include_counts:
include_counts[fpath] += 1
else:
include_counts[fpath] = 1
if fpath not in include_chains:
include_chains[fpath] = {}
if prev_include not in include_chains[fpath]:
include_chains[fpath][prev_include] = 0
include_chains[fpath][prev_include] += 1
already_included_files.append(fpath)
if fpath.endswith('.h') or fpath.endswith('.hpp'):
prev_include = fpath
for include in includes:
analyze_include_file(include, already_included_files, prev_include)
def analyze_includes(dir):
files = os.listdir(dir)
files.sort()
for fname in files:
if fname in amalgamation.excluded_files:
continue
fpath = os.path.join(dir, fname)
if os.path.isdir(fpath):
analyze_includes(fpath)
elif fname.endswith('.cpp') or fname.endswith('.c') or fname.endswith('.cc'):
analyze_include_file(fpath, [])
for compile_dir in amalgamation.compile_directories:
analyze_includes(compile_dir)
kws = []
for entry in include_counts.keys():
kws.append([entry, include_counts[entry]])
kws.sort(key=lambda tup: -tup[1])
for k in range(0, len(kws)):
include_file = kws[k][0]
include_count = kws[k][1]
print("------------------------------------------------------------")
print(include_file + " (" + str(include_count) + ")")
print("------------------------------------------------------------")
print("FILE INCLUDED FROM:")
chainkws = []
for chain in include_chains[include_file]:
chainkws.append([chain, include_chains[include_file][chain]])
chainkws.sort(key=lambda tup: -tup[1])
for l in range(0, min(5, len(chainkws))):
print(chainkws[l])

21
external/duckdb/scripts/install_node.sh vendored Executable file
View File

@@ -0,0 +1,21 @@
#!/usr/bin/env bash
if [[ ${1:-false} == 'false' ]]; then
echo "Error: pass node version as first argument"
exit 1
fi
NODE_VERSION=$1
# if an existing nvm is already installed we need to unload it
nvm unload || true
# here we set up the node version on the fly based on the matrix value.
# This is done manually so that the build works the same on OS X
rm -rf ./__nvm/ && git clone --depth 1 https://github.com/creationix/nvm.git ./__nvm
source ./__nvm/nvm.sh
nvm install ${NODE_VERSION}
nvm use --delete-prefix ${NODE_VERSION}
node --version
npm --version
which node

View File

@@ -0,0 +1,33 @@
import argparse
import requests
parser = argparse.ArgumentParser(description='Generate the list of packages provided by the registry at <baseline>.')
parser.add_argument(
'--baseline',
action='store',
help='The baseline (git commit) of the vcpkg-duckdb-ports',
required=True,
)
args = parser.parse_args()
GITHUB_API = "https://api.github.com/repos/duckdb/vcpkg-duckdb-ports/git/trees"
def main():
# Get the tree recursively for the commit
response = requests.get(f"{GITHUB_API}/{args.baseline}?recursive=1")
response.raise_for_status()
# Extract package names from ports directory
packages = set()
for item in response.json()['tree']:
path = item['path']
if path.startswith('ports/'):
parts = path.split('/')
if len(parts) > 2:
packages.add(parts[1])
print(sorted(list(packages)))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,97 @@
import json
import os
import sys
# Pass vcpkg.json files to this script to merge their dependencies into a single vcpkg.json with a
# combined, deduplicated dependency list. Note that this script is deliberately simple; some manual
# merging may still be required to combine extensions from multiple builds when dependencies collide.
# Also note: because the httpfs extension currently cannot use the latest openssl version (3.1), we
# need to pin the openssl version, which in turn requires pinning the vcpkg version here. When updating
# the vcpkg git hash, change it both here and in '.github/actions/build_extensions/action.yml'.
dependencies_str = []
dependencies_dict = []
merged_overlay_ports = []
merged_overlay_triplets = []
def prefix_overlay_ports_or_triples(overlay_dir, path_to_vcpkg_json):
def prefix_overlay_port_or_triplet(overlay_port_or_triplet):
vcpkg_prefix_path = path_to_vcpkg_json[0 : path_to_vcpkg_json.find("/vcpkg.json")]
if len(vcpkg_prefix_path) == 0:
return overlay_port_or_triplet
return vcpkg_prefix_path + '/' + overlay_port_or_triplet
return map(prefix_overlay_port_or_triplet, overlay_dir)
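# e.g. an overlay entry 'vcpkg_ports' declared in 'extension/httpfs/vcpkg.json' (hypothetical path)
# is rewritten to 'extension/httpfs/vcpkg_ports'.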
for file in sys.argv[1:]:
f = open(file)
data = json.load(f)
if 'dependencies' in data:
for dep in data['dependencies']:
if type(dep) is str:
dependencies_str.append(dep)
elif type(dep) is dict:
dependencies_dict.append(dep)
else:
raise Exception(f"Unknown entry type found in dependencies: '{dep}'")
if 'vcpkg-configuration' in data:
if 'overlay-ports' in data['vcpkg-configuration']:
merged_overlay_ports += prefix_overlay_ports_or_triples(data['vcpkg-configuration']['overlay-ports'], file)
if 'overlay-triplets' in data['vcpkg-configuration']:
merged_overlay_triplets += prefix_overlay_ports_or_triples(
data['vcpkg-configuration']['overlay-triplets'], file
)
final_deduplicated_deps = list()
dedup_set = set()
for dep in dependencies_dict:
if dep['name'] not in dedup_set:
final_deduplicated_deps.append(dep)
# TODO: deduplication is disabled for now, just let vcpkg handle duplicates in deps
# dedup_set.add(dep['name'])
for dep in dependencies_str:
if dep not in dedup_set:
final_deduplicated_deps.append(dep)
# TODO: deduplication is disabled for now, just let vcpkg handle duplicates in deps
# dedup_set.add(dep)
opensslVersion = os.getenv("OPENSSL_VERSION_OVERRIDE", "3.0.8")
data = {
"description": f"Auto-generated vcpkg.json for combined DuckDB extension build, generated by 'scripts/merge_vcpkg_deps.py'",
"builtin-baseline": "ce613c41372b23b1f51333815feb3edd87ef8a8b",
"dependencies": final_deduplicated_deps,
"overrides": [{"name": "openssl", "version": opensslVersion}],
}
data['vcpkg-configuration'] = {}
if merged_overlay_ports:
data['vcpkg-configuration']['overlay-ports'] = merged_overlay_ports
if merged_overlay_triplets:
data['vcpkg-configuration']['overlay-triplets'] = merged_overlay_triplets
REGISTRY_BASELINE = '869bddccca976e0abe25894356e7f49e77765169'
# NOTE: use 'scripts/list_vcpkg_registry_packages.py --baseline <baseline>' to generate the list of packages
data['vcpkg-configuration']['registries'] = [
{
"kind": "git",
"repository": "https://github.com/duckdb/vcpkg-duckdb-ports",
"baseline": REGISTRY_BASELINE,
"packages": ['avro-c', 'vcpkg-cmake'],
}
]
# Print output
print("Writing to 'build/extension_configuration/vcpkg.json': ")
print(data["dependencies"])
with open('build/extension_configuration/vcpkg.json', 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)

View File

@@ -0,0 +1,75 @@
# This script is used by CI to modify the deployment matrix for the extension distribution
import argparse
import json
import sys
import logging
# Define command-line arguments
parser = argparse.ArgumentParser(description="Filter a JSON file based on excluded duckdb_arch values and select an OS")
parser.add_argument("--input", required=True, help="Input JSON file path")
parser.add_argument("--exclude", required=True, help="Semicolon-separated list of excluded duckdb_arch values")
parser.add_argument("--output", help="Output JSON file path")
parser.add_argument("--pretty", action="store_true", help="Pretty print the output JSON")
parser.add_argument("--select_os", help="Select an OS to include in the output JSON")
parser.add_argument("--deploy_matrix", action="store_true", help="Create a merged list used in deploy step")
args = parser.parse_args()
# Parse the input file path, excluded arch values, and output file path
input_json_file_path = args.input
excluded_arch_values = args.exclude.split(";")
output_json_file_path = args.output
select_os = args.select_os
# Read the input JSON file
with open(input_json_file_path, "r") as json_file:
data = json.load(json_file)
# Function to filter entries based on duckdb_arch values
def filter_entries(data, arch_values):
for os, config in data.items():
if "include" in config:
config["include"] = [entry for entry in config["include"] if entry["duckdb_arch"] not in arch_values]
if not config["include"]:
del config["include"]
return data
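# Example (illustrative values): with --exclude "wasm_mvp;wasm_eh", any entry whose duckdb_arch is
# 'wasm_mvp' or 'wasm_eh' is dropped from every OS's "include" list, and an "include" key that
# becomes empty is removed entirely.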
# Filter the JSON data
filtered_data = filter_entries(data, excluded_arch_values)
# Select an OS if specified
if select_os:
found = False
for os in filtered_data.keys():
if os == select_os:
filtered_data = filtered_data[os]
found = True
break
if not found:
logging.warning('A selection OS was provided but not found')
filtered_data = []
# When deploy_matrix is specified, we only output a single merged include list with all the duckdb_archs
elif args.deploy_matrix:
deploy_archs = []
for os, config in filtered_data.items():
if "include" in config:
for item in config["include"]:
deploy_archs.append({"duckdb_arch": item["duckdb_arch"]})
filtered_data = {"include": deploy_archs}
# Determine the JSON formatting
indent = 2 if args.pretty else None
# If no output file is provided, print to stdout
if output_json_file_path:
with open(output_json_file_path, "w") as output_json_file:
if filtered_data:
json.dump(filtered_data, output_json_file, indent=indent)
else:
json.dump(filtered_data, sys.stdout, indent=indent)

BIN
external/duckdb/scripts/null.txt vendored Normal file

Binary file not shown.

View File

@@ -0,0 +1,15 @@
# create variables
export CERTIFICATE_PATH=$RUNNER_TEMP/build_certificate.p12
export KEYCHAIN_PATH=$RUNNER_TEMP/app-signing.keychain-db
# import certificate and provisioning profile from secrets
echo -n "$BUILD_CERTIFICATE_BASE64" | base64 --decode -o $CERTIFICATE_PATH
# create temporary keychain
security create-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH
security set-keychain-settings -lut 21600 $KEYCHAIN_PATH
security unlock-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH
# import certificate to keychain
security import $CERTIFICATE_PATH -P "$P12_PASSWORD" -A -t cert -f pkcs12 -k $KEYCHAIN_PATH
security list-keychain -d user -s $KEYCHAIN_PATH

417
external/duckdb/scripts/package_build.py vendored Normal file
View File

@@ -0,0 +1,417 @@
import os
import sys
import shutil
import subprocess
from python_helpers import open_utf8
import re
excluded_objects = ['utf8proc_data.cpp']
def third_party_includes():
includes = []
includes += [os.path.join('third_party', 'concurrentqueue')]
includes += [os.path.join('third_party', 'fast_float')]
includes += [os.path.join('third_party', 'fastpforlib')]
includes += [os.path.join('third_party', 'fmt', 'include')]
includes += [os.path.join('third_party', 'fsst')]
includes += [os.path.join('third_party', 'httplib')]
includes += [os.path.join('third_party', 'hyperloglog')]
includes += [os.path.join('third_party', 'jaro_winkler')]
includes += [os.path.join('third_party', 'jaro_winkler', 'details')]
includes += [os.path.join('third_party', 'libpg_query')]
includes += [os.path.join('third_party', 'libpg_query', 'include')]
includes += [os.path.join('third_party', 'lz4')]
includes += [os.path.join('third_party', 'brotli', 'include')]
includes += [os.path.join('third_party', 'brotli', 'common')]
includes += [os.path.join('third_party', 'brotli', 'dec')]
includes += [os.path.join('third_party', 'brotli', 'enc')]
includes += [os.path.join('third_party', 'mbedtls', 'include')]
includes += [os.path.join('third_party', 'mbedtls', 'library')]
includes += [os.path.join('third_party', 'miniz')]
includes += [os.path.join('third_party', 'pcg')]
includes += [os.path.join('third_party', 'pdqsort')]
includes += [os.path.join('third_party', 're2')]
includes += [os.path.join('third_party', 'ska_sort')]
includes += [os.path.join('third_party', 'skiplist')]
includes += [os.path.join('third_party', 'tdigest')]
includes += [os.path.join('third_party', 'utf8proc')]
includes += [os.path.join('third_party', 'utf8proc', 'include')]
includes += [os.path.join('third_party', 'vergesort')]
includes += [os.path.join('third_party', 'yyjson', 'include')]
includes += [os.path.join('third_party', 'zstd', 'include')]
return includes
def third_party_sources():
sources = []
sources += [os.path.join('third_party', 'fmt')]
sources += [os.path.join('third_party', 'fsst')]
sources += [os.path.join('third_party', 'miniz')]
sources += [os.path.join('third_party', 're2')]
sources += [os.path.join('third_party', 'hyperloglog')]
sources += [os.path.join('third_party', 'skiplist')]
sources += [os.path.join('third_party', 'fastpforlib')]
sources += [os.path.join('third_party', 'utf8proc')]
sources += [os.path.join('third_party', 'libpg_query')]
sources += [os.path.join('third_party', 'mbedtls')]
sources += [os.path.join('third_party', 'yyjson')]
sources += [os.path.join('third_party', 'zstd')]
return sources
def file_is_lib(fname, libname):
libextensions = ['.a', '.lib']
libprefixes = ['', 'lib']
for ext in libextensions:
for prefix in libprefixes:
potential_libname = prefix + libname + ext
if fname == potential_libname:
return True
return False
def get_libraries(binary_dir, libraries, extensions):
result_libs = []
def find_library_recursive(search_dir, libname):
flist = os.listdir(search_dir)
for fname in flist:
fpath = os.path.join(search_dir, fname)
if os.path.isdir(fpath):
entry = find_library_recursive(fpath, libname)
if entry != None:
return entry
elif os.path.isfile(fpath) and file_is_lib(fname, libname):
return search_dir
return None
def find_library(search_dir, libname, result_libs, required=False):
if libname == 'Threads::Threads':
result_libs += [(None, 'pthread')]
return
libdir = find_library_recursive(binary_dir, libname)
if libdir is None and required:
raise Exception(f"Failed to locate required library {libname} in {binary_dir}")
result_libs += [(libdir, libname)]
duckdb_lib_name = 'duckdb_static'
if os.name == 'nt':
duckdb_lib_name = 'duckdb'
find_library(os.path.join(binary_dir, 'src'), duckdb_lib_name, result_libs, True)
for ext in extensions:
find_library(os.path.join(binary_dir, 'extension', ext), ext + '_extension', result_libs, True)
for libname in libraries:
find_library(binary_dir, libname, result_libs)
return result_libs
def includes(extensions):
scripts_dir = os.path.dirname(os.path.abspath(__file__))
# add includes for duckdb and extensions
includes = []
includes.append(os.path.join(scripts_dir, '..', 'src', 'include'))
includes.append(os.path.join(scripts_dir, '..'))
includes.append(os.path.join(scripts_dir, '..', 'third_party', 'utf8proc', 'include'))
for ext in extensions:
includes.append(os.path.join(scripts_dir, '..', 'extension', ext, 'include'))
return includes
def include_flags(extensions):
return ' ' + ' '.join(['-I' + x for x in includes(extensions)])
def convert_backslashes(x):
return '/'.join(x.split(os.path.sep))
def get_relative_path(source_dir, target_file):
source_dir = convert_backslashes(source_dir)
target_file = convert_backslashes(target_file)
# absolute path: try to convert
if source_dir in target_file:
target_file = target_file.replace(source_dir, "").lstrip('/')
return target_file
######
# MAIN_BRANCH_VERSIONING default should be 'True' for main branch and feature branches
# MAIN_BRANCH_VERSIONING default should be 'False' for release branches
# MAIN_BRANCH_VERSIONING's default value needs to be kept in sync between:
# - CMakeLists.txt
# - scripts/amalgamation.py
# - scripts/package_build.py
######
MAIN_BRANCH_VERSIONING = True
if os.getenv('MAIN_BRANCH_VERSIONING') == "0":
MAIN_BRANCH_VERSIONING = False
if os.getenv('MAIN_BRANCH_VERSIONING') == "1":
MAIN_BRANCH_VERSIONING = True
def get_git_describe():
override_git_describe = os.getenv('OVERRIDE_GIT_DESCRIBE') or ''
versioning_tag_match = 'v*.*.*'
if MAIN_BRANCH_VERSIONING:
versioning_tag_match = 'v*.*.0'
    # override_git_describe is empty, either because the env variable was an empty string or not set at all
    # -> ask git instead (this can fail, hence the try/except)
if len(override_git_describe) == 0:
try:
return (
subprocess.check_output(
['git', 'describe', '--tags', '--long', '--debug', '--match', versioning_tag_match]
)
.strip()
.decode('utf8')
)
except subprocess.CalledProcessError:
return "v0.0.0-0-gdeadbeeff"
if len(override_git_describe.split('-')) == 3:
return override_git_describe
if len(override_git_describe.split('-')) == 1:
override_git_describe += "-0"
assert len(override_git_describe.split('-')) == 2
try:
return (
override_git_describe
+ "-g"
+ subprocess.check_output(['git', 'log', '-1', '--format=%h']).strip().decode('utf8')
)
except subprocess.CalledProcessError:
return override_git_describe + "-g" + "deadbeeff"
def git_commit_hash():
if 'SETUPTOOLS_SCM_PRETEND_HASH' in os.environ:
return os.environ['SETUPTOOLS_SCM_PRETEND_HASH']
try:
git_describe = get_git_describe()
hash = git_describe.split('-')[2].lstrip('g')
return hash
except:
return "deadbeeff"
def prefix_version(version):
"""Make sure the version is prefixed with 'v' to be of the form vX.Y.Z"""
if version.startswith('v'):
return version
return 'v' + version
def git_dev_version():
if 'SETUPTOOLS_SCM_PRETEND_VERSION' in os.environ:
return prefix_version(os.environ['SETUPTOOLS_SCM_PRETEND_VERSION'])
try:
long_version = get_git_describe()
version_splits = long_version.split('-')[0].lstrip('v').split('.')
dev_version = long_version.split('-')[1]
if int(dev_version) == 0:
# directly on a tag: emit the regular version
return "v" + '.'.join(version_splits)
else:
# not on a tag: increment the version by one and add a -devX suffix
            # this needs to be kept in sync with changes to CMakeLists.txt
            if MAIN_BRANCH_VERSIONING:
# increment minor version
version_splits[1] = str(int(version_splits[1]) + 1)
else:
# increment patch version
version_splits[2] = str(int(version_splits[2]) + 1)
return "v" + '.'.join(version_splits) + "-dev" + dev_version
except:
return "v0.0.0"
def include_package(pkg_name, pkg_dir, include_files, include_list, source_list):
import amalgamation
original_path = sys.path
# append the directory
sys.path.append(pkg_dir)
ext_pkg = __import__(pkg_name + '_config')
ext_include_dirs = ext_pkg.include_directories
ext_source_files = ext_pkg.source_files
include_files += amalgamation.list_includes_files(ext_include_dirs)
include_list += ext_include_dirs
source_list += ext_source_files
sys.path = original_path
def build_package(target_dir, extensions, linenumbers=False, unity_count=32, folder_name='duckdb', short_paths=False):
if not os.path.isdir(target_dir):
os.mkdir(target_dir)
scripts_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(scripts_dir)
import amalgamation
prev_wd = os.getcwd()
os.chdir(os.path.join(scripts_dir, '..'))
# obtain the list of source files from the amalgamation
source_list = amalgamation.list_sources()
include_list = amalgamation.list_include_dirs()
include_files = amalgamation.list_includes()
def copy_file(src, target_dir):
# get the path
full_path = src.split(os.path.sep)
current_path = target_dir
for i in range(len(full_path) - 1):
current_path = os.path.join(current_path, full_path[i])
if not os.path.isdir(current_path):
os.mkdir(current_path)
target_name = full_path[-1]
target_file = os.path.join(current_path, target_name)
amalgamation.copy_if_different(src, target_file)
# include the main extension helper
include_files += [os.path.join('src', 'include', 'duckdb', 'main', 'extension_helper.hpp')]
# include the separate extensions
for ext in extensions:
ext_path = os.path.join(scripts_dir, '..', 'extension', ext)
include_package(ext, ext_path, include_files, include_list, source_list)
for src in source_list:
copy_file(src, target_dir)
for inc in include_files:
copy_file(inc, target_dir)
# handle pragma_version.cpp: paste #define DUCKDB_SOURCE_ID and DUCKDB_VERSION there
curdir = os.getcwd()
os.chdir(os.path.join(scripts_dir, '..'))
githash = git_commit_hash()
dev_version = git_dev_version()
dev_v_parts = dev_version.lstrip('v').split('.')
os.chdir(curdir)
# open the file and read the current contents
fpath = os.path.join(target_dir, 'src', 'function', 'table', 'version', 'pragma_version.cpp')
with open_utf8(fpath, 'r') as f:
text = f.read()
# now add the DUCKDB_SOURCE_ID define, if it is not there already
found_hash = False
found_dev = False
found_major = False
found_minor = False
found_patch = False
lines = text.split('\n')
for i in range(len(lines)):
if '#define DUCKDB_SOURCE_ID ' in lines[i]:
lines[i] = '#define DUCKDB_SOURCE_ID "{}"'.format(githash)
found_hash = True
if '#define DUCKDB_VERSION ' in lines[i]:
lines[i] = '#define DUCKDB_VERSION "{}"'.format(dev_version)
found_dev = True
if '#define DUCKDB_MAJOR_VERSION ' in lines[i]:
lines[i] = '#define DUCKDB_MAJOR_VERSION {}'.format(int(dev_v_parts[0]))
found_major = True
if '#define DUCKDB_MINOR_VERSION ' in lines[i]:
lines[i] = '#define DUCKDB_MINOR_VERSION {}'.format(int(dev_v_parts[1]))
found_minor = True
if '#define DUCKDB_PATCH_VERSION ' in lines[i]:
lines[i] = '#define DUCKDB_PATCH_VERSION "{}"'.format(dev_v_parts[2])
found_patch = True
if not found_hash:
lines = ['#ifndef DUCKDB_SOURCE_ID', '#define DUCKDB_SOURCE_ID "{}"'.format(githash), '#endif'] + lines
if not found_dev:
lines = ['#ifndef DUCKDB_VERSION', '#define DUCKDB_VERSION "{}"'.format(dev_version), '#endif'] + lines
if not found_major:
lines = [
'#ifndef DUCKDB_MAJOR_VERSION',
'#define DUCKDB_MAJOR_VERSION {}'.format(int(dev_v_parts[0])),
'#endif',
] + lines
if not found_minor:
lines = [
'#ifndef DUCKDB_MINOR_VERSION',
'#define DUCKDB_MINOR_VERSION {}'.format(int(dev_v_parts[1])),
'#endif',
] + lines
if not found_patch:
lines = [
'#ifndef DUCKDB_PATCH_VERSION',
'#define DUCKDB_PATCH_VERSION "{}"'.format(dev_v_parts[2]),
'#endif',
] + lines
text = '\n'.join(lines)
with open_utf8(fpath, 'w+') as f:
f.write(text)
def file_is_excluded(fname):
for entry in excluded_objects:
if entry in fname:
return True
return False
def generate_unity_build(entries, unity_name, linenumbers):
ub_file = os.path.join(target_dir, unity_name)
with open_utf8(ub_file, 'w+') as f:
for entry in entries:
if linenumbers:
f.write('#line 0 "{}"\n'.format(convert_backslashes(entry)))
f.write('#include "{}"\n\n'.format(convert_backslashes(entry)))
return ub_file
def generate_unity_builds(source_list, nsplits, linenumbers):
files_per_directory = {}
for source in source_list:
dirname = os.path.dirname(source)
if dirname not in files_per_directory:
files_per_directory[dirname] = []
files_per_directory[dirname].append(source)
new_source_files = []
for dirname in files_per_directory.keys():
current_files = files_per_directory[dirname]
cmake_file = os.path.join(dirname, 'CMakeLists.txt')
unity_build = False
if os.path.isfile(cmake_file):
with open(cmake_file, 'r') as f:
text = f.read()
if 'add_library_unity' in text:
unity_build = True
# re-order the files in the unity build so that they follow the same order as the CMake
scores = {}
filenames = [x[0] for x in re.findall('([a-zA-Z0-9_]+[.](cpp|cc|c|cxx))', text)]
score = 0
for filename in filenames:
scores[filename] = score
score += 1
current_files.sort(
key=lambda x: scores[os.path.basename(x)] if os.path.basename(x) in scores else 99999
)
if not unity_build:
if short_paths:
# replace source files with "__"
for file in current_files:
unity_filename = os.path.basename(file)
new_source_files.append(generate_unity_build([file], unity_filename, linenumbers))
else:
# directly use the source files
new_source_files += [os.path.join(folder_name, file) for file in current_files]
else:
unity_base = dirname.replace(os.path.sep, '_')
unity_name = f'ub_{unity_base}.cpp'
new_source_files.append(generate_unity_build(current_files, unity_name, linenumbers))
return new_source_files
original_sources = source_list
source_list = generate_unity_builds(source_list, unity_count, linenumbers)
os.chdir(prev_wd)
return (
[convert_backslashes(x) for x in source_list if not file_is_excluded(x)],
[convert_backslashes(x) for x in include_list],
[convert_backslashes(x) for x in original_sources],
)
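
To make the versioning logic above concrete, a minimal sketch of how git_dev_version() maps a 'git describe' string to a version, assuming MAIN_BRANCH_VERSIONING is True; the describe string is hypothetical:

describe = "v1.3.0-42-gabcdef123"  # hypothetical: tag, commits since tag, short hash
tag, ncommits, _ = describe.split('-')
parts = tag.lstrip('v').split('.')
if int(ncommits) == 0:
    version = 'v' + '.'.join(parts)  # exactly on a tag: plain vX.Y.Z
else:
    parts[1] = str(int(parts[1]) + 1)  # main-branch versioning bumps the minor version
    version = 'v' + '.'.join(parts) + '-dev' + ncommits
print(version)  # -> v1.4.0-dev42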

21
external/duckdb/scripts/parser_test.py vendored Normal file
View File

@@ -0,0 +1,21 @@
from sqllogictest import SQLParserException, SQLLogicParser, SQLLogicTest
from typing import Optional
import argparse
def main():
parser = argparse.ArgumentParser(description="SQL Logic Parser")
parser.add_argument("filename", type=str, help="Path to the SQL logic file")
args = parser.parse_args()
filename = args.filename
parser = SQLLogicParser()
out: Optional[SQLLogicTest] = parser.parse(filename)
if not out:
raise SQLParserException(f"Test {filename} could not be parsed")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,207 @@
import argparse
import glob
import json
import os
import subprocess
import sys
from tqdm import tqdm
OLD_DB_NAME = "old.duckdb"
NEW_DB_NAME = "new.duckdb"
PROFILE_FILENAME = "duckdb_profile.json"
ENABLE_PROFILING = "PRAGMA enable_profiling=json"
PROFILE_OUTPUT = f"PRAGMA profile_output='{PROFILE_FILENAME}'"
BANNER_SIZE = 52
def init_db(cli, dbname, benchmark_dir):
print(f"INITIALIZING {dbname} ...")
subprocess.run(
f"{cli} {dbname} < {benchmark_dir}/init/schema.sql", shell=True, check=True, stdout=subprocess.DEVNULL
)
subprocess.run(f"{cli} {dbname} < {benchmark_dir}/init/load.sql", shell=True, check=True, stdout=subprocess.DEVNULL)
print("INITIALIZATION DONE")
class PlanCost:
def __init__(self):
self.total = 0
self.build_side = 0
self.probe_side = 0
self.time = 0
def __add__(self, other):
self.total += other.total
self.build_side += other.build_side
self.probe_side += other.probe_side
return self
def __gt__(self, other):
if self == other or self.total < other.total:
return False
        # if the total intermediate cardinality is greater, also inspect the time.
        # it's possible a plan reordering increased cardinalities, but the overall execution time
        # was not greatly affected
total_card_increased = self.total > other.total
build_card_increased = self.build_side > other.build_side
if total_card_increased and build_card_increased:
return True
        # we know the total cardinality is either the same or higher, and the build side has not increased.
        # in this case, fall back to the timing: even if the probe side is higher, the plan may still
        # execute faster because those tuples are already in flight
return self.time > other.time * 1.03
def __lt__(self, other):
if self == other:
return False
return not (self > other)
def __eq__(self, other):
return self.total == other.total and self.build_side == other.build_side and self.probe_side == other.probe_side
def is_measured_join(op) -> bool:
if 'name' not in op:
return False
if op['name'] != 'HASH_JOIN':
return False
if 'Join Type' not in op['extra_info']:
return False
if op['extra_info']['Join Type'].startswith('MARK'):
return False
return True
def op_inspect(op) -> PlanCost:
cost = PlanCost()
if 'Query' in op:
cost.time = op['operator_timing']
if is_measured_join(op):
cost.total = op['operator_cardinality']
if 'operator_cardinality' in op['children'][0]:
cost.probe_side += op['children'][0]['operator_cardinality']
if 'operator_cardinality' in op['children'][1]:
cost.build_side += op['children'][1]['operator_cardinality']
left_cost = op_inspect(op['children'][0])
right_cost = op_inspect(op['children'][1])
cost.probe_side += left_cost.probe_side + right_cost.probe_side
cost.build_side += left_cost.build_side + right_cost.build_side
cost.total += left_cost.total + right_cost.total
return cost
for child_op in op['children']:
cost += op_inspect(child_op)
return cost
def query_plan_cost(cli, dbname, query):
try:
subprocess.run(
f"{cli} --readonly {dbname} -c \"{ENABLE_PROFILING};{PROFILE_OUTPUT};{query}\"",
shell=True,
check=True,
capture_output=True,
)
except subprocess.CalledProcessError as e:
print("-------------------------")
print("--------Failure----------")
print("-------------------------")
print(e.stderr.decode('utf8'))
print("-------------------------")
print("--------Output----------")
print("-------------------------")
print(e.output.decode('utf8'))
print("-------------------------")
raise e
with open(PROFILE_FILENAME, 'r') as file:
return op_inspect(json.load(file))
def print_banner(text):
text_len = len(text)
rest = BANNER_SIZE - text_len - 10
l_width = int(rest / 2)
r_width = l_width
if rest % 2 != 0:
l_width += 1
print("")
print("=" * BANNER_SIZE)
print("=" * l_width + " " * 5 + text + " " * 5 + "=" * r_width)
print("=" * BANNER_SIZE)
def print_diffs(diffs):
for query_name, old_cost, new_cost in diffs:
print("")
print("Query:", query_name)
print("Old total cost:", old_cost.total)
print("Old build cost:", old_cost.build_side)
print("Old probe cost:", old_cost.probe_side)
print("New total cost:", new_cost.total)
print("New build cost:", new_cost.build_side)
print("New probe cost:", new_cost.probe_side)
def main():
parser = argparse.ArgumentParser(description="Plan cost regression test script with old and new versions.")
parser.add_argument("--old", type=str, help="Path to the old runner.", required=True)
parser.add_argument("--new", type=str, help="Path to the new runner.", required=True)
parser.add_argument("--dir", type=str, help="Path to the benchmark directory.", required=True)
args = parser.parse_args()
old = args.old
new = args.new
benchmark_dir = args.dir
init_db(old, OLD_DB_NAME, benchmark_dir)
init_db(new, NEW_DB_NAME, benchmark_dir)
improvements = []
regressions = []
files = glob.glob(f"{benchmark_dir}/queries/*.sql")
files.sort()
print("")
print("RUNNING BENCHMARK QUERIES")
for f in tqdm(files):
query_name = f.split("/")[-1].replace(".sql", "")
with open(f, "r") as file:
query = file.read()
old_cost = query_plan_cost(old, OLD_DB_NAME, query)
new_cost = query_plan_cost(new, NEW_DB_NAME, query)
if old_cost > new_cost:
improvements.append((query_name, old_cost, new_cost))
elif new_cost > old_cost:
regressions.append((query_name, old_cost, new_cost))
exit_code = 0
if improvements:
print_banner("IMPROVEMENTS DETECTED")
print_diffs(improvements)
if regressions:
exit_code = 1
print_banner("REGRESSIONS DETECTED")
print_diffs(regressions)
if not improvements and not regressions:
print_banner("NO DIFFERENCES DETECTED")
os.remove(OLD_DB_NAME)
os.remove(NEW_DB_NAME)
os.remove(PROFILE_FILENAME)
exit(exit_code)
if __name__ == "__main__":
main()
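
A worked example of the comparison rule implemented by PlanCost.__gt__ above, with hypothetical cardinalities and timings:

old_total, old_build, old_time = 1000, 200, 0.50  # hypothetical old plan
new_total, new_build, new_time = 1500, 450, 0.52  # hypothetical new plan
if new_total < old_total:
    regression = False
elif new_total > old_total and new_build > old_build:
    regression = True  # both the total and build-side cardinality grew
else:
    regression = new_time > old_time * 1.03  # otherwise fall back to a 3% timing margin
print(regression)  # -> True for these numbers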

View File

@@ -0,0 +1,23 @@
def open_utf8(fpath, flags):
import sys
if sys.version_info[0] < 3:
return open(fpath, flags)
else:
return open(fpath, flags, encoding="utf8")
def normalize_path(path):
import os
def normalize(p):
return os.path.sep.join(p.split('/'))
if isinstance(path, list):
normed = map(lambda p: normalize(p), path)
return list(normed)
if isinstance(path, str):
return normalize(path)
raise Exception("Can only be called with a str or list argument")

View File

@@ -0,0 +1,17 @@
SET(CMAKE_SYSTEM_NAME Linux)
# Define our host system
SET(CMAKE_SYSTEM_NAME Linux)
SET(CMAKE_SYSTEM_VERSION 1)
# Define the cross compiler locations
SET(CMAKE_C_COMPILER ${DUCKDB_RPI_TOOLCHAIN_PREFIX}/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/arm-linux-gnueabihf-gcc)
SET(CMAKE_CXX_COMPILER ${DUCKDB_RPI_TOOLCHAIN_PREFIX}/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/arm-linux-gnueabihf-gcc)
# Define the sysroot path for the RaspberryPi distribution in our tools folder
SET(CMAKE_FIND_ROOT_PATH ${DUCKDB_RPI_TOOLCHAIN_PREFIX}/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/arm-linux-gnueabihf/sysroot/)
# Use our definitions for compiler tools
SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
# Search for libraries and headers in the target directories only
SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
SET(DUCKDB_EXTRA_LINK_FLAGS -lstdc++ -lgcc -lm)

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,193 @@
import subprocess
import statistics
from io import StringIO
import csv
from dataclasses import dataclass
import argparse
from typing import Optional, Union, Tuple, List
import functools
print = functools.partial(print, flush=True)
STDERR_HEADER = '''====================================================
============== STDERR =============
====================================================
'''
STDOUT_HEADER = '''====================================================
============== STDOUT =============
====================================================
'''
# timeouts in seconds
MAX_TIMEOUT = 3600
DEFAULT_TIMEOUT = 600
@dataclass
class BenchmarkRunnerConfig:
"Configuration for a BenchmarkRunner"
benchmark_runner: str
benchmark_file: str
verbose: bool = False
threads: Optional[int] = None
memory_limit: Optional[str] = None
disable_timeout: bool = False
max_timeout: int = MAX_TIMEOUT
root_dir: str = ""
no_summary: bool = False
@classmethod
def from_params(cls, benchmark_runner, benchmark_file, **kwargs) -> "BenchmarkRunnerConfig":
verbose = kwargs.get("verbose", False)
threads = kwargs.get("threads", None)
memory_limit = kwargs.get("memory_limit", None)
disable_timeout = kwargs.get("disable_timeout", False)
max_timeout = kwargs.get("max_timeout", MAX_TIMEOUT)
root_dir = kwargs.get("root_dir", "")
no_summary = kwargs.get("no_summary", False)
config = cls(
benchmark_runner=benchmark_runner,
benchmark_file=benchmark_file,
verbose=verbose,
threads=threads,
memory_limit=memory_limit,
disable_timeout=disable_timeout,
max_timeout=max_timeout,
root_dir=root_dir,
no_summary=no_summary,
)
return config
@classmethod
def from_args(cls) -> "BenchmarkRunnerConfig":
parser = argparse.ArgumentParser(description="Benchmark script with old and new runners.")
# Define the arguments
parser.add_argument("--path", type=str, help="Path to the benchmark_runner executable", required=True)
parser.add_argument("--benchmarks", type=str, help="Path to the benchmark file.", required=True)
parser.add_argument("--verbose", action="store_true", help="Enable verbose output.")
parser.add_argument("--threads", type=int, help="Number of threads to use.")
parser.add_argument("--memory_limit", type=str, help="Memory limit to use.")
parser.add_argument("--disable-timeout", action="store_true", help="Disable timeout.")
parser.add_argument(
"--max-timeout", type=int, default=3600, help="Set maximum timeout in seconds (default: 3600)."
)
parser.add_argument("--root-dir", type=str, default="", help="Root directory.")
parser.add_argument(
"--no-summary", type=str, default=False, help="No failures summary is outputed when passing this flag."
)
# Parse arguments
parsed_args = parser.parse_args()
# Create an instance of BenchmarkRunnerConfig using parsed arguments
config = cls(
benchmark_runner=parsed_args.path,
benchmark_file=parsed_args.benchmarks,
verbose=parsed_args.verbose,
threads=parsed_args.threads,
memory_limit=parsed_args.memory_limit,
disable_timeout=parsed_args.disable_timeout,
max_timeout=parsed_args.max_timeout,
root_dir=parsed_args.root_dir,
no_summary=parsed_args.no_summary,
)
return config
class BenchmarkRunner:
def __init__(self, config: BenchmarkRunnerConfig):
self.config = config
self.complete_timings = []
self.benchmark_list: List[str] = []
with open(self.config.benchmark_file, 'r') as f:
self.benchmark_list = [x.strip() for x in f.read().split('\n') if len(x) > 0]
def construct_args(self, benchmark_path):
benchmark_args = []
benchmark_args.extend([self.config.benchmark_runner, benchmark_path])
if self.config.root_dir:
benchmark_args.extend(['--root-dir', self.config.root_dir])
if self.config.threads:
benchmark_args.extend([f"--threads={self.config.threads}"])
if self.config.memory_limit:
benchmark_args.extend([f"--memory_limit={self.config.memory_limit}"])
if self.config.disable_timeout:
benchmark_args.extend(["--disable-timeout"])
if self.config.no_summary:
benchmark_args.extend(["--no-summary"])
return benchmark_args
def run_benchmark(self, benchmark) -> Tuple[Union[float, str], Optional[str]]:
benchmark_args = self.construct_args(benchmark)
timeout_seconds = DEFAULT_TIMEOUT
if self.config.disable_timeout:
timeout_seconds = self.config.max_timeout
try:
proc = subprocess.run(
benchmark_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout_seconds
)
out = proc.stdout.decode('utf8')
err = proc.stderr.decode('utf8')
returncode = proc.returncode
except subprocess.TimeoutExpired:
print("Failed to run benchmark " + benchmark)
print(f"Aborted due to exceeding the limit of {timeout_seconds} seconds")
return (
'Failed to run benchmark ' + benchmark,
f"Aborted due to exceeding the limit of {timeout_seconds} seconds",
)
if returncode != 0:
print("Failed to run benchmark " + benchmark)
print(STDERR_HEADER)
print(err)
print(STDOUT_HEADER)
print(out)
if 'HTTP' in err:
print("Ignoring HTTP error and terminating the running of the regression tests")
exit(0)
return 'Failed to run benchmark ' + benchmark, err
if self.config.verbose:
print(err)
# read the input CSV
f = StringIO(err)
csv_reader = csv.reader(f, delimiter='\t')
header = True
timings = []
try:
for row in csv_reader:
if len(row) == 0:
continue
if header:
header = False
else:
                    timings.append(float(row[2]))
                    self.complete_timings.append(float(row[2]))
return float(statistics.median(timings)), None
except:
print("Failed to run benchmark " + benchmark)
print(err)
return 'Failed to run benchmark ' + benchmark, err
def run_benchmarks(self, benchmark_list: List[str]):
results = {}
failures = {}
for benchmark in benchmark_list:
result, failure_message = self.run_benchmark(benchmark)
results[benchmark] = result
failures[benchmark] = failure_message if failure_message else None
return results, failures
def main():
config = BenchmarkRunnerConfig.from_args()
runner = BenchmarkRunner(config)
    runner.run_benchmarks(runner.benchmark_list)
if __name__ == "__main__":
main()
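
A minimal usage sketch, assuming this module is importable as 'benchmark' (as the regression runner later in this commit does); the runner path and benchmark list file are hypothetical:

from benchmark import BenchmarkRunner, BenchmarkRunnerConfig

config = BenchmarkRunnerConfig.from_params(
    "build/release/benchmark/benchmark_runner",  # hypothetical path to the benchmark_runner binary
    "regression_micro.csv",  # hypothetical benchmark list, one benchmark path per line
    threads=4,
)
runner = BenchmarkRunner(config)
results, failures = runner.run_benchmarks(runner.benchmark_list)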

View File

@@ -0,0 +1,227 @@
import os
import math
import functools
import shutil
from benchmark import BenchmarkRunner, BenchmarkRunnerConfig
from dataclasses import dataclass
from typing import Optional, List, Union
import subprocess
print = functools.partial(print, flush=True)
def is_number(s):
try:
float(s)
return True
except ValueError:
return False
# Geometric mean of an array of numbers
def geomean(xs):
if len(xs) == 0:
return 'EMPTY'
for entry in xs:
if not is_number(entry):
return entry
return math.exp(math.fsum(math.log(float(x)) for x in xs) / len(xs))
import argparse
# Set up the argument parser
parser = argparse.ArgumentParser(description="Benchmark script with old and new runners.")
# Define the arguments
parser.add_argument("--old", type=str, help="Path to the old runner.", required=True)
parser.add_argument("--new", type=str, help="Path to the new runner.", required=True)
parser.add_argument("--benchmarks", type=str, help="Path to the benchmark file.", required=True)
parser.add_argument("--verbose", action="store_true", help="Enable verbose output.")
parser.add_argument("--threads", type=int, help="Number of threads to use.")
parser.add_argument("--memory_limit", type=str, help="Memory limit to use.")
parser.add_argument("--nofail", action="store_true", help="Do not fail on regression.")
parser.add_argument("--disable-timeout", action="store_true", help="Disable timeout.")
parser.add_argument("--max-timeout", type=int, default=3600, help="Set maximum timeout in seconds (default: 3600).")
parser.add_argument("--root-dir", type=str, default="", help="Root directory.")
parser.add_argument("--no-summary", type=str, default=False, help="No summary in the end.")
parser.add_argument(
"--regression-threshold-seconds",
type=float,
default=0.05,
help="REGRESSION_THRESHOLD_SECONDS value for large benchmarks.",
)
# Parse the arguments
args = parser.parse_args()
# Assign parsed arguments to variables
old_runner_path = args.old
new_runner_path = args.new
benchmark_file = args.benchmarks
verbose = args.verbose
threads = args.threads
memory_limit = args.memory_limit
no_regression_fail = args.nofail
disable_timeout = args.disable_timeout
max_timeout = args.max_timeout
root_dir = args.root_dir
no_summary = args.no_summary
regression_threshold_seconds = args.regression_threshold_seconds
# how many times we will run the experiment, to be sure of the regression
NUMBER_REPETITIONS = 5
# the threshold at which we consider something a regression (percentage)
REGRESSION_THRESHOLD_PERCENTAGE = 0.1
# minimal seconds diff for something to be a regression (for very fast benchmarks)
REGRESSION_THRESHOLD_SECONDS = regression_threshold_seconds
if not os.path.isfile(old_runner_path):
print(f"Failed to find old runner {old_runner_path}")
exit(1)
if not os.path.isfile(new_runner_path):
print(f"Failed to find new runner {new_runner_path}")
exit(1)
config_dict = vars(args)
old_runner = BenchmarkRunner(BenchmarkRunnerConfig.from_params(old_runner_path, benchmark_file, **config_dict))
new_runner = BenchmarkRunner(BenchmarkRunnerConfig.from_params(new_runner_path, benchmark_file, **config_dict))
benchmark_list = old_runner.benchmark_list
summary = []
@dataclass
class BenchmarkResult:
benchmark: str
old_result: Union[float, str]
new_result: Union[float, str]
old_failure: Optional[str] = None
new_failure: Optional[str] = None
multiply_percentage = 1.0 + REGRESSION_THRESHOLD_PERCENTAGE
other_results: List[BenchmarkResult] = []
error_list: List[BenchmarkResult] = []
for i in range(NUMBER_REPETITIONS):
regression_list: List[BenchmarkResult] = []
if len(benchmark_list) == 0:
break
print(
f'''====================================================
============== ITERATION {i} =============
============== REMAINING {len(benchmark_list)} =============
====================================================
'''
)
old_results, old_failures = old_runner.run_benchmarks(benchmark_list)
new_results, new_failures = new_runner.run_benchmarks(benchmark_list)
for benchmark in benchmark_list:
old_res = old_results[benchmark]
new_res = new_results[benchmark]
old_fail = old_failures[benchmark]
new_fail = new_failures[benchmark]
if isinstance(old_res, str) or isinstance(new_res, str):
# benchmark failed to run - always a regression
error_list.append(BenchmarkResult(benchmark, old_res, new_res, old_fail, new_fail))
elif (no_regression_fail == False) and (
(old_res + REGRESSION_THRESHOLD_SECONDS) * multiply_percentage < new_res
):
regression_list.append(BenchmarkResult(benchmark, old_res, new_res))
else:
other_results.append(BenchmarkResult(benchmark, old_res, new_res))
benchmark_list = [res.benchmark for res in regression_list]
exit_code = 0
regression_list.extend(error_list)
summary = []
if len(regression_list) > 0:
exit_code = 1
print(
'''====================================================
============== REGRESSIONS DETECTED =============
====================================================
'''
)
for regression in regression_list:
print(f"{regression.benchmark}")
print(f"Old timing: {regression.old_result}")
print(f"New timing: {regression.new_result}")
if regression.old_failure or regression.new_failure:
new_data = {
"benchmark": regression.benchmark,
"old_failure": regression.old_failure,
"new_failure": regression.new_failure,
}
summary.append(new_data)
print("")
print(
'''====================================================
============== OTHER TIMINGS =============
====================================================
'''
)
else:
print(
'''====================================================
============== NO REGRESSIONS DETECTED =============
====================================================
'''
)
other_results.sort(key=lambda x: x.benchmark)
for res in other_results:
print(f"{res.benchmark}")
print(f"Old timing: {res.old_result}")
print(f"New timing: {res.new_result}")
print("")
time_a = geomean(old_runner.complete_timings)
time_b = geomean(new_runner.complete_timings)
print("")
if isinstance(time_a, str) or isinstance(time_b, str):
print(f"Old: {time_a}")
print(f"New: {time_b}")
elif time_a > time_b * 1.01:
print(f"Old timing geometric mean: {time_a}")
print(f"New timing geometric mean: {time_b}, roughly {int((time_a - time_b) * 100.0 / time_a)}% faster")
elif time_b > time_a * 1.01:
print(f"Old timing geometric mean: {time_a}, roughly {int((time_b - time_a) * 100.0 / time_b)}% faster")
print(f"New timing geometric mean: {time_b}")
else:
print(f"Old timing geometric mean: {time_a}")
print(f"New timing geometric mean: {time_b}")
# nuke cached benchmark data between runs
if os.path.isdir("duckdb_benchmark_data"):
shutil.rmtree('duckdb_benchmark_data')
if summary and not no_summary:
print(
'''\n\n====================================================
================ FAILURES SUMMARY ================
====================================================
'''
)
    # only add the GitHub Actions '::error::' prefix when CI is set to 'true'; otherwise it would clutter local run output
prefix = "::error::" if ('CI' in os.environ and os.getenv('CI') == 'true') else ""
for i, failure_message in enumerate(summary, start=1):
prefix_str = f"{prefix}{i}" if len(prefix) > 0 else f"{i}"
print(f"{prefix_str}: ", failure_message["benchmark"])
if failure_message["old_failure"] != failure_message["new_failure"]:
print("Old:\n", failure_message["old_failure"])
print("New:\n", failure_message["new_failure"])
else:
print(failure_message["old_failure"])
print("-", 52)
exit(exit_code)
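
A worked example of the regression test applied above, using hypothetical timings and the default thresholds:

REGRESSION_THRESHOLD_SECONDS = 0.05  # default, see --regression-threshold-seconds
REGRESSION_THRESHOLD_PERCENTAGE = 0.1
old_res, new_res = 1.00, 1.20  # hypothetical median timings in seconds
threshold = (old_res + REGRESSION_THRESHOLD_SECONDS) * (1.0 + REGRESSION_THRESHOLD_PERCENTAGE)
print(threshold)  # 1.155
print(new_res > threshold)  # True -> flagged, then re-measured on the next of the 5 iterations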

View File

@@ -0,0 +1,115 @@
import os
import sys
import duckdb
import numpy
import subprocess
from io import StringIO
import csv
import statistics
old_file = None
new_file = None
# the threshold at which we consider something a regression (percentage)
regression_threshold_percentage = 0.1
# minimal seconds diff for something to be a regression (for very fast benchmarks)
regression_threshold_seconds = 0.01
for arg in sys.argv:
if arg.startswith("--old="):
old_file = arg.replace("--old=", "")
elif arg.startswith("--new="):
new_file = arg.replace("--new=", "")
if old_file is None or new_file is None:
print("Usage: python scripts/regression_check.py --old=<old_file> --new-<new_file>")
exit(1)
con = duckdb.connect()
old_timings_l = con.execute(
f"SELECT name, median(time) FROM read_csv_auto('{old_file}') t(name, nrun, time) GROUP BY ALL ORDER BY ALL"
).fetchall()
new_timings_l = con.execute(
f"SELECT name, median(time) FROM read_csv_auto('{new_file}') t(name, nrun, time) GROUP BY ALL ORDER BY ALL"
).fetchall()
old_timings = {}
new_timings = {}
for entry in old_timings_l:
name = entry[0]
timing = entry[1]
old_timings[name] = timing
for entry in new_timings_l:
name = entry[0]
timing = entry[1]
new_timings[name] = timing
slow_keys = []
multiply_percentage = 1.0 + regression_threshold_percentage
test_keys = list(new_timings.keys())
test_keys.sort()
for key in test_keys:
new_timing = new_timings[key]
old_timing = old_timings[key]
if (old_timing + regression_threshold_seconds) * multiply_percentage < new_timing:
slow_keys.append(key)
return_code = 0
if len(slow_keys) > 0:
print(
'''====================================================
============== REGRESSIONS DETECTED =============
====================================================
'''
)
return_code = 1
for key in slow_keys:
new_timing = new_timings[key]
old_timing = old_timings[key]
print(key)
print(f"Old timing: {old_timing}")
print(f"New timing: {new_timing}")
print("")
print(
'''====================================================
================== New Timings ==================
====================================================
'''
)
with open(new_file, 'r') as f:
print(f.read())
print(
'''====================================================
================== Old Timings ==================
====================================================
'''
)
with open(old_file, 'r') as f:
print(f.read())
else:
print(
'''====================================================
============== NO REGRESSIONS DETECTED =============
====================================================
'''
)
print(
'''====================================================
=================== ALL TIMINGS ===================
====================================================
'''
)
for key in test_keys:
new_timing = new_timings[key]
old_timing = old_timings[key]
print(key)
print(f"Old timing: {old_timing}")
print(f"New timing: {new_timing}")
print("")
exit(return_code)
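
A minimal sketch of the timing files this script consumes: three columns per line (benchmark name, run number, time in seconds) with no header. The delimiter is sniffed by read_csv_auto; the tab-separated layout below matches what the benchmark scripts elsewhere in this commit write, and the file name and numbers are hypothetical:

with open('old.csv', 'w') as f:
    f.write("benchmark/micro/q01.benchmark\t0\t0.42\n")
    f.write("benchmark/micro/q01.benchmark\t1\t0.40\n")
    f.write("benchmark/micro/q02.benchmark\t0\t1.10\n")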

View File

@@ -0,0 +1,80 @@
import os
import argparse
import subprocess
import tempfile
from pathlib import Path
# the threshold at which we consider something a regression (percentage)
regression_threshold_percentage = 0.20
parser = argparse.ArgumentParser(description='Check extension binary sizes for regressions between two builds.')
parser.add_argument(
'--old', dest='old_extension_dir', action='store', help='Path to the old extension dir', required=True
)
parser.add_argument(
'--new', dest='new_extension_dir', action='store', help='Path to the new extension dir', required=True
)
parser.add_argument(
'--expect',
dest='expected_extensions_raw',
action='store',
help='Comma separated list of expected extensions',
required=True,
)
args = parser.parse_args()
expected_extensions = args.expected_extensions_raw.split(',')
exit_code = 0
def parse_extensions(dir):
result = {}
for root, dirs, files in os.walk(dir):
for filename in files:
if filename.endswith(".duckdb_extension"):
result[Path(filename).stem] = os.path.join(root, filename)
# Check all expected extensions are there
for expected_extension in expected_extensions:
if expected_extension not in result.keys():
print(f"Did not find expected extension {expected_extension} in {dir}")
exit(1)
return result
old_extensions = parse_extensions(args.old_extension_dir)
new_extensions = parse_extensions(args.new_extension_dir)
matching_extensions = []
for extension in old_extensions.keys():
if extension in new_extensions:
matching_extensions.append(extension)
check_passed = True
error_message = ""
for extension in matching_extensions:
old_size = os.path.getsize(old_extensions[extension])
new_size = os.path.getsize(new_extensions[extension])
print(f" - checking '{extension}': old size={old_size}, new_size={new_size}")
if new_size / (old_size + 0.1) > (1.0 + regression_threshold_percentage):
check_passed = False
error_message += f" - Extension '{extension}' was bigger than expected {new_size}\n"
error_message += f" - old size: {old_size}\n"
error_message += f" - new size: {new_size}\n"
print()
if not check_passed:
print("Extension size regression check failed:\n")
print(error_message)
exit(1)
else:
print(f"All extensions passed the check!")

View File

@@ -0,0 +1,402 @@
import os
import sys
import duckdb
import pandas as pd
import pyarrow as pa
import time
import argparse
from typing import Dict, List, Any
import numpy as np
TPCH_QUERIES = []
res = duckdb.execute(
"""
select query from tpch_queries()
"""
).fetchall()
for x in res:
TPCH_QUERIES.append(x[0])
parser = argparse.ArgumentParser()
parser.add_argument("--verbose", action="store_true", help="Enable verbose mode", default=False)
parser.add_argument("--threads", type=int, help="Number of threads", default=None)
parser.add_argument("--nruns", type=int, help="Number of runs", default=10)
parser.add_argument("--out-file", type=str, help="Output file path", default=None)
parser.add_argument("--scale-factor", type=float, help="Set the scale factor TPCH is generated at", default=1.0)
args, unknown_args = parser.parse_known_args()
verbose = args.verbose
threads = args.threads
nruns = args.nruns
out_file = args.out_file
scale_factor = args.scale_factor
if unknown_args:
parser.error(f"Unrecognized parameter(s): {', '.join(unknown_args)}")
def print_msg(message: str):
if not verbose:
return
print(message)
def write_result(benchmark_name, nrun, t):
bench_result = f"{benchmark_name}\t{nrun}\t{t}"
if out_file is not None:
if not hasattr(write_result, 'file'):
write_result.file = open(out_file, 'w+')
write_result.file.write(bench_result)
write_result.file.write('\n')
else:
print_msg(bench_result)
def close_result():
if not hasattr(write_result, 'file'):
return
write_result.file.close()
class BenchmarkResult:
def __init__(self, name):
self.name = name
self.runs: List[float] = []
def add(self, duration: float):
self.runs.append(duration)
def write(self):
for i, run in enumerate(self.runs):
write_result(self.name, i, run)
class TPCHData:
TABLES = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"]
def __init__(self, scale_factor):
self.conn = duckdb.connect()
self.conn.execute(f'CALL dbgen(sf={scale_factor})')
def get_tables(self, convertor) -> Dict[str, Any]:
res = {}
for table in self.TABLES:
res[table] = convertor(self.conn, table)
return res
def load_lineitem(self, collector, benchmark_name) -> BenchmarkResult:
query = 'SELECT * FROM lineitem'
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
start = time.time()
rel = self.conn.sql(query)
res = collector(rel)
end = time.time()
duration = float(end - start)
del res
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
return result
class TPCHBenchmarker:
def __init__(self, name: str):
self.initialize_connection()
self.name = name
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def register_tables(self, tables: Dict[str, Any]):
for name, table in tables.items():
self.con.register(name, table)
def run_tpch(self, collector, benchmark_name) -> BenchmarkResult:
print_msg("")
print_msg(TPCH_QUERIES)
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
# Execute all queries
for i, query in enumerate(TPCH_QUERIES):
start = time.time()
rel = self.con.sql(query)
if rel:
res = collector(rel)
del res
else:
print_msg(f"Query '{query}' did not produce output")
end = time.time()
query_time = float(end - start)
print_msg(f"Q{str(i).ljust(len(str(nruns)), ' ')}: {query_time}")
duration += float(end - start)
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
return result
def test_tpch():
print_msg(f"Generating TPCH (sf={scale_factor})")
tpch = TPCHData(scale_factor)
## -------- Benchmark converting LineItem to different formats ---------
def fetch_native(rel: duckdb.DuckDBPyRelation):
return rel.fetchall()
def fetch_pandas(rel: duckdb.DuckDBPyRelation):
return rel.df()
def fetch_arrow(rel: duckdb.DuckDBPyRelation):
return rel.arrow()
COLLECTORS = {'native': fetch_native, 'pandas': fetch_pandas, 'arrow': fetch_arrow}
# For every collector, load lineitem 'nrun' times
for collector in COLLECTORS:
result: BenchmarkResult = tpch.load_lineitem(COLLECTORS[collector], collector + "_load_lineitem")
print_msg(result.name)
print_msg(collector)
result.write()
## ------- Benchmark running TPCH queries on top of different formats --------
def convert_pandas(conn: duckdb.DuckDBPyConnection, table_name: str):
return conn.execute(f"SELECT * FROM {table_name}").df()
def convert_arrow(conn: duckdb.DuckDBPyConnection, table_name: str):
df = convert_pandas(conn, table_name)
return pa.Table.from_pandas(df)
CONVERTORS = {'pandas': convert_pandas, 'arrow': convert_arrow}
# Convert TPCH data to the right format, then run TPCH queries on that data
for convertor in CONVERTORS:
tables = tpch.get_tables(CONVERTORS[convertor])
tester = TPCHBenchmarker(convertor)
tester.register_tables(tables)
collector = COLLECTORS[convertor]
result: BenchmarkResult = tester.run_tpch(collector, f"{convertor}tpch")
result.write()
def generate_string(seed: int):
output = ''
for _ in range(10):
output += chr(ord('A') + int(seed % 26))
seed /= 26
return output
class ArrowDictionary:
def __init__(self, unique_values):
self.size = unique_values
self.dict = [generate_string(x) for x in range(unique_values)]
class ArrowDictionaryBenchmark:
def __init__(self, unique_values, values, arrow_dict: ArrowDictionary):
assert unique_values <= arrow_dict.size
self.initialize_connection()
self.generate(unique_values, values, arrow_dict)
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def generate(self, unique_values, values, arrow_dict: ArrowDictionary):
self.input = []
self.expected = []
for x in range(values):
value = arrow_dict.dict[x % unique_values]
self.input.append(value)
self.expected.append((value,))
array = pa.array(
self.input,
type=pa.dictionary(pa.int64(), pa.string()),
)
self.table = pa.table([array], names=["x"])
def benchmark(self, benchmark_name) -> BenchmarkResult:
self.con.register('arrow_table', self.table)
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
start = time.time()
res = self.con.execute(
"""
select * from arrow_table
"""
).fetchall()
end = time.time()
duration = float(end - start)
assert self.expected == res
del res
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
return result
class SelectAndCallBenchmark:
def __init__(self):
"""
SELECT statements become QueryRelations, any other statement type becomes a MaterializedRelation.
We use SELECT and CALL here because their execution plans are identical
"""
self.initialize_connection()
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def benchmark(self, name, query) -> List[BenchmarkResult]:
results: List[BenchmarkResult] = []
methods = {'select': 'select * from ', 'call': 'call '}
for key, value in methods.items():
for rowcount in [2048, 50000, 2500000]:
result = BenchmarkResult(f'{key}_{name}_{rowcount}')
query_string = query.format(rows=rowcount)
query_string = value + query_string
rel = self.con.sql(query_string)
print_msg(rel.type)
for _ in range(nruns):
duration = 0.0
start = time.time()
rel.fetchall()
end = time.time()
duration = float(end - start)
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
results.append(result)
return results
class PandasDFLoadBenchmark:
def __init__(self):
self.initialize_connection()
self.generate()
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def generate(self):
self.con.execute("call dbgen(sf=0.1)")
new_table = "*, " + ", ".join(["l_shipdate"] * 300)
self.con.execute(f"create table wide as select {new_table} from lineitem limit 500")
self.con.execute(f"copy wide to 'wide_table.csv' (FORMAT CSV)")
def benchmark(self, benchmark_name) -> BenchmarkResult:
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
pandas_df = pd.read_csv('wide_table.csv')
start = time.time()
for _ in range(30):
res = self.con.execute("""select * from pandas_df""").df()
end = time.time()
duration = float(end - start)
del res
result.add(duration)
return result
class PandasAnalyzerBenchmark:
def __init__(self):
self.initialize_connection()
self.generate()
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def generate(self):
return
def benchmark(self, benchmark_name) -> BenchmarkResult:
result = BenchmarkResult(benchmark_name)
data = [None] * 9999999 + [1] # Last element is 1, others are None
# Create the DataFrame with the specified data and column type as object
pandas_df = pd.DataFrame(data, columns=['Column'], dtype=object)
for _ in range(nruns):
duration = 0.0
start = time.time()
for _ in range(30):
res = self.con.execute("""select * from pandas_df""").df()
end = time.time()
duration = float(end - start)
del res
result.add(duration)
return result
def test_arrow_dictionaries_scan():
DICT_SIZE = 26 * 1000
print_msg(f"Generating a unique dictionary of size {DICT_SIZE}")
arrow_dict = ArrowDictionary(DICT_SIZE)
DATASET_SIZE = 10000000
for unique_values in [2, 1000, DICT_SIZE]:
test = ArrowDictionaryBenchmark(unique_values, DATASET_SIZE, arrow_dict)
benchmark_name = f"arrow_dict_unique_{unique_values}_total_{DATASET_SIZE}"
result = test.benchmark(benchmark_name)
result.write()
def test_loading_pandas_df_many_times():
test = PandasDFLoadBenchmark()
benchmark_name = f"load_pandas_df_many_times"
result = test.benchmark(benchmark_name)
result.write()
def test_pandas_analyze():
test = PandasAnalyzerBenchmark()
benchmark_name = f"pandas_analyze"
result = test.benchmark(benchmark_name)
result.write()
def test_call_and_select_statements():
test = SelectAndCallBenchmark()
queries = {
'repeat_row': "repeat_row(42, 'test', True, 'this is a long string', num_rows={rows})",
}
for key, value in queries.items():
results = test.benchmark(key, value)
for res in results:
res.write()
def main():
test_tpch()
test_arrow_dictionaries_scan()
test_loading_pandas_df_many_times()
test_pandas_analyze()
test_call_and_select_statements()
close_result()
if __name__ == '__main__':
main()
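
A minimal sketch of consuming the --out-file written by write_result above; the file name is hypothetical, and each line holds the benchmark name, run index, and duration, tab-separated:

import csv

with open('timings.tsv') as f:  # hypothetical --out-file path
    for name, nrun, seconds in csv.reader(f, delimiter='\t'):
        print(name, int(nrun), float(seconds))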

View File

@@ -0,0 +1,87 @@
import os
import argparse
import subprocess
import tempfile
# the threshold at which we consider something a regression (percentage)
regression_threshold_percentage = 0.05
parser = argparse.ArgumentParser(description='Check database file size regressions between two DuckDB shell builds.')
parser.add_argument('--old', dest='old_runner', action='store', help='Path to the old shell executable')
parser.add_argument('--new', dest='new_runner', action='store', help='Path to the new shell executable')
args = parser.parse_args()
old_runner = args.old_runner
new_runner = args.new_runner
exit_code = 0
if not os.path.isfile(old_runner):
print(f"Failed to find old runner {old_runner}")
exit(1)
if not os.path.isfile(new_runner):
print(f"Failed to find new runner {new_runner}")
exit(1)
def load_data(shell_path, load_script):
with tempfile.NamedTemporaryFile() as f:
filename = f.name
proc = subprocess.Popen(
[
shell_path,
'-storage_version',
'latest',
'-c',
"set storage_compatibility_version='latest'",
'-c',
load_script,
filename,
]
)
proc.wait()
if proc.returncode != 0:
print('----------------------------')
print('FAILED TO RUN')
print('----------------------------')
return None
return os.path.getsize(filename)
def run_benchmark(load_script, benchmark_name):
print('----------------------------')
print(f'Running benchmark {benchmark_name}')
print('----------------------------')
old_size = load_data(old_runner, load_script)
if old_size is None:
return False
new_size = load_data(new_runner, load_script)
if new_size is None:
return False
print(f'Database size with old runner: {old_size}')
print(f'Database size with new runner: {new_size}')
if new_size - new_size * regression_threshold_percentage > old_size:
print('----------------------------')
print('FAILURE: SIZE INCREASE')
print('----------------------------')
return False
else:
print('----------------------------')
print('SUCCESS!')
print('----------------------------')
return True
tpch_load = 'CALL dbgen(sf=1);'
tpcds_load = 'CALL dsdgen(sf=1);'
benchmarks = [[tpch_load, 'TPC-H SF1'], [tpcds_load, 'TPC-DS SF1']]
for benchmark in benchmarks:
if not run_benchmark(benchmark[0], benchmark[1]):
print(f'Database size increased in {benchmark[1]}')
exit_code = 1
exit(exit_code)
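
A worked example of the size rule above, with hypothetical database sizes and the 5% threshold:

regression_threshold_percentage = 0.05
old_size, new_size = 1_000_000_000, 1_080_000_000  # hypothetical database file sizes in bytes
print(new_size - new_size * regression_threshold_percentage > old_size)  # True: ~8% growth is flagged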

View File

@@ -0,0 +1,48 @@
library(tidyverse)
here <- rprojroot::is_git_root$find_file
# build/debug/test/unittest -d yes 2>&1 > timings.txt
timings <- readLines(here("timings.txt"))
timings
timings_df <- rematch2::re_match(timings, "^.*(?<time>[0-9][.][0-9][0-9][0-9]) s: (?<desc>.*)$")
cum_timings_df <-
timings_df %>%
filter(!is.na(time)) %>%
mutate(time = as.numeric(time)) %>%
count(desc, wt = time, name = "time") %>%
arrange(time) %>%
mutate(cum_time = cumsum(time), id = row_number())
cum_timings_df %>%
ggplot(aes(x = time, y = cum_time, color = id)) +
geom_line() +
scale_x_log10()
cum_timings_df %>%
ggplot(aes(x = id, y = cum_time, color = time)) +
geom_line() +
scale_colour_continuous(trans = "log10")
cum_timings_cut <-
cum_timings_df %>%
filter(cum_time >= 200, str_detect(desc, "[.]test$"))
slow <- cum_timings_cut$desc
slow_renamed <- paste0(slow, "_coverage")
slow_renamed[fs::file_exists(here(slow_renamed))]
stopifnot(!any(fs::file_exists(here(slow_renamed))))
withr::with_dir(
here(),
fs::file_move(slow, slow_renamed)
)
walk2(slow_renamed, slow, ~ {
text <- brio::read_lines(here(.x))
text <- str_replace_all(text, fixed(.y), .x)
brio::write_lines(text, here(.x))
})

View File

@@ -0,0 +1,20 @@
import os
import sys
import time
if len(sys.argv) <= 1:
print("Expected usage: python3 repeat_until_success.py [command]")
exit(1)
ntries = 10
sleep_duration = 3
cmd = sys.argv[1]
for i in range(ntries):
ret = os.system(cmd)
if ret is None or ret == 0:
exit(0)
print("Command {{ " + cmd + " }} failed, retrying (" + str(i + 1) + "/" + str(ntries) + ")")
time.sleep(sleep_duration)
exit(1)
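
A usage sketch, assuming a hypothetical flaky command; because the script executes only sys.argv[1], the whole command must be passed as a single quoted argument:

import os

# hypothetical example: retry a flaky download up to 10 times with 3-second pauses
os.system('python3 repeat_until_success.py "curl -sSfO https://example.com/artifact.zip"')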

View File

@@ -0,0 +1,62 @@
import subprocess
import duckdb
import os
import pandas as pd
import argparse
from io import StringIO
parser = argparse.ArgumentParser(description='Rerun failed workflows from a PR.')
parser.add_argument(
'--title',
dest='title',
action='store',
help='The title of the PR for which we want to rerun workflows (or part of the title)',
required=True,
)
parser.add_argument(
'--repo', dest='repo', action='store', help='The repository to run this workflow on', default='duckdb/duckdb'
)
parser.add_argument(
'--max_workflows',
dest='max_workflows',
action='store',
help='The maximum number of workflows to look at (starting from the latest)',
default=200,
)
args = parser.parse_args()
nlimit = args.max_workflows
query = args.title
proc = subprocess.Popen(
[
'gh',
'run',
'-R',
args.repo,
'list',
'--json',
'displayTitle,databaseId,status,conclusion,headSha',
f'--limit={nlimit}',
],
stdout=subprocess.PIPE,
)
text = proc.stdout.read().decode('utf8')
df = pd.read_json(StringIO(text))
result = duckdb.query(f"select headSha from df where displayTitle LIKE '%{query}%' limit 1").fetchall()
if len(result) == 0:
print(
f"No workflows found in the latest {nlimit} workflows that contain the text {query}.\nPerhaps try running with a higher --max_workflows parameter?"
)
exit(1)
headSha = result[0][0]
result = duckdb.query(
f"select databaseId from df where conclusion IN ('failure', 'cancelled') AND displayTitle LIKE '%{query}%' and headSha='{headSha}'"
).fetchall()
if len(result) == 0:
print(f"Found runs that match the text {query} but no failing or cancelled runs were found")
for databaseId in [x[0] for x in result]:
os.system(f'gh run -R {args.repo} rerun {databaseId}')
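
A usage sketch, assuming an authenticated GitHub CLI; the script file name is an assumption (the file header is not shown above), and the PR title is hypothetical:

import subprocess

subprocess.run(
    ['python', 'scripts/rerun_failed_workflows.py',  # assumed script name, adjust to the actual file
     '--title', 'Fix join order regression', '--repo', 'duckdb/duckdb'],
    check=True,
)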

View File

@@ -0,0 +1,347 @@
#!/usr/bin/env python
#
# ===- run-clang-tidy.py - Parallel clang-tidy runner ---------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===------------------------------------------------------------------------===#
# FIXME: Integrate with clang-tidy-diff.py
"""
Parallel clang-tidy runner
==========================
Runs clang-tidy over all files in a compilation database. Requires clang-tidy
and clang-apply-replacements in $PATH.
Example invocations.
- Run clang-tidy on all files in the current working directory with a default
set of checks and show warnings in the cpp files and all project headers.
run-clang-tidy.py $PWD
- Fix all header guards.
run-clang-tidy.py -fix -checks=-*,llvm-header-guard
- Fix all header guards included from clang-tidy and header guards
for clang-tidy headers.
run-clang-tidy.py -fix -checks=-*,llvm-header-guard extra/clang-tidy \
-header-filter=extra/clang-tidy
Compilation database setup:
http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html
"""
from __future__ import print_function
import argparse
import glob
import json
import multiprocessing
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
import traceback
try:
import yaml
except ImportError:
yaml = None
is_py2 = sys.version[0] == '2'
if is_py2:
import Queue as queue
else:
import queue as queue
def find_compilation_database(path):
"""Adjusts the directory until a compilation database is found."""
result = './'
while not os.path.isfile(os.path.join(result, path)):
if os.path.realpath(result) == '/':
print('Error: could not find compilation database.')
sys.exit(1)
result += '../'
return os.path.realpath(result)
def make_absolute(f, directory):
if os.path.isabs(f):
return f
return os.path.normpath(os.path.join(directory, f))
def get_tidy_invocation(
f, clang_tidy_binary, checks, tmpdir, build_path, header_filter, extra_arg, extra_arg_before, quiet, config
):
"""Gets a command line for clang-tidy."""
start = [clang_tidy_binary]
if header_filter is not None:
start.append('-header-filter=' + header_filter)
if checks:
start.append('-checks=' + checks)
if tmpdir is not None:
start.append('-export-fixes')
# Get a temporary file. We immediately close the handle so clang-tidy can
# overwrite it.
(handle, name) = tempfile.mkstemp(suffix='.yaml', dir=tmpdir)
os.close(handle)
start.append(name)
for arg in extra_arg:
start.append('-extra-arg=%s' % arg)
for arg in extra_arg_before:
start.append('-extra-arg-before=%s' % arg)
start.append('-p=' + build_path)
if quiet:
start.append('--quiet')
if config:
start.append('-config=' + config)
start.append(f)
return start
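# Illustrative result (assumed inputs, not from the original source): for f='src/foo.cpp',
# checks='-*,llvm-header-guard', build_path='build' and all other options left unset,
# the returned invocation is:
#   ['clang-tidy', '-checks=-*,llvm-header-guard', '-p=build', 'src/foo.cpp']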
def merge_replacement_files(tmpdir, mergefile):
"""Merge all replacement files in a directory into a single file"""
# The fixes suggested by clang-tidy >= 4.0.0 are given under
# the top level key 'Diagnostics' in the output yaml files
mergekey = "Diagnostics"
merged = []
for replacefile in glob.iglob(os.path.join(tmpdir, '*.yaml')):
content = yaml.safe_load(open(replacefile, 'r'))
if not content:
continue # Skip empty files.
merged.extend(content.get(mergekey, []))
if merged:
# MainSourceFile: The key is required by the definition inside
# include/clang/Tooling/ReplacementsYaml.h, but the value
# is actually never used inside clang-apply-replacements,
# so we set it to '' here.
output = {'MainSourceFile': '', mergekey: merged}
with open(mergefile, 'w') as out:
yaml.safe_dump(output, out)
else:
# Empty the file:
open(mergefile, 'w').close()
def check_clang_apply_replacements_binary(args):
"""Checks if invoking supplied clang-apply-replacements binary works."""
try:
subprocess.check_call([args.clang_apply_replacements_binary, '--version'])
except:
print(
'Unable to run clang-apply-replacements. Is clang-apply-replacements ' 'binary correctly specified?',
file=sys.stderr,
)
traceback.print_exc()
sys.exit(1)
def apply_fixes(args, tmpdir):
"""Calls clang-apply-fixes on a given directory."""
invocation = [args.clang_apply_replacements_binary]
if args.format:
invocation.append('-format')
if args.style:
invocation.append('-style=' + args.style)
invocation.append(tmpdir)
subprocess.call(invocation)
def run_tidy(args, tmpdir, build_path, queue, lock, failed_files):
"""Takes filenames out of queue and runs clang-tidy on them."""
while True:
name = queue.get()
invocation = get_tidy_invocation(
name,
args.clang_tidy_binary,
args.checks,
tmpdir,
build_path,
args.header_filter,
args.extra_arg,
args.extra_arg_before,
args.quiet,
args.config,
)
proc = subprocess.Popen(invocation, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, err = proc.communicate()
if proc.returncode != 0:
failed_files.append(name)
with lock:
sys.stdout.write(' '.join(invocation) + '\n' + output.decode('utf-8'))
if len(err) > 0:
sys.stdout.flush()
sys.stderr.write(err.decode('utf-8'))
queue.task_done()
def main():
parser = argparse.ArgumentParser(
description='Runs clang-tidy over all files '
'in a compilation database. Requires '
'clang-tidy and clang-apply-replacements in '
'$PATH.'
)
parser.add_argument('-clang-tidy-binary', metavar='PATH', default='clang-tidy', help='path to clang-tidy binary')
parser.add_argument(
'-clang-apply-replacements-binary',
metavar='PATH',
default='clang-apply-replacements',
help='path to clang-apply-replacements binary',
)
parser.add_argument('-checks', default=None, help='checks filter, when not specified, use clang-tidy ' 'default')
parser.add_argument(
'-config',
default=None,
help='Specifies a configuration in YAML/JSON format: '
' -config="{Checks: \'*\', '
' CheckOptions: [{key: x, '
' value: y}]}" '
'When the value is empty, clang-tidy will '
'attempt to find a file named .clang-tidy for '
'each source file in its parent directories.',
)
parser.add_argument(
'-header-filter',
default=None,
help='regular expression matching the names of the '
'headers to output diagnostics from. Diagnostics from '
'the main file of each translation unit are always '
'displayed.',
)
if yaml:
parser.add_argument(
'-export-fixes',
metavar='filename',
dest='export_fixes',
help='Create a yaml file to store suggested fixes in, '
'which can be applied with clang-apply-replacements.',
)
parser.add_argument('-j', type=int, default=0, help='number of tidy instances to be run in parallel.')
parser.add_argument('files', nargs='*', default=['.*'], help='files to be processed (regex on path)')
parser.add_argument('-fix', action='store_true', help='apply fix-its')
parser.add_argument('-format', action='store_true', help='Reformat code ' 'after applying fixes')
parser.add_argument('-style', default='file', help='The style of reformat ' 'code after applying fixes')
parser.add_argument('-p', dest='build_path', help='Path used to read a compile command database.')
parser.add_argument(
'-extra-arg',
dest='extra_arg',
action='append',
default=[],
help='Additional argument to append to the compiler ' 'command line.',
)
parser.add_argument(
'-extra-arg-before',
dest='extra_arg_before',
action='append',
default=[],
help='Additional argument to prepend to the compiler ' 'command line.',
)
parser.add_argument('-quiet', action='store_true', help='Run clang-tidy in quiet mode')
args = parser.parse_args()
db_path = 'compile_commands.json'
if args.build_path is not None:
build_path = args.build_path
else:
# Find our database
build_path = find_compilation_database(db_path)
try:
invocation = [args.clang_tidy_binary, '-list-checks']
invocation.append('-p=' + build_path)
if args.checks:
invocation.append('-checks=' + args.checks)
invocation.append('-')
if args.quiet:
# Even with -quiet we still want to check if we can call clang-tidy.
with open(os.devnull, 'w') as dev_null:
subprocess.check_call(invocation, stdout=dev_null)
else:
subprocess.check_call(invocation)
except:
print("Unable to run clang-tidy, consider running `pip install clang-tidy`", file=sys.stderr)
sys.exit(1)
# Load the database and extract all files.
database = json.load(open(os.path.join(build_path, db_path)))
files = [make_absolute(entry['file'], entry['directory']) for entry in database]
max_task = args.j
if max_task == 0:
max_task = multiprocessing.cpu_count()
tmpdir = None
if args.fix or (yaml and args.export_fixes):
check_clang_apply_replacements_binary(args)
tmpdir = tempfile.mkdtemp()
# Build up a big regexy filter from all command line arguments.
file_name_re = re.compile('|'.join(args.files))
return_code = 0
try:
# Spin up a bunch of tidy-launching threads.
task_queue = queue.Queue(max_task)
# List of files with a non-zero return code.
failed_files = []
lock = threading.Lock()
for _ in range(max_task):
t = threading.Thread(target=run_tidy, args=(args, tmpdir, build_path, task_queue, lock, failed_files))
t.daemon = True
t.start()
# Fill the queue with files.
for name in files:
if file_name_re.search(name):
task_queue.put(name)
# Wait for all threads to be done.
task_queue.join()
if len(failed_files):
return_code = 1
except KeyboardInterrupt:
# This is a sad hack. Unfortunately subprocess goes
# bonkers with ctrl-c and we start forking merrily.
print('\nCtrl-C detected, goodbye.')
if tmpdir:
shutil.rmtree(tmpdir)
os.kill(0, 9)
if yaml and args.export_fixes:
print('Writing fixes to ' + args.export_fixes + ' ...')
try:
merge_replacement_files(tmpdir, args.export_fixes)
except:
print('Error exporting fixes.\n', file=sys.stderr)
traceback.print_exc()
return_code = 1
if args.fix:
print('Applying fixes ...')
try:
apply_fixes(args, tmpdir)
except:
print('Error applying fixes.\n', file=sys.stderr)
traceback.print_exc()
return_code = 1
if tmpdir:
shutil.rmtree(tmpdir)
sys.exit(return_code)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,52 @@
import argparse
import os
import subprocess
import re
parser = argparse.ArgumentParser(description='Run a full benchmark using the CLI and report the results.')
parser.add_argument('--shell', action='store', help='Path to the CLI', default='build/reldebug/duckdb')
parser.add_argument('--database', action='store', help='Path to the database file to load data from')
parser.add_argument(
'--queries', action='store', help='Path to the list of queries to run (e.g. benchmark/clickbench/queries)'
)
parser.add_argument('--nrun', action='store', type=int, help='The number of runs', default=3)
args = parser.parse_args()
queries = os.listdir(args.queries)
queries.sort()
ran_queries = []
timings = []
for q in queries:
if 'load.sql' in q:
continue
command = [args.shell, args.database]
command += ['-c', '.timer on']
for i in range(args.nrun):
command += ['-c', '.read ' + os.path.join(args.queries, q)]
res = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout = res.stdout.decode('utf8').strip()
stderr = res.stderr.decode('utf8').strip()
    results = re.findall(r'Run Time \(s\): real (\d+\.\d+)', stdout)
if res.returncode != 0 or 'Error:\n' in stderr or len(results) != args.nrun:
print("------- Failed to run query -------")
print(q)
print("------- stdout -------")
print(stdout)
print("------- stderr -------")
print(stderr)
exit(1)
results = [float(x) for x in results]
print(f"Timings for {q}: " + str(results))
ran_queries.append(q)
timings.append(results[1])
print('')
sql_query = 'SELECT UNNEST(['
sql_query += ','.join(["'" + x + "'" for x in ran_queries]) + ']) as query'
sql_query += ","
sql_query += "UNNEST(["
sql_query += ','.join([str(x) for x in timings])
sql_query += "]) as timing;"
print(sql_query)
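# Illustrative output (hypothetical query names and timings): the script ends by printing
# a query such as
#   SELECT UNNEST(['q01.sql','q02.sql']) as query,UNNEST([0.123,0.456]) as timing;
# which can be pasted into a DuckDB shell to turn the collected timings into a result set.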

View File

@@ -0,0 +1,177 @@
#!/bin/bash
# Generates a set of directories used for testing the extension-update behaviour exercised by `test/extension/update_extensions_ci.test`
# Please consider your energy footprint by only running this script with ccache.
# note that subsequent runs use cached artifacts; use `make clean` or `rm -rf build/debug` to start from scratch
set -x
set -e
DUCKDB_BUILD_DIR="./build/debug"
TEST_DIR="./build/extension_metadata_test_data"
TEST_DIR_COPY="./build/extension_metadata_test_data_copy"
### Directories to use
# Used as the extension installation directory for DuckDB
export LOCAL_EXTENSION_DIR="$TEST_DIR/extension_dir"
# Repository for testing successfully updating extensions
export LOCAL_EXTENSION_REPO_UPDATED="$TEST_DIR/repository"
# Repository for testing incorrect platform
export LOCAL_EXTENSION_REPO_INCORRECT_PLATFORM="$TEST_DIR/repository_incorrect_platform"
# Repository for testing incorrect version
export LOCAL_EXTENSION_REPO_INCORRECT_DUCKDB_VERSION="$TEST_DIR/repository_incorrect_version"
# Repository where both platform and version mismatch
export LOCAL_EXTENSION_REPO_VERSION_AND_PLATFORM_INCORRECT="$TEST_DIR/repository_incorrect_version_and_platform"
# Directory containing the extensions for direct installing
export DIRECT_INSTALL_DIR="$TEST_DIR/direct_install"
# Extension dir with a malformed info file for an extension
export LOCAL_EXTENSION_DIR_MALFORMED_INFO="$TEST_DIR/extension_dir_malformed_info"
# Extension dir with a metadata install version that mismatches the files metadata
export LOCAL_EXTENSION_DIR_INFO_INCORRECT_VERSION="$TEST_DIR/extension_dir_malformed_info_incorrect_version"
if [ -d "$TEST_DIR_COPY" ]; then
# REUSE PREVIOUSLY GENERATED DATA
rm -r $TEST_DIR
cp -R $TEST_DIR_COPY $TEST_DIR
else
# GENERATE FRESH DATA
mkdir -p $TEST_DIR
mkdir -p $DIRECT_INSTALL_DIR
mkdir -p $LOCAL_EXTENSION_DIR
mkdir -p $LOCAL_EXTENSION_REPO_UPDATED
mkdir -p $LOCAL_EXTENSION_REPO_INCORRECT_PLATFORM
mkdir -p $LOCAL_EXTENSION_REPO_INCORRECT_DUCKDB_VERSION
#################################################
### First repo: successfully updating extensions.
#################################################
# Set extension config
cat > $TEST_DIR/extension_config_before.cmake <<EOL
duckdb_extension_load(json DONT_LINK EXTENSION_VERSION v0.0.1)
duckdb_extension_load(tpch DONT_LINK EXTENSION_VERSION v0.0.1)
duckdb_extension_load(tpcds DONT_LINK EXTENSION_VERSION v0.0.1)
duckdb_extension_load(icu DONT_LINK EXTENSION_VERSION v0.0.1)
EOL
# Build the extensions using the first config
LOCAL_EXTENSION_REPO=$LOCAL_EXTENSION_REPO_UPDATED EXTENSION_CONFIGS=$TEST_DIR/extension_config_before.cmake make debug
# Set the version and platform now that we have a build
DUCKDB_VERSION=`$DUCKDB_BUILD_DIR/duckdb -csv -noheader -c 'select source_id from pragma_version()'`
DUCKDB_PLATFORM=`cat $DUCKDB_BUILD_DIR/duckdb_platform_out`
# Install the extension from the initial config
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set extension_directory='$LOCAL_EXTENSION_DIR'; set custom_extension_repository='$LOCAL_EXTENSION_REPO_UPDATED'; install tpch; install json; INSTALL icu;"
# Delete the info file from the icu extension
rm $LOCAL_EXTENSION_DIR/$DUCKDB_VERSION/$DUCKDB_PLATFORM/icu.duckdb_extension.info
# Install tpcds directly
cp $DUCKDB_BUILD_DIR/extension/tpcds/tpcds.duckdb_extension $DIRECT_INSTALL_DIR/tpcds.duckdb_extension
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set extension_directory='$LOCAL_EXTENSION_DIR'; install '$DIRECT_INSTALL_DIR/tpcds.duckdb_extension';"
# Set updated extension config where we update the tpch extension but not the json extension
cat > $TEST_DIR/extension_config_after.cmake <<EOL
duckdb_extension_load(json DONT_LINK EXTENSION_VERSION v0.0.1)
duckdb_extension_load(tpch DONT_LINK EXTENSION_VERSION v0.0.2)
duckdb_extension_load(icu DONT_LINK EXTENSION_VERSION v0.0.2)
EOL
# Build the extensions using the second config
LOCAL_EXTENSION_REPO=$LOCAL_EXTENSION_REPO_UPDATED EXTENSION_CONFIGS=$TEST_DIR/extension_config_after.cmake BUILD_EXTENSIONS_ONLY=1 make debug
# For good measure, we also gzip one of the files in the repo to ensure we can handle both gzipped and non-gzipped extensions
gzip -1 $LOCAL_EXTENSION_REPO_UPDATED/$DUCKDB_VERSION/$DUCKDB_PLATFORM/icu.duckdb_extension
##########################################
### Second repo: Incorrect DuckDB platform
##########################################
rm -rf $DUCKDB_BUILD_DIR
# Set extension config
cat > $TEST_DIR/extension_config_incorrect_platform.cmake <<EOL
duckdb_extension_load(json DONT_LINK EXTENSION_VERSION v0.0.3)
EOL
# Build the extensions using the incorrect platform
DUCKDB_PLATFORM=test_platform EXTENSION_CONFIGS=$TEST_DIR/extension_config_incorrect_platform.cmake BUILD_EXTENSIONS_ONLY=1 make debug
cp $DUCKDB_BUILD_DIR/extension/json/json.duckdb_extension $DIRECT_INSTALL_DIR/json_incorrect_platform.duckdb_extension
########################################
### Third repo: Incorrect DuckDB version
########################################
rm -rf $DUCKDB_BUILD_DIR
# Set extension config
cat > $TEST_DIR/extension_config_incorrect_version.cmake <<EOL
duckdb_extension_load(json DONT_LINK EXTENSION_VERSION v0.0.4)
EOL
# Build the extensions using the incorrect DuckDB version
DUCKDB_EXPLICIT_VERSION=v1337 EXTENSION_CONFIGS=$TEST_DIR/extension_config_before.cmake BUILD_EXTENSIONS_ONLY=1 make debug
cp $DUCKDB_BUILD_DIR/extension/json/json.duckdb_extension $DIRECT_INSTALL_DIR/json_incorrect_version.duckdb_extension
####################################################
### Fourth repo: Both platform and version incorrect
####################################################
rm -rf $DUCKDB_BUILD_DIR
# Set extension config
cat > $TEST_DIR/extension_config_incorrect_version.cmake <<EOL
duckdb_extension_load(json DONT_LINK EXTENSION_VERSION v0.0.4)
EOL
# Build the extensions using both the incorrect platform and the incorrect DuckDB version
DUCKDB_PLATFORM=test_platform DUCKDB_EXPLICIT_VERSION=v1337 EXTENSION_CONFIGS=$TEST_DIR/extension_config_before.cmake BUILD_EXTENSIONS_ONLY=1 make debug
cp $DUCKDB_BUILD_DIR/extension/json/json.duckdb_extension $DIRECT_INSTALL_DIR/json_incorrect_version_and_platform.duckdb_extension
# Note that we set the "double wrong" extension to have the proper name, so we can actually load it during testing with
# SET allow_extensions_metadata_mismatch=true;
cp $DUCKDB_BUILD_DIR/extension/json/json.duckdb_extension $DIRECT_INSTALL_DIR/json.duckdb_extension
###########################
### Prepare malformed repos/dirs
###########################
# Build clean duckdb
rm -rf $DUCKDB_BUILD_DIR
make debug
# Use duckdb to install the extensions into the repositories (note that we are doing a trick here by setting the extension_directory to the local repo dir)
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set allow_extensions_metadata_mismatch=true; set extension_directory='$LOCAL_EXTENSION_REPO_INCORRECT_PLATFORM'; install '$DIRECT_INSTALL_DIR/json_incorrect_platform.duckdb_extension'"
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set allow_extensions_metadata_mismatch=true; set extension_directory='$LOCAL_EXTENSION_REPO_INCORRECT_DUCKDB_VERSION'; install '$DIRECT_INSTALL_DIR/json_incorrect_version.duckdb_extension'"
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set allow_extensions_metadata_mismatch=true; set extension_directory='$LOCAL_EXTENSION_REPO_VERSION_AND_PLATFORM_INCORRECT'; install '$DIRECT_INSTALL_DIR/json_incorrect_version_and_platform.duckdb_extension'"
# Create dir with malformed info file
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set extension_directory='$LOCAL_EXTENSION_DIR_MALFORMED_INFO'; install '$DIRECT_INSTALL_DIR/tpcds.duckdb_extension';"
echo blablablab > $LOCAL_EXTENSION_DIR_MALFORMED_INFO/$DUCKDB_VERSION/$DUCKDB_PLATFORM/tpcds.duckdb_extension.info
# Create dir with malformed info file: we install a new version from LOCAL_EXTENSION_REPO_UPDATED but preserve the old info file
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set extension_directory='$LOCAL_EXTENSION_DIR_INFO_INCORRECT_VERSION'; install 'tpch' from '$LOCAL_EXTENSION_REPO_UPDATED'"
cp $LOCAL_EXTENSION_DIR/$DUCKDB_VERSION/$DUCKDB_PLATFORM/tpch.duckdb_extension.info $LOCAL_EXTENSION_DIR_INFO_INCORRECT_VERSION/$DUCKDB_VERSION/$DUCKDB_PLATFORM/tpch.duckdb_extension.info
###################################################################
### Allow using copy instead of regenerating test data on every run
###################################################################
cp -R $TEST_DIR $TEST_DIR_COPY
fi
###########################
### Set version and platform
###########################
DUCKDB_VERSION=`$DUCKDB_BUILD_DIR/duckdb -csv -noheader -c 'select source_id from pragma_version()'`
DUCKDB_PLATFORM=`cat $DUCKDB_BUILD_DIR/duckdb_platform_out`
###########################
### Populate the minio repositories
###########################
AWS_DEFAULT_REGION=eu-west-1 AWS_ACCESS_KEY_ID=minio_duckdb_user AWS_SECRET_ACCESS_KEY=minio_duckdb_user_password aws --endpoint-url http://duckdb-minio.com:9000 s3 sync $LOCAL_EXTENSION_REPO_UPDATED s3://test-bucket-public/ci-test-repo
export REMOTE_EXTENSION_REPO_UPDATED=http://duckdb-minio.com:9000/test-bucket-public/ci-test-repo
export REMOTE_EXTENSION_REPO_DIRECT_PATH=http://duckdb-minio.com:9000/test-bucket-public/ci-test-repo/$DUCKDB_VERSION/$DUCKDB_PLATFORM
################
### Run test
################
RUN_EXTENSION_UPDATE_TEST=1 $DUCKDB_BUILD_DIR/test/unittest test/extension/update_extensions_ci.test

View File

@@ -0,0 +1,318 @@
import argparse
import sys
import subprocess
import time
import threading
import tempfile
import os
import shutil
import re
class ErrorContainer:
def __init__(self):
self._lock = threading.Lock()
self._errors = []
def append(self, item):
with self._lock:
self._errors.append(item)
def get_errors(self):
with self._lock:
return list(self._errors)
def __len__(self):
with self._lock:
return len(self._errors)
error_container = ErrorContainer()
def valid_timeout(value):
try:
timeout_float = float(value)
if timeout_float <= 0:
raise argparse.ArgumentTypeError("Timeout value must be a positive float")
return timeout_float
except ValueError:
raise argparse.ArgumentTypeError("Timeout value must be a float")
parser = argparse.ArgumentParser(description='Run tests one by one with optional flags.')
parser.add_argument('unittest_program', help='Path to the unittest program')
parser.add_argument('--no-exit', action='store_true', help='Execute all tests, without stopping on first error')
parser.add_argument('--fast-fail', action='store_true', help='Terminate on first error')
parser.add_argument('--profile', action='store_true', help='Enable profiling')
parser.add_argument('--no-assertions', action='store_false', help='Disable assertions')
parser.add_argument('--time_execution', action='store_true', help='Measure and print the execution time of each test')
parser.add_argument('--list', action='store_true', help='Print the list of tests to run')
parser.add_argument('--summarize-failures', action='store_true', help='Summarize failures', default=None)
parser.add_argument(
'--tests-per-invocation', type=int, help='The amount of tests to run per invocation of the runner', default=1
)
parser.add_argument(
'--print-interval', action='store', help='Prints "Still running..." every N seconds', default=300.0, type=float
)
parser.add_argument(
'--timeout',
action='store',
help='Add a timeout for each test (in seconds, default: 3600s - i.e. one hour)',
default=3600,
type=valid_timeout,
)
parser.add_argument('--valgrind', action='store_true', help='Run the tests with valgrind', default=False)
args, extra_args = parser.parse_known_args()
if not args.unittest_program:
parser.error('Path to unittest program is required')
# Access the arguments
unittest_program = args.unittest_program
no_exit = args.no_exit
fast_fail = args.fast_fail
tests_per_invocation = args.tests_per_invocation
if no_exit:
if fast_fail:
print("--no-exit and --fast-fail can't be combined")
exit(1)
profile = args.profile
assertions = args.no_assertions
time_execution = args.time_execution
timeout = args.timeout
summarize_failures = args.summarize_failures
if summarize_failures is None:
# get from env
summarize_failures = False
if 'SUMMARIZE_FAILURES' in os.environ:
summarize_failures = os.environ['SUMMARIZE_FAILURES'] == '1'
elif 'CI' in os.environ:
# enable by default in CI if not set explicitly
summarize_failures = True
# Use the '-l' parameter to output the list of tests to run
proc = subprocess.run([unittest_program, '-l'] + extra_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout = proc.stdout.decode('utf8').strip()
stderr = proc.stderr.decode('utf8').strip()
if len(stderr) > 0:
print("Failed to run program " + unittest_program)
print("Returncode:", proc.returncode)
print(stdout)
print(stderr)
exit(1)
# The output is in the format 'PATH\tGROUP'; we're only interested in the PATH portion
test_cases = []
first_line = True
for line in stdout.splitlines():
if first_line:
first_line = False
continue
if len(line.strip()) == 0:
continue
splits = line.rsplit('\t', 1)
test_cases.append(splits[0])
test_count = len(test_cases)
if args.list:
for test_number, test_case in enumerate(test_cases):
print(f"[{test_number}/{test_count}]: {test_case}")
all_passed = True
def fail():
global all_passed
all_passed = False
if fast_fail:
exit(1)
def parse_assertions(stdout):
for line in stdout.splitlines():
if 'All tests were skipped' in line:
return "SKIPPED"
if line == 'assertions: - none -':
return "0 assertions"
# Parse the assertion count from the runner's summary line
pos = line.find("assertion")
if pos != -1:
space_before_num = line.rfind(' ', 0, pos - 2)
return line[space_before_num + 2 : pos + 10]
return "ERROR"
is_active = False
def get_test_name_from(text):
match = re.findall(r'\((.*?)\)\!', text)
return match[0] if match else ''
def get_clean_error_message_from(text):
match = re.split(r'^=+\n', text, maxsplit=1, flags=re.MULTILINE)
return match[1] if len(match) > 1 else text
def print_interval_background(interval):
global is_active
current_ticker = 0.0
while is_active:
time.sleep(0.1)
current_ticker += 0.1
if current_ticker >= interval:
print("Still running...")
current_ticker = 0
def launch_test(test, list_of_tests=False):
global is_active
# start the background thread
is_active = True
background_print_thread = threading.Thread(target=print_interval_background, args=[args.print_interval])
background_print_thread.start()
unittest_stdout = sys.stdout if list_of_tests else subprocess.PIPE
unittest_stderr = subprocess.PIPE
start = time.time()
try:
test_cmd = [unittest_program] + test
if args.valgrind:
test_cmd = ['valgrind'] + test_cmd
# unset SUMMARIZE_FAILURES to avoid producing excessive failure logs
env = os.environ.copy()
# pass env variables globally
if list_of_tests or no_exit or tests_per_invocation:
env['SUMMARIZE_FAILURES'] = '0'
env['NO_DUPLICATING_HEADERS'] = '1'
else:
env['SUMMARIZE_FAILURES'] = '0'
res = subprocess.run(test_cmd, stdout=unittest_stdout, stderr=unittest_stderr, timeout=timeout, env=env)
except subprocess.TimeoutExpired as e:
if list_of_tests:
print("[TIMED OUT]", flush=True)
else:
print(" (TIMED OUT)", flush=True)
test_name = test[0] if not list_of_tests else str(test)
error_msg = f'TIMEOUT - exceeded specified timeout of {timeout} seconds'
new_data = {"test": test_name, "return_code": 1, "stdout": '', "stderr": error_msg}
error_container.append(new_data)
fail()
return
stdout = res.stdout.decode('utf8') if not list_of_tests else ''
stderr = res.stderr.decode('utf8')
if len(stderr) > 0:
# when running from a test list the test name gets transformed, but we can recover it from stderr
test_name = test[0] if not list_of_tests else get_test_name_from(stderr)
error_message = get_clean_error_message_from(stderr)
new_data = {"test": test_name, "return_code": res.returncode, "stdout": stdout, "stderr": error_message}
error_container.append(new_data)
end = time.time()
# join the background print thread
is_active = False
background_print_thread.join()
additional_data = ""
if assertions:
additional_data += " (" + parse_assertions(stdout) + ")"
if args.time_execution:
additional_data += f" (Time: {end - start:.4f} seconds)"
print(additional_data, flush=True)
if profile:
print(f'{test_case} {end - start}')
if res.returncode is None or res.returncode == 0:
return
print("FAILURE IN RUNNING TEST")
print(
"""--------------------
RETURNCODE
--------------------"""
)
print(res.returncode)
print(
"""--------------------
STDOUT
--------------------"""
)
print(stdout)
print(
"""--------------------
STDERR
--------------------"""
)
print(stderr)
# if a test closes unexpectedly (e.g., SEGV), test cleanup doesn't happen,
# causing us to run out of space on subsequent tests in GH Actions (not much disk space there)
duckdb_unittest_tempdir = os.path.join(
os.path.dirname(unittest_program), '..', '..', '..', 'duckdb_unittest_tempdir'
)
if os.path.exists(duckdb_unittest_tempdir) and os.listdir(duckdb_unittest_tempdir):
shutil.rmtree(duckdb_unittest_tempdir)
fail()
def run_tests_one_by_one():
for test_number, test_case in enumerate(test_cases):
if not profile:
print(f"[{test_number}/{test_count}]: {test_case}", end="", flush=True)
launch_test([test_case])
def escape_test_case(test_case):
return test_case.replace(',', '\\,')
def run_tests_batched(batch_count):
tmp = tempfile.NamedTemporaryFile()
# write the test list to a temporary file
with open(tmp.name, 'w') as f:
for test_case in test_cases:
f.write(escape_test_case(test_case) + '\n')
# use start_offset/end_offset to cycle through the test list
test_number = 0
while test_number < len(test_cases):
# gather test cases
next_entry = test_number + batch_count
if next_entry > len(test_cases):
next_entry = len(test_cases)
launch_test(['-f', tmp.name, '--start-offset', str(test_number), '--end-offset', str(next_entry)], True)
test_number = next_entry
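# Illustrative invocation (assumed unittest path): with --tests-per-invocation 100 the loop
# above launches commands of the form
#   build/debug/test/unittest -f /tmp/tmpXXXX --start-offset 0 --end-offset 100
# where the temporary file holds one escaped test name per line.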
if args.tests_per_invocation == 1:
run_tests_one_by_one()
else:
assertions = False
run_tests_batched(args.tests_per_invocation)
if all_passed:
exit(0)
if summarize_failures and len(error_container):
print(
'''\n\n====================================================
================ FAILURES SUMMARY ================
====================================================\n
'''
)
for i, error in enumerate(error_container.get_errors(), start=1):
print(f"\n{i}:", error["test"], "\n")
print(error["stderr"])
exit(1)

View File

@@ -0,0 +1,4 @@
from .parse_and_sort_settings_in_json import add_all_settings_to_global_list as parse_and_sort_json_file
from .update_settings_header_file import generate as update_header_file
from .update_settings_scopes import generate as update_scopes
from .update_settings_src_code import generate as update_src_code

View File

@@ -0,0 +1,197 @@
import os
import re
import subprocess
import tempfile
from pathlib import Path
from typing import Set, List
from functools import total_ordering
# define file paths and global variables
DUCKDB_DIR = Path(__file__).resolve().parent.parent.parent
DUCKDB_SETTINGS_HEADER_FILE = os.path.join(DUCKDB_DIR, "src/include/duckdb/main", "settings.hpp")
DUCKDB_AUTOGENERATED_SETTINGS_FILE = os.path.join(DUCKDB_DIR, "src/main/settings", "autogenerated_settings.cpp")
DUCKDB_SETTINGS_SCOPE_FILE = os.path.join(DUCKDB_DIR, "src/main", "config.cpp")
JSON_PATH = os.path.join(DUCKDB_DIR, "src/common", "settings.json")
# define scope values
VALID_SCOPE_VALUES = ["GLOBAL", "LOCAL", "GLOBAL_LOCAL"]
INVALID_SCOPE_VALUE = "INVALID"
SQL_TYPE_MAP = {"UBIGINT": "idx_t", "BIGINT": "int64_t", "BOOLEAN": "bool", "DOUBLE": "double", "VARCHAR": "string"}
# global Setting structure
@total_ordering
class Setting:
# track names of written settings to prevent duplicates
__written_settings: Set[str] = set()
def __init__(
self,
name: str,
description: str,
sql_type: str,
scope: str,
internal_setting: str,
on_callbacks: List[str],
custom_implementation,
struct_name: str,
aliases: List[str],
default_scope: str,
default_value: str,
):
self.name = self._get_valid_name(name)
self.description = description
self.sql_type = self._get_sql_type(sql_type)
self.return_type = self._get_setting_type(sql_type)
self.is_enum = sql_type.startswith('ENUM')
self.internal_setting = internal_setting
self.scope = self._get_valid_scope(scope) if scope is not None else None
self.on_set, self.on_reset = self._get_on_callbacks(on_callbacks)
self.is_generic_setting = self.scope is None
if self.is_enum and self.is_generic_setting:
self.on_set = True
custom_callbacks = ['set', 'reset', 'get']
if type(custom_implementation) is bool:
self.all_custom = custom_implementation
self.custom_implementation = custom_callbacks if custom_implementation else []
else:
for entry in custom_implementation:
if entry not in custom_callbacks:
raise ValueError(
f"Setting {self.name} - incorrect input for custom_implementation - expected set/reset/get, got {entry}"
)
self.all_custom = len(set(custom_implementation)) == 3
self.custom_implementation = custom_implementation
self.aliases = self._get_aliases(aliases)
self.struct_name = self._get_struct_name() if len(struct_name) == 0 else struct_name
self.default_scope = self._get_valid_default_scope(default_scope) if default_scope is not None else None
self.default_value = default_value
# define all comparisons to be based on the setting's name attribute
def __eq__(self, other) -> bool:
return isinstance(other, Setting) and self.name == other.name
def __lt__(self, other) -> bool:
return isinstance(other, Setting) and self.name < other.name
def __hash__(self) -> int:
return hash(self.name)
def __repr__(self):
return f"struct {self.struct_name} -> {self.name}, {self.sql_type}, {self.type}, {self.scope}, {self.description} {self.aliases}"
# validate setting name for correct format and uniqueness
def _get_valid_name(self, name: str) -> str:
if not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', name):
raise ValueError(f"'{name}' cannot be used as setting name - invalid character")
if name in Setting.__written_settings:
raise ValueError(f"'{name}' cannot be used as setting name - already exists")
Setting.__written_settings.add(name)
return name
# ensure the setting scope is valid based on the accepted values
def _get_valid_scope(self, scope: str) -> str:
scope = scope.upper()
if scope in VALID_SCOPE_VALUES:
return scope
return INVALID_SCOPE_VALUE
def _get_valid_default_scope(self, scope: str) -> str:
scope = scope.upper()
if scope == 'GLOBAL':
return scope
elif scope == 'LOCAL':
return 'SESSION'
raise Exception(f"Invalid default scope value {scope}")
# validate and return the correct type format
def _get_sql_type(self, sql_type) -> str:
if sql_type.startswith('ENUM'):
return 'VARCHAR'
if sql_type.endswith('[]'):
# validate the child element type; the list type itself is returned unchanged
self._get_sql_type(sql_type[:-2])
return sql_type
if sql_type in SQL_TYPE_MAP:
return sql_type
raise ValueError(f"Invalid SQL type: '{sql_type}' - supported types are {', '.join(SQL_TYPE_MAP.keys())}")
# validate and return the cpp input type
def _get_setting_type(self, type) -> str:
if type.startswith('ENUM'):
return type[len('ENUM<') : -1]
if type.endswith('[]'):
subtype = self._get_setting_type(type[:-2])
return "vector<" + subtype + ">"
return SQL_TYPE_MAP[type]
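# Illustrative mappings (derived from the rules above): 'BOOLEAN' -> 'bool',
# 'VARCHAR[]' -> 'vector<string>', 'ENUM<ErrorType>' -> 'ErrorType' (ErrorType being a
# hypothetical enum name).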
# validate and return the correct type format
def _get_on_callbacks(self, callbacks) -> (bool, bool):
set = False
reset = False
for entry in callbacks:
if entry == 'set':
set = True
elif entry == 'reset':
reset = True
else:
raise ValueError(f"Invalid entry in on_callbacks list: {entry} (expected set or reset)")
return (set, reset)
# validate and return the set of the aliases
def _get_aliases(self, aliases: List[str]) -> List[str]:
return [self._get_valid_name(alias) for alias in aliases]
# generate a function name
def _get_struct_name(self) -> str:
camel_case_name = ''.join(word.capitalize() for word in re.split(r'[-_]', self.name))
if camel_case_name.endswith("Setting"):
return f"{camel_case_name}"
return f"{camel_case_name}Setting"
# this global list (accessible across all files) stores all the settings definitions in the json file
SettingsList: List[Setting] = []
# global method that finds the indexes of a start and an end marker in a file
def find_start_end_indexes(source_code, start_marker, end_marker, file_path):
start_matches = list(re.finditer(start_marker, source_code))
if len(start_matches) == 0:
raise ValueError(f"Couldn't find start marker {start_marker} in {file_path}")
elif len(start_matches) > 1:
raise ValueError(f"Start marker found more than once in {file_path}")
start_index = start_matches[0].end()
end_matches = list(re.finditer(end_marker, source_code[start_index:]))
if len(end_matches) == 0:
raise ValueError(f"Couldn't find end marker {end_marker} in {file_path}")
elif len(end_matches) > 1:
raise ValueError(f"End marker found more than once in {file_path}")
end_index = start_index + end_matches[0].start()
return start_index, end_index
# global markers
SEPARATOR = "//===----------------------------------------------------------------------===//\n"
SRC_CODE_START_MARKER = "namespace duckdb {"
SRC_CODE_END_MARKER = "} // namespace duckdb"
# global method
def write_content_to_file(new_content, path):
with open(path, 'w') as source_file:
source_file.write("".join(new_content))
def get_setting_heading(setting_struct_name):
struct_name_wt_Setting = re.sub(r'Setting$', '', setting_struct_name)
heading_name = re.sub(r'(?<!^)(?=[A-Z])', ' ', struct_name_wt_Setting)
heading = SEPARATOR + f"// {heading_name}\n" + SEPARATOR
return heading
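# Illustrative example (hypothetical struct name): get_setting_heading("MaxMemorySetting")
# returns a "// Max Memory" line wrapped between two SEPARATOR lines.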
def make_format():
os.system(f"python3 scripts/format.py {DUCKDB_SETTINGS_HEADER_FILE} --fix --force --noconfirm")
os.system(f"python3 scripts/format.py {DUCKDB_SETTINGS_SCOPE_FILE} --fix --force --noconfirm")
os.system(f"python3 scripts/format.py {DUCKDB_AUTOGENERATED_SETTINGS_FILE} --fix --force --noconfirm")

View File

@@ -0,0 +1,58 @@
import json
from .config import Setting, SettingsList, JSON_PATH
# sort settings in json by name
def sort_json_data(path):
with open(path, 'r') as file:
data = json.load(file)
sorted_data = sorted(data, key=lambda x: x['name'])
with open(path, 'w') as file:
json.dump(sorted_data, file, indent=4)
return sorted_data
# parses the JSON data and stores each entry as a Setting object in the global SettingsList
def add_all_settings_to_global_list():
valid_entries = [
'name',
'description',
'type',
'scope',
'internal_setting',
'on_callbacks',
'custom_implementation',
'struct',
'aliases',
'default_scope',
'default_value',
]
print(f"Parsing and sorting the settings data in {JSON_PATH}")
clear_global_settings_list()
json_data = sort_json_data(JSON_PATH)
# store all the settings in the SettingsList
for entry in json_data:
for field_entry in entry:
if field_entry not in valid_entries:
raise ValueError(
f"Found entry unexpected entry \"{field_entry}\" in setting, expected entry to be in {', '.join(valid_entries)}"
)
setting = Setting(
name=entry['name'],
description=entry['description'],
sql_type=entry['type'],
internal_setting=entry.get('internal_setting', entry['name']),
scope=entry.get('scope', None),
struct_name=entry.get('struct', ''),
on_callbacks=entry.get('on_callbacks', []),
custom_implementation=entry.get('custom_implementation', False),
aliases=entry.get('aliases', []),
default_scope=entry.get('default_scope', None),
default_value=entry.get('default_value', None),
)
SettingsList.append(setting)
def clear_global_settings_list():
SettingsList.clear()
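# Illustrative settings.json entry (hypothetical values; 'name', 'description' and 'type'
# are accessed directly above, the remaining fields fall back to defaults):
#   {
#       "name": "my_flag",
#       "description": "Example boolean setting",
#       "type": "BOOLEAN",
#       "scope": "GLOBAL"
#   }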

View File

@@ -0,0 +1,132 @@
from .config import (
SRC_CODE_START_MARKER,
SRC_CODE_END_MARKER,
SettingsList,
find_start_end_indexes,
get_setting_heading,
)
def generate_create_value(setting):
if setting.sql_type == 'VARCHAR':
return 'Value'
else:
return f'Value::{setting.sql_type}'
def add_autogenerated_global_functions(setting):
cpp_code = ""
if 'set' not in setting.custom_implementation:
cpp_code += (
f"void {setting.struct_name}::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) {{\n"
)
if setting.on_set:
cpp_code += f"\tif (!OnGlobalSet(db, config, input)) {{\n"
cpp_code += f"\t\treturn;\n\t}}\n"
if setting.is_enum:
cpp_code += f"\tauto str_input = StringUtil::Upper(input.GetValue<string>());\n"
cpp_code += f"\tconfig.options.{setting.internal_setting} = EnumUtil::FromString<{setting.return_type}>(str_input);\n"
else:
cpp_code += f"\tconfig.options.{setting.internal_setting} = input.GetValue<{setting.return_type}>();\n"
cpp_code += f"}}\n\n"
if 'reset' not in setting.custom_implementation:
cpp_code += f"void {setting.struct_name}::ResetGlobal(DatabaseInstance *db, DBConfig &config) {{\n"
if setting.on_reset:
cpp_code += f"\tif (!OnGlobalReset(db, config)) {{\n"
cpp_code += f"\t\treturn;\n\t}}\n"
cpp_code += f"\tconfig.options.{setting.internal_setting} = DBConfigOptions().{setting.internal_setting};\n"
cpp_code += f"}}\n\n"
if 'get' not in setting.custom_implementation:
cpp_code += f"Value {setting.struct_name}::GetSetting(const ClientContext &context) {{\n"
cpp_code += f"\tauto &config = DBConfig::GetConfig(context);\n"
if setting.is_enum:
cpp_code += f"\treturn {generate_create_value(setting)}(StringUtil::Lower(EnumUtil::ToString(config.options.{setting.internal_setting})));\n"
else:
cpp_code += f"\treturn {generate_create_value(setting)}(config.options.{setting.internal_setting});\n"
cpp_code += f"}}\n\n"
return cpp_code
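# Illustrative output (hypothetical setting): for a non-enum setting with struct_name
# 'MyFlagSetting', internal_setting 'my_flag', return_type 'bool' and no callbacks or
# custom implementations, the generated SetGlobal is:
#   void MyFlagSetting::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) {
#       config.options.my_flag = input.GetValue<bool>();
#   }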
def add_autogenerated_local_functions(setting):
cpp_code = ""
if 'set' not in setting.custom_implementation:
cpp_code += f"void {setting.struct_name}::SetLocal(ClientContext &context, const Value &input) {{\n"
if setting.on_set:
cpp_code += f"\tif (!OnLocalSet(context, input)) {{\n"
cpp_code += f"\t\treturn;\n\t}}\n"
cpp_code += f"\tauto &config = ClientConfig::GetConfig(context);\n"
if setting.is_enum:
cpp_code += f"\tauto str_input = StringUtil::Upper(input.GetValue<string>());\n"
cpp_code += (
f"\tconfig.{setting.internal_setting} = EnumUtil::FromString<{setting.return_type}>(str_input);\n"
)
else:
cpp_code += f"\tconfig.{setting.internal_setting} = input.GetValue<{setting.return_type}>();\n"
cpp_code += f"}}\n\n"
if 'reset' not in setting.custom_implementation:
cpp_code += f"void {setting.struct_name}::ResetLocal(ClientContext &context) {{\n"
if setting.on_reset:
cpp_code += f"\tif (!OnLocalReset(context)) {{\n"
cpp_code += f"\t\treturn;\n\t}}\n"
cpp_code += f"\tClientConfig::GetConfig(context).{setting.internal_setting} = ClientConfig().{setting.internal_setting};\n"
cpp_code += f"}}\n\n"
if 'get' not in setting.custom_implementation:
cpp_code += f"Value {setting.struct_name}::GetSetting(const ClientContext &context) {{\n"
cpp_code += f"\tauto &config = ClientConfig::GetConfig(context);\n"
if setting.is_enum:
cpp_code += f"\treturn {generate_create_value(setting)}(StringUtil::Lower(EnumUtil::ToString(config.{setting.internal_setting})));\n"
else:
cpp_code += f"\treturn {generate_create_value(setting)}(config.{setting.internal_setting});\n"
cpp_code += f"}}\n\n"
return cpp_code
def add_autogenerated_enum_set(setting):
if not setting.on_set:
return ""
if not setting.is_enum:
return ""
if 'set' in setting.custom_implementation:
return ""
cpp_code = ""
cpp_code += f"void {setting.struct_name}::OnSet(SettingCallbackInfo &info, Value &parameter) {{\n"
cpp_code += f"\tEnumUtil::FromString<{setting.return_type}>(StringValue::Get(parameter));\n"
cpp_code += f"}}\n\n"
return cpp_code
def add_autogenerated_functions(path):
with open(path, 'r') as source_file:
source_code = source_file.read()
# find start and end indexes of the auto-generated section
start_index, end_index = find_start_end_indexes(source_code, SRC_CODE_START_MARKER, SRC_CODE_END_MARKER, path)
# split source code into sections
start_section = source_code[: start_index + 1] + "\n"
end_section = source_code[end_index:]
new_content = ""
added = 0
for setting in SettingsList:
# if the setting doesn't need custom implementation, an autogenerated one will be included
if not setting.all_custom:
header = get_setting_heading(setting.struct_name)
content = ""
if setting.is_generic_setting:
content += add_autogenerated_enum_set(setting)
else:
if setting.scope == "GLOBAL" or setting.scope == "GLOBAL_LOCAL":
content += add_autogenerated_global_functions(setting)
if setting.scope == "LOCAL" or setting.scope == "GLOBAL_LOCAL":
content += add_autogenerated_local_functions(setting)
if len(content) > 0:
new_content += header
new_content += content
added += 1
return start_section + new_content + end_section, added
if __name__ == '__main__':
raise ValueError("Please use 'generate_settings.py' instead of running the individual script(s)")

View File

@@ -0,0 +1,73 @@
from .config import SEPARATOR, SettingsList, find_start_end_indexes, write_content_to_file
# markers
START_MARKER = (
f"//===----------------------------------------------------------------------===//\n"
f"// This code is autogenerated from 'update_settings_header_file.py'.\n"
f"// Please do not make any changes directly here, as they will be overwritten.\n//\n"
f"// Start of the auto-generated list of settings structures\n"
f"//===----------------------------------------------------------------------===//\n"
)
END_MARKER = "// End of the auto-generated list of settings structures"
def extract_declarations(setting) -> str:
definition = (
f"struct {setting.struct_name} {{\n"
f" using RETURN_TYPE = {setting.return_type};\n"
f" static constexpr const char *Name = \"{setting.name}\";\n"
f" static constexpr const char *Description = \"{setting.description}\";\n"
f" static constexpr const char *InputType = \"{setting.sql_type}\";\n"
)
if setting.scope == "GLOBAL" or setting.scope == "GLOBAL_LOCAL":
definition += f" static void SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &parameter);\n"
definition += f" static void ResetGlobal(DatabaseInstance *db, DBConfig &config);\n"
if setting.on_set:
definition += f"static bool OnGlobalSet(DatabaseInstance *db, DBConfig &config, const Value &input);\n"
if setting.on_reset:
definition += f"static bool OnGlobalReset(DatabaseInstance *db, DBConfig &config);\n"
if setting.scope == "LOCAL" or setting.scope == "GLOBAL_LOCAL":
definition += f" static void SetLocal(ClientContext &context, const Value &parameter);\n"
definition += f" static void ResetLocal(ClientContext &context);\n"
if setting.on_set:
definition += f"static bool OnLocalSet(ClientContext &context, const Value &input);\n"
if setting.on_reset:
definition += f"static bool OnLocalReset(ClientContext &context);\n"
if setting.scope is not None:
definition += f" static Value GetSetting(const ClientContext &context);\n"
if setting.is_generic_setting:
definition += f" static constexpr const char *DefaultValue = \"{setting.default_value}\";\n"
definition += f" static constexpr SetScope DefaultScope = SetScope::{setting.default_scope};\n"
if setting.on_set:
definition += f" static void OnSet(SettingCallbackInfo &info, Value &input);\n"
definition += f"}};\n\n"
return definition
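# Illustrative output (hypothetical setting): a GLOBAL-scoped bool setting named 'my_flag'
# with struct 'MyFlagSetting' and no callbacks yields roughly:
#   struct MyFlagSetting {
#       using RETURN_TYPE = bool;
#       static constexpr const char *Name = "my_flag";
#       static constexpr const char *Description = "Example boolean setting";
#       static constexpr const char *InputType = "BOOLEAN";
#       static void SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &parameter);
#       static void ResetGlobal(DatabaseInstance *db, DBConfig &config);
#       static Value GetSetting(const ClientContext &context);
#   };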
# generate code for all the settings in the header file
def generate_content(header_file_path):
with open(header_file_path, 'r') as source_file:
source_code = source_file.read()
# find start and end indexes of the auto-generated section
start_index, end_index = find_start_end_indexes(source_code, START_MARKER, END_MARKER, header_file_path)
# split source code into sections
start_section = source_code[: start_index + 1]
end_section = SEPARATOR + source_code[end_index:]
new_content = "".join(extract_declarations(setting) for setting in SettingsList)
return start_section + new_content + end_section
def generate():
from .config import DUCKDB_SETTINGS_HEADER_FILE
print(f"Updating {DUCKDB_SETTINGS_HEADER_FILE}")
new_content = generate_content(DUCKDB_SETTINGS_HEADER_FILE)
write_content_to_file(new_content, DUCKDB_SETTINGS_HEADER_FILE)
if __name__ == '__main__':
raise ValueError("Please use 'generate_settings.py' instead of running the individual script(s)")

View File

@@ -0,0 +1,61 @@
from .config import SettingsList, VALID_SCOPE_VALUES, find_start_end_indexes, write_content_to_file
# markers
START_MARKER = r'static const ConfigurationOption internal_options\[\] = \{\n'
END_MARKER = r',\s*FINAL_ALIAS};'
# generate the scope code for the ConfigurationOption array and insert into the config file
def generate_scope_code(file):
with open(file, 'r') as source_file:
source_code = source_file.read()
# find the start and end indexes of the settings' scope array
start_index, end_index = find_start_end_indexes(source_code, START_MARKER, END_MARKER, file)
# split source code into sections
before_array = source_code[:start_index] + "\n "
after_array = source_code[end_index:]
# generate new entries for the settings array
new_entries = []
new_aliases = []
for setting in SettingsList:
if setting.is_generic_setting:
if setting.on_set:
new_entries.append([setting.name, f"DUCKDB_SETTING_CALLBACK({setting.struct_name})"])
else:
new_entries.append([setting.name, f"DUCKDB_SETTING({setting.struct_name})"])
elif setting.scope in VALID_SCOPE_VALUES: # valid setting_scope values
new_entries.append([setting.name, f"DUCKDB_{setting.scope}({setting.struct_name})"])
else:
raise ValueError(f"Setting {setting.name} has invalid input scope value")
for alias in setting.aliases:
new_aliases.append([alias, setting.name])
new_entries.sort(key=lambda x: x[0])
new_aliases.sort(key=lambda x: x[0])
entry_indexes = {}
for i in range(len(new_entries)):
entry_indexes[new_entries[i][0]] = i
for alias in new_aliases:
alias_index = entry_indexes[alias[1]]
alias.append(f"DUCKDB_SETTING_ALIAS(\"{alias[0]}\", {alias_index})")
new_array_section = ',\n '.join([x[1] for x in new_entries])
new_array_section += ', FINAL_SETTING};\n\n'
new_array_section += 'static const ConfigurationAlias setting_aliases[] = {'
new_array_section += ',\n '.join([x[2] for x in new_aliases])
return before_array + new_array_section + after_array
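# Illustrative entries (hypothetical settings): a GLOBAL-scoped 'MyFlagSetting' becomes
# 'DUCKDB_GLOBAL(MyFlagSetting)', a generic setting without an on_set callback becomes
# 'DUCKDB_SETTING(MyOtherSetting)', and an alias 'old_flag' for 'my_flag' becomes
# 'DUCKDB_SETTING_ALIAS("old_flag", <index of my_flag in the sorted entries>)'.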
def generate():
from .config import DUCKDB_SETTINGS_SCOPE_FILE
print(f"Updating {DUCKDB_SETTINGS_SCOPE_FILE}")
new_content = generate_scope_code(DUCKDB_SETTINGS_SCOPE_FILE)
write_content_to_file(new_content, DUCKDB_SETTINGS_SCOPE_FILE)
if __name__ == '__main__':
raise ValueError("Please use 'generate_settings.py' instead of running the individual script(s)")

View File

@@ -0,0 +1,18 @@
import re
from .config import SettingsList, write_content_to_file, find_start_end_indexes
from .update_autogenerated_functions import add_autogenerated_functions
def generate():
from .config import DUCKDB_AUTOGENERATED_SETTINGS_FILE
print(f"Updating {DUCKDB_AUTOGENERATED_SETTINGS_FILE}")
new_autogenerated_content, generated = add_autogenerated_functions(DUCKDB_AUTOGENERATED_SETTINGS_FILE)
write_content_to_file(new_autogenerated_content, DUCKDB_AUTOGENERATED_SETTINGS_FILE)
# NOTE: for debugging purposes
# print(f"The total number of settings is {len(SettingsList)}, of which {generated} were autogenerated into {DUCKDB_AUTOGENERATED_SETTINGS_FILE}")
if __name__ == '__main__':
raise ValueError("Please use 'generate_settings.py' instead of running the individual script(s)")

25
external/duckdb/scripts/setup_ubuntu1804.sh vendored Executable file
View File

@@ -0,0 +1,25 @@
#!/bin/bash
# base build and packaging dependencies
apt-get update -y -qq
apt-get install -y -qq software-properties-common
add-apt-repository ppa:git-core/ppa
apt-get update -y -qq
apt-get install -y -qq --fix-missing ninja-build make gcc-multilib g++-multilib libssl-dev wget openjdk-8-jdk zip maven unixodbc-dev libc6-dev-i386 lib32readline6-dev libcurl4-gnutls-dev libexpat1-dev gettext unzip build-essential checkinstall libffi-dev curl libz-dev openssh-client pkg-config
# cross compilation stuff
apt-get install -y -qq gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
# git
wget https://github.com/git/git/archive/refs/tags/v2.18.5.tar.gz
tar xvf v2.18.5.tar.gz
cd git-2.18.5
make
make prefix=/usr install
git --version
# cmake
wget https://github.com/Kitware/CMake/releases/download/v3.21.3/cmake-3.21.3-linux-x86_64.sh
chmod +x cmake-3.21.3-linux-x86_64.sh
./cmake-3.21.3-linux-x86_64.sh --skip-license --prefix=/usr/local
cmake --version

86
external/duckdb/scripts/test_compile.py vendored Normal file
View File

@@ -0,0 +1,86 @@
import os
import sys
import amalgamation
import pickle
import subprocess
# where to cache which files have already been compiled
cache_file = 'amalgamation.cache'
ignored_files = ['utf8proc_data.cpp']
RESUME_AUTO = 0
RESUME_ALWAYS = 1
RESUME_NEVER = 2
# resume behavior
# by default, we resume if the previous test_compile was run on the same commit hash as this one
resume = RESUME_AUTO
for arg in sys.argv:
if arg == '--resume':
resume = RESUME_ALWAYS
elif arg == '--restart':
resume = RESUME_NEVER
if resume == RESUME_NEVER:
try:
os.remove(cache_file)
except:
pass
def get_git_hash():
proc = subprocess.Popen(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE)
return proc.stdout.read().strip()
current_hash = get_git_hash()
# load the cache, and check the commit hash
try:
with open(cache_file, 'rb') as cf:
cache = pickle.load(cf)
if resume == RESUME_AUTO:
# auto resume, check
if cache['commit_hash'] != current_hash:
cache = {}
except:
cache = {}
cache['commit_hash'] = current_hash
def try_compilation(fpath, cache):
if fpath in cache:
return
print(fpath)
cmd = (
'clang++ -std=c++11 -Wno-deprecated -Wno-writable-strings -S -MMD -MF dependencies.d -o deps.s '
+ fpath
+ ' '
+ ' '.join(["-I" + x for x in amalgamation.include_paths])
)
ret = os.system(cmd)
if ret != 0:
raise Exception('Failed compilation of file "' + fpath + '"!\n Command: ' + cmd)
cache[fpath] = True
with open(cache_file, 'wb') as cf:
pickle.dump(cache, cf)
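# Illustrative command (assumed file and include paths): for fpath='src/main/database.cpp'
# the call above runs something like
#   clang++ -std=c++11 -Wno-deprecated -Wno-writable-strings -S -MMD -MF dependencies.d \
#       -o deps.s src/main/database.cpp -Isrc/include ...
# with one -I flag per entry in amalgamation.include_paths.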
def compile_dir(dir, cache):
files = os.listdir(dir)
files.sort()
for fname in files:
if fname in amalgamation.excluded_compilation_files or fname in ignored_files:
continue
fpath = os.path.join(dir, fname)
if os.path.isdir(fpath):
compile_dir(fpath, cache)
elif fname.endswith('.cpp') or fname.endswith('.hpp') or fname.endswith('.c') or fname.endswith('.cc'):
try_compilation(fpath, cache)
# compile all files in the src directory (including headers!) individually
for cdir in amalgamation.compile_directories:
compile_dir(cdir, cache)

22
external/duckdb/scripts/test_docker_images.sh vendored Executable file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bash
TEST="./build/release/duckdb -c 'PRAGMA platform;' && make clean && echo 'DOCKER TEST RESULT: SUCCESS' || (echo 'DOCKER TEST RESULT: FAILURE' && make clean)"
make clean
# Currently not working due to cmake version being too low
# docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb amazonlinux:2 <<< "yum install gcc gcc-c++ git make cmake ninja-build -y && GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb alpine:latest <<< "apk add g++ git make cmake ninja python3 && cmake -Bbuild . && cmake --build build && cmake --install build && g++ -std=c++11 examples/embedded-c++/main.cpp"
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb amazonlinux:latest <<< "yum install clang git make cmake ninja-build -y && GEN=ninja make && $TEST" 2>&1
docker run -i --platform linux/arm64 --rm -v $(pwd):/duckdb --workdir /duckdb alpine:latest <<< "apk add g++ git make cmake ninja && GEN=ninja make && $TEST" 2>&1
docker run -i --platform linux/amd64 --rm -v $(pwd):/duckdb --workdir /duckdb alpine:latest <<< "apk add g++ git make cmake ninja && GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb alpine:latest <<< "apk add g++ git make cmake ninja && GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb alpine:latest <<< "apk add g++ git make cmake ninja python3 && GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb alpine:latest <<< "apk add g++ git make cmake ninja && CXX_STANDARD=23 GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb ubuntu:20.04 <<< "apt-get update && export DEBIAN_FRONTEND=noninteractive && apt-get install g++ git make cmake ninja-build -y && GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb ubuntu:devel <<< "apt-get update && export DEBIAN_FRONTEND=noninteractive && apt-get install g++ git make cmake ninja-build -y && GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb centos <<< "sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && yum install git make cmake clang -y && make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb fedora <<< "dnf install make cmake ninja-build gcc g++ -y && make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb ghcr.io/mocusez/duckdb-riscv-ci/duckdb-riscv-ci <<< "apt-get update && export DEBIAN_FRONTEND=noninteractive && apt-get install cmake ninja-build libssl-dev g++-riscv64-linux-gnu -y && GEN=ninja CC='riscv64-linux-gnu-gcc -march=rv64gcv_zicsr_zifencei_zihintpause_zvl256b' CXX='riscv64-linux-gnu-g++ -march=rv64gcv_zicsr_zifencei_zihintpause_zvl256b' DUCKDB_PLATFORM=linux_riscv make && cd / && ./start_qemu.sh && cd /duckdb && make clean && echo 'DOCKER TEST RESULT: SUCCESS' || (echo 'DOCKER TEST RESULT: FAILURE' && make clean)" 2>&1

View File

@@ -0,0 +1,230 @@
import argparse
import os
import sqllogictest
from sqllogictest import SQLParserException, SQLLogicParser, SQLLogicTest
import subprocess
import multiprocessing
import tempfile
import re
from typing import Optional
parser = argparse.ArgumentParser(description="Test serialization")
parser.add_argument("--shell", type=str, help="Shell binary to run", default=os.path.join('build', 'debug', 'duckdb'))
parser.add_argument("--offset", type=int, help="File offset", default=None)
parser.add_argument("--count", type=int, help="File count", default=None)
parser.add_argument('--no-exit', action='store_true', help='Do not exit after a test fails', default=False)
parser.add_argument('--print-failing-only', action='store_true', help='Print failing tests only', default=False)
parser.add_argument(
'--include-extensions', action='store_true', help='Include test files of out-of-tree extensions', default=False
)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--test-file", type=str, help="Path to the SQL logic file", default='')
group.add_argument(
"--test-list", type=str, help="Path to the file that contains a newline separated list of test files", default=''
)
group.add_argument("--all-tests", action='store_true', help="Run all tests", default=False)
args = parser.parse_args()
def extract_git_urls(script: str):
pattern = r'GIT_URL\s+(https?://\S+)'
return re.findall(pattern, script)
import os
import requests
from urllib.parse import urlparse
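# Helpers for --include-extensions: the GIT_URL entries in .github/config/out_of_tree_extensions.cmake are used to
# fetch each extension's test/sql directory through the GitHub contents API into the extension-test-files/ folder.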
def download_directory_contents(api_url, local_path, headers):
response = requests.get(api_url, headers=headers)
if response.status_code != 200:
print(f"⚠️ Could not access {api_url}: {response.status_code}")
return
os.makedirs(local_path, exist_ok=True)
for item in response.json():
item_type = item.get("type")
item_name = item.get("name")
if item_type == "file":
download_url = item.get("download_url")
if not download_url:
continue
file_path = os.path.join(local_path, item_name)
file_resp = requests.get(download_url)
if file_resp.status_code == 200:
with open(file_path, "wb") as f:
f.write(file_resp.content)
print(f" - Downloaded {file_path}")
else:
print(f" - Failed to download {file_path}")
elif item_type == "dir":
subdir_api_url = item.get("url")
subdir_local_path = os.path.join(local_path, item_name)
download_directory_contents(subdir_api_url, subdir_local_path, headers)
def download_test_sql_folder(repo_url, base_folder="extension-test-files"):
repo_name = urlparse(repo_url).path.strip("/").split("/")[-1]
target_folder = os.path.join(base_folder, repo_name)
if os.path.exists(target_folder):
print(f"✓ Skipping {repo_name}, already exists.")
return
print(f"⬇️ Downloading test/sql from {repo_name}...")
api_url = f"https://api.github.com/repos/duckdb/{repo_name}/contents/test/sql?ref=main"
GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
headers = {"Accept": "application/vnd.github.v3+json", "Authorization": f"Bearer {GITHUB_TOKEN}"}
download_directory_contents(api_url, target_folder, headers)
def batch_download_all_test_sql():
filename = ".github/config/out_of_tree_extensions.cmake"
if not os.path.isfile(filename):
raise Exception(f"File {filename} not found")
with open(filename, "r") as f:
content = f.read()
urls = extract_git_urls(content)
if urls == []:
print("No URLs found.")
for url in urls:
download_test_sql_folder(url)
def find_tests_recursive(dir, excluded_paths):
test_list = []
for f in os.listdir(dir):
path = os.path.join(dir, f)
if path in excluded_paths:
continue
if os.path.isdir(path):
test_list += find_tests_recursive(path, excluded_paths)
elif path.endswith('.test') or path.endswith('.test_slow'):
test_list.append(path)
return test_list
def parse_test_file(filename):
if not os.path.isfile(filename):
raise Exception(f"File {filename} not found")
parser = SQLLogicParser()
try:
out: Optional[SQLLogicTest] = parser.parse(filename)
if not out:
raise SQLParserException(f"Test {filename} could not be parsed")
except:
return []
loop_count = 0
statements = []
for stmt in out.statements:
if type(stmt) is sqllogictest.statement.skip.Skip:
# mode skip - just skip entire test
break
if type(stmt) is sqllogictest.statement.loop.Loop or type(stmt) is sqllogictest.statement.foreach.Foreach:
loop_count += 1
if type(stmt) is sqllogictest.statement.endloop.Endloop:
loop_count -= 1
if loop_count > 0:
# loops are ignored currently
continue
if not (
type(stmt) is sqllogictest.statement.query.Query or type(stmt) is sqllogictest.statement.statement.Statement
):
# only handle query and statement nodes for now
continue
if type(stmt) is sqllogictest.statement.statement.Statement:
# skip expected errors
if stmt.expected_result.type == sqllogictest.ExpectedResult.Type.ERROR:
if any(
"parser error" in line.lower() or "syntax error" in line.lower()
for line in stmt.expected_result.lines
):
continue
query = ' '.join(stmt.lines)
statements.append(query)
return statements
def run_test_case(args_tuple):
i, file, shell, print_failing_only = args_tuple
results = []
if not print_failing_only:
print(f"Run test {i}: {file}")
statements = parse_test_file(file)
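# Each statement is dollar-quoted and written to an -init script; a non-zero exit code or an ' Error:' marker
# in stderr means the PEG parser failed on this statement, and the test file is reported as failing.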
for statement in statements:
with tempfile.TemporaryDirectory() as tmpdir:
peg_sql_path = os.path.join(tmpdir, 'peg_test.sql')
with open(peg_sql_path, 'w') as f:
f.write(f'CALL check_peg_parser($TEST_PEG_PARSER${statement}$TEST_PEG_PARSER$);\n')
proc = subprocess.run([shell, '-init', peg_sql_path, '-c', '.exit'], capture_output=True)
stderr = proc.stderr.decode('utf8')
if proc.returncode == 0 and ' Error:' not in stderr:
continue
if print_failing_only:
print(f"Failed test {i}: {file}")
else:
print(f'Failed')
print(f'-- STDOUT --')
print(proc.stdout.decode('utf8'))
print(f'-- STDERR --')
print(stderr)
results.append((file, statement))
break
return results
if __name__ == "__main__":
files = []
excluded_tests = {
'test/sql/peg_parser', # Fails for some reason
'test/sql/prepared/parameter_variants.test', # PostgreSQL parser bug with ?1
'test/sql/copy/s3/download_config.test', # Unknown why this passes in SQLLogicTest
'test/sql/function/list/lambdas/arrow/lambda_scope_deprecated.test', # Error in the tokenization of *+*
'test/sql/catalog/function/test_simple_macro.test', # Bug when mixing named parameters and non-named
}
if args.all_tests:
# run all tests
test_dir = os.path.join('test', 'sql')
files = find_tests_recursive(test_dir, excluded_tests)
if args.include_extensions:
batch_download_all_test_sql()
extension_files = find_tests_recursive('extension-test-files', {})
files = files + extension_files
elif len(args.test_list) > 0:
with open(args.test_list, 'r') as f:
files = [x.strip() for x in f.readlines() if x.strip() not in excluded_tests]
else:
# run a single test
files.append(args.test_file)
files.sort()
start = args.offset if args.offset is not None else 0
end = start + args.count if args.count is not None else len(files)
work_items = [(i, files[i], args.shell, args.print_failing_only) for i in range(start, end)]
if not args.no_exit:
# Run tests sequentially (no multiprocessing) so we can exit immediately after the first failure
failed_test_list = []
for item in work_items:
res = run_test_case(item)
if res:
failed_test_list.extend(res)
exit(1)
else:
with multiprocessing.Pool() as pool:
results = pool.map(run_test_case, work_items)
failed_test_list = [item for sublist in results for item in sublist]
failed_tests = len(failed_test_list)
print("List of failed tests: ")
for test, statement in failed_test_list:
print(f"{test}\n{statement}\n\n")
print(f"Total of {failed_tests} out of {len(files)} failed ({round(failed_tests/len(files) * 100,2)}%). ")

View File

@@ -0,0 +1,226 @@
import sqllogictest
from sqllogictest import SQLParserException, SQLLogicParser, SQLLogicTest
import duckdb
from typing import Optional
import argparse
import shutil
import os
import subprocess
# example usage: python3 scripts/test_serialization_bwc.py --old-source ../duckdb-bugfix --test-file test/sql/aggregate/aggregates/test_median.test
serialized_path = os.path.join('test', 'api', 'serialized_plans')
db_load_path = os.path.join(serialized_path, 'db_load.sql')
queries_path = os.path.join(serialized_path, 'queries.sql')
result_binary = os.path.join(serialized_path, 'serialized_plans.binary')
unittest_binary = os.path.join('build', 'debug', 'test', 'unittest')
def complete_query(q):
q = q.strip()
if q.endswith(';'):
return q
return q + ';'
def parse_test_file(filename):
parser = SQLLogicParser()
try:
out: Optional[SQLLogicTest] = parser.parse(filename)
if not out:
raise SQLParserException(f"Test {filename} could not be parsed")
except:
return {'load': [], 'query': []}
loop_count = 0
load_statements = []
query_statements = []
for stmt in out.statements:
if type(stmt) is sqllogictest.statement.skip.Skip:
# mode skip - just skip entire test
break
if type(stmt) is sqllogictest.statement.loop.Loop or type(stmt) is sqllogictest.statement.foreach.Foreach:
loop_count += 1
if type(stmt) is sqllogictest.statement.endloop.Endloop:
loop_count -= 1
if loop_count > 0:
# loops are ignored currently
continue
if not (
type(stmt) is sqllogictest.statement.query.Query or type(stmt) is sqllogictest.statement.statement.Statement
):
# only handle query and statement nodes for now
continue
if type(stmt) is sqllogictest.statement.statement.Statement:
# skip expected errors
if stmt.expected_result.type == sqllogictest.ExpectedResult.Type.ERROR:
continue
query = ' '.join(stmt.lines)
try:
sql_stmt_list = duckdb.extract_statements(query)
except KeyboardInterrupt:
raise
except:
continue
for sql_stmt in sql_stmt_list:
if sql_stmt.type == duckdb.StatementType.SELECT:
query_statements.append(query)
elif sql_stmt.type == duckdb.StatementType.PRAGMA:
continue
else:
load_statements.append(query)
return {'load': load_statements, 'query': query_statements}
def build_sources(old_source, new_source):
# generate the sources
current_path = os.getcwd()
os.chdir(old_source)
# build if not yet built
if not os.path.isfile(unittest_binary):
res = subprocess.run(['make', 'debug']).returncode
if res != 0:
raise Exception("Failed to build old sources")
# run the verification
os.chdir(current_path)
os.chdir(new_source)
# build if not yet built
if not os.path.isfile(unittest_binary):
res = subprocess.run(['make', 'debug']).returncode
if res != 0:
raise Exception("Failed to build new sources")
os.chdir(current_path)
def run_test(filename, old_source, new_source, no_exit):
statements = parse_test_file(filename)
# generate the sources
current_path = os.getcwd()
os.chdir(old_source)
# write the files
with open(os.path.join(old_source, db_load_path), 'w+') as f:
for stmt in statements['load']:
f.write(complete_query(stmt) + '\n')
with open(os.path.join(old_source, queries_path), 'w+') as f:
for stmt in statements['query']:
f.write(complete_query(stmt) + '\n')
# generate the serialization
my_env = os.environ.copy()
my_env['GEN_PLAN_STORAGE'] = '1'
res = subprocess.run(['build/debug/test/unittest', 'Generate serialized plans file'], env=my_env).returncode
if res != 0:
print(f"SKIPPING TEST {filename}")
return True
os.chdir(current_path)
# copy over the files
for f in [db_load_path, queries_path, result_binary]:
shutil.copy(os.path.join(old_source, f), os.path.join(new_source, f))
# run the verification
os.chdir(new_source)
res = subprocess.run(['build/debug/test/unittest', "Test deserialized plans from file"]).returncode
if res != 0:
if no_exit:
print("BROKEN TEST")
with open('broken_tests.list', 'a') as f:
f.write(filename + '\n')
return False
raise Exception("Deserialization failure")
os.chdir(current_path)
return True
def parse_excluded_tests(path):
exclusion_list = {}
with open(path) as f:
for line in f:
if len(line.strip()) == 0 or line[0] == '#':
continue
exclusion_list[line.strip()] = True
return exclusion_list
def find_tests_recursive(dir, excluded_paths):
test_list = []
for f in os.listdir(dir):
path = os.path.join(dir, f)
if path in excluded_paths:
continue
if os.path.isdir(path):
test_list += find_tests_recursive(path, excluded_paths)
elif path.endswith('.test'):
test_list.append(path)
return test_list
def main():
parser = argparse.ArgumentParser(description="Test serialization")
parser.add_argument("--new-source", type=str, help="Path to the new source", default='.')
parser.add_argument("--old-source", type=str, help="Path to the old source")
parser.add_argument("--start-at", type=str, help="Start running tests at this specific test", default=None)
parser.add_argument("--no-exit", action="store_true", help="Keep running even if a test fails", default=False)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--test-file", type=str, help="Path to the SQL logic file", default='')
group.add_argument("--all-tests", action='store_true', help="Run all tests", default=False)
group.add_argument("--test-list", type=str, help="Load tests to run from a file list", default=None)
args = parser.parse_args()
old_source = args.old_source
new_source = args.new_source
files = []
if args.all_tests:
# run all tests
excluded_tests = parse_excluded_tests(
os.path.join(new_source, 'test', 'api', 'serialized_plans', 'excluded_tests.list')
)
test_dir = os.path.join('test', 'sql')
if new_source != '.':
test_dir = os.path.join(new_source, test_dir)
files = find_tests_recursive(test_dir, excluded_tests)
elif args.test_list is not None:
with open(args.test_list, 'r') as f:
for line in f:
if len(line.strip()) == 0:
continue
files.append(line.strip())
else:
# run a single test
files.append(args.test_file)
files.sort()
current_path = os.getcwd()
try:
build_sources(old_source, new_source)
all_succeeded = True
started = False
if args.start_at is None:
started = True
for filename in files:
if not started:
if filename == args.start_at:
started = True
else:
continue
print(f"Run test {filename}")
os.chdir(current_path)
if not run_test(filename, old_source, new_source, args.no_exit):
all_succeeded = False
if not all_succeeded:
exit(1)
except:
raise
finally:
os.chdir(current_path)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,229 @@
import argparse
import os
import subprocess
import re
import csv
from pathlib import Path
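# Checks storage backwards compatibility: each test is run with the new unittest binary writing to --db-name,
# and the resulting database file is then opened and queried with the older DuckDB CLIs given via --old-cli or --versions.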
parser = argparse.ArgumentParser(description='Run storage backwards-compatibility tests: write databases with the new build and read them back with older DuckDB CLIs.')
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--old-cli', action='store', help='Path to the CLI of the old DuckDB version to test')
group.add_argument('--versions', type=str, action='store', help='DuckDB versions to test')
parser.add_argument('--new-unittest', action='store', help='Path to the new unittester to run', required=True)
parser.add_argument('--new-cli', action='store', help='Path to the new CLI to run', default=None)
parser.add_argument('--compatibility', action='store', help='Storage compatibility version', default='v1.0.0')
parser.add_argument(
'--test-config', action='store', help='Test config script to run', default='test/configs/storage_compatibility.json'
)
parser.add_argument('--db-name', action='store', help='Database name to write to', default='bwc_storage_test.db')
parser.add_argument('--abort-on-failure', action='store_true', help='Abort on first failure', default=False)
parser.add_argument('--start-offset', type=int, action='store', help='Test start offset', default=None)
parser.add_argument('--end-offset', type=int, action='store', help='Test end offset', default=None)
parser.add_argument('--no-summarize-failures', action='store_true', help='Skip failure summary', default=False)
parser.add_argument('--list-versions', action='store_true', help='Only list versions to test', default=False)
parser.add_argument(
'--run-empty-tests',
action='store_true',
help="Run tests that don't have a CREATE TABLE or CREATE VIEW statement",
default=False,
)
args, extra_args = parser.parse_known_args()
programs_to_test = []
if args.versions is not None:
version_splits = args.versions.split('|')
for version in version_splits:
cli_path = os.path.join(Path.home(), '.duckdb', 'cli', version, 'duckdb')
if not os.path.isfile(cli_path):
os.system(f'curl https://install.duckdb.org | DUCKDB_VERSION={version} sh')
programs_to_test.append(cli_path)
else:
programs_to_test.append(args.old_cli)
unittest_program = args.new_unittest
db_name = args.db_name
new_cli = args.new_unittest.replace('test/unittest', 'duckdb') if args.new_cli is None else args.new_cli
summarize_failures = not args.no_summarize_failures
# Use the '-l' parameter to output the list of tests to run
proc = subprocess.run(
[unittest_program, '--test-config', args.test_config, '-l'] + extra_args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout = proc.stdout.decode('utf8').strip()
stderr = proc.stderr.decode('utf8').strip()
if len(stderr) > 0:
print("Failed to run program " + unittest_program)
print("Returncode:", proc.returncode)
print(stdout)
print(stderr)
exit(1)
# The output is in the format of 'PATH\tGROUP', we're only interested in the PATH portion
test_cases = []
first_line = True
for line in stdout.splitlines():
if first_line:
first_line = False
continue
if len(line.strip()) == 0:
continue
splits = line.rsplit('\t', 1)
test_cases.append(splits[0])
test_cases.sort()
if args.compatibility != 'v1.0.0':
raise Exception("Only v1.0.0 is supported for now (FIXME)")
def escape_cmd_arg(arg):
if '"' in arg or '\'' in arg or ' ' in arg or '\\' in arg:
arg = arg.replace('\\', '\\\\')
arg = arg.replace('"', '\\"')
arg = arg.replace("'", "\\'")
return f'"{arg}"'
return arg
error_container = []
def handle_failure(test, cmd, msg, stdout, stderr, returncode):
print(f"==============FAILURE============")
print(test)
print(f"==============MESSAGE============")
print(msg)
print(f"==============COMMAND============")
cmd_str = ''
for entry in cmd:
cmd_str += escape_cmd_arg(entry) + ' '
print(cmd_str.strip())
print(f"==============RETURNCODE=========")
print(str(returncode))
print(f"==============STDOUT=============")
print(stdout)
print(f"==============STDERR=============")
print(stderr)
print(f"=================================")
if args.abort_on_failure:
exit(1)
else:
error_container.append({'test': test, 'stderr': stderr})
def run_program(cmd, description):
proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout = proc.stdout.decode('utf8').strip()
stderr = proc.stderr.decode('utf8').strip()
if proc.returncode != 0:
return {
'test': test,
'cmd': cmd,
'msg': f'Failed to {description}',
'stdout': stdout,
'stderr': stderr,
'returncode': proc.returncode,
}
return None
def try_run_program(cmd, description):
result = run_program(cmd, description)
if result is None:
return True
handle_failure(**result)
return False
index = 0
start = 0 if args.start_offset is None else args.start_offset
end = len(test_cases) if args.end_offset is None else args.end_offset
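# Per-test flow: run the test with the new unittest binary, list the tables/views in the produced database with
# SHOW ALL TABLES, then query every table with each old CLI. A failure is only reported when the new CLI can still
# query the same tables, since old CLIs may legitimately fail on e.g. views that reference deleted files.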
for i in range(start, end):
test = test_cases[i]
skipped = ''
if not args.run_empty_tests:
with open(test, 'r') as f:
test_contents = f.read().lower()
if 'create table' not in test_contents and 'create view' not in test_contents:
skipped = ' (SKIPPED)'
print(f'[{i}/{len(test_cases)}]: {test}{skipped}')
if skipped != '':
continue
# remove the old db
try:
os.remove(db_name)
except:
pass
cmd = [unittest_program, '--test-config', args.test_config, test]
if not try_run_program(cmd, 'Run Test'):
continue
if not os.path.isfile(db_name):
# db not created
continue
cmd = [
programs_to_test[-1],
db_name,
'-c',
'.headers off',
'-csv',
'-c',
'.output table_list.csv',
'-c',
'SHOW ALL TABLES',
]
if not try_run_program(cmd, 'List Tables'):
continue
tables = []
with open('table_list.csv', newline='') as f:
reader = csv.reader(f)
for row in reader:
tables.append((row[1], row[2]))
# no tables / views
if len(tables) == 0:
continue
# read all tables / views
failures = []
for cli in programs_to_test:
cmd = [cli, db_name]
for table in tables:
schema_name = table[0].replace('"', '""')
table_name = table[1].replace('"', '""')
cmd += ['-c', f'FROM "{schema_name}"."{table_name}"']
failure = run_program(cmd, 'Query Tables')
if failure is not None:
failures.append(failure)
if len(failures) > 0:
# we failed to query the tables
# this MIGHT be expected - e.g. we might have views that reference stale state (e.g. files that are deleted)
# try to run it with the new CLI - if this succeeds we have a problem
new_cmd = [new_cli] + cmd[1:]
new_failure = run_program(new_cmd, 'Query Tables (New)')
if new_failure is None:
# we succeeded with the new CLI - report the failure
for failure in failures:
handle_failure(**failure)
continue
if len(error_container) == 0:
exit(0)
if summarize_failures:
print(
'''\n\n====================================================
================ FAILURES SUMMARY ================
====================================================\n
'''
)
for i, error in enumerate(error_container, start=1):
print(f"\n{i}:", error["test"], "\n")
print(error["stderr"])
exit(1)

View File

@@ -0,0 +1,162 @@
import os
import argparse
import subprocess
import shutil
parser = argparse.ArgumentParser(
description='''Runs storage tests once with explicit one-initialization and once with explicit zero-initialization, and verifies that the resulting storage files are identical.
This verifies that all memory is initialized before it is written to disk, which prevents in-memory data from leaking into storage files through uninitialized memory.'''
)
parser.add_argument('--unittest', default='build/debug/test/unittest', help='path to unittest', dest='unittest')
parser.add_argument(
'--zero_init_dir',
default='test_zero_init_db',
help='directory to write zero-initialized databases to',
dest='zero_init_dir',
)
parser.add_argument(
'--standard_dir', default='test_standard_db', help='directory to write regular databases to', dest='standard_dir'
)
args = parser.parse_args()
test_list = [
'test/sql/index/art/storage/test_art_checkpoint.test',
'test/sql/storage/compression/simple_compression.test',
'test/sql/storage/delete/test_store_deletes.test',
'test/sql/storage/mix/test_update_delete_string.test',
'test/sql/storage/nested/struct_of_lists_unaligned.test',
'test/sql/storage/test_store_integers.test',
'test/sql/storage/test_store_nulls_strings.test',
'test/sql/storage/update/test_store_null_updates.test',
]
def run_test(args):
res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout = res.stdout.decode('utf8').strip()
stderr = res.stderr.decode('utf8').strip()
if res.returncode != 0:
print("Failed to run test!")
print("----------COMMAND-----------")
print(' '.join(args))
print("----------STDOUT-----------")
print(stdout)
print("----------STDERR-----------")
print(stderr)
print("---------------------")
exit(1)
header_size = 4096 * 3
block_size = 262144
checksum_size = 8
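# These constants are assumed to match DuckDB's default storage layout: three 4 KiB headers at the start of the
# file, followed by 256 KiB blocks that each begin with an 8-byte checksum. Adjust them if the block size differs.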
def handle_error(i, standard_db, zero_init_db, standard_data, zero_data):
print("------------------------------------------------------------------")
print(f"FAIL - Mismatch between one-initialized and zero-initialized databases at byte position {i}")
print("------------------------------------------------------------------")
print(f"One-initialized database {standard_db} - byte value {standard_data}")
print(f"Zero-initialized database {zero_init_db} - byte value {zero_data}")
if i < header_size:
print("This byte is in the initial headers of the file")
else:
byte_pos = (i - header_size) % block_size
if byte_pos >= checksum_size:
print(
f"This byte is in block id {(i - header_size) // block_size} at byte position {byte_pos - checksum_size} (position {byte_pos} including the block checksum)"
)
else:
print(f"This byte is in block id {(i - header_size) // block_size} at byte position {byte_pos}")
print("This is in the checksum part of the block")
print("------------------------------------------------------------------")
print(
"This error likely means that memory was not correctly zero-initialized in a block before being written out to disk."
)
def compare_database(standard_db, zero_init_db):
with open(standard_db, 'rb') as f:
standard_data = f.read()
with open(zero_init_db, 'rb') as f:
zero_data = f.read()
if len(standard_data) != len(zero_data):
print(
f"FAIL - Length mismatch between database {standard_db} ({str(len(standard_data))}) and {zero_init_db} ({str(len(zero_data))})"
)
return False
found_error = None
for i in range(len(standard_data)):
if standard_data[i] != zero_data[i]:
if i > header_size:
byte_pos = (i - header_size) % block_size
if byte_pos <= 8:
# different checksum, skip because it does not tell us anything!
if found_error is None:
found_error = i
continue
handle_error(i, standard_db, zero_init_db, standard_data[i], zero_data[i])
return False
if found_error is not None:
i = found_error
handle_error(i, standard_db, zero_init_db, standard_data[i], zero_data[i])
return False
print("Success!")
return True
def compare_files(standard_dir, zero_init_dir):
standard_list = os.listdir(standard_dir)
zero_init_list = os.listdir(zero_init_dir)
standard_list.sort()
zero_init_list.sort()
if standard_list != zero_init_list:
print(
f"FAIL - Directories contain mismatching files (standard - {str(standard_list)}, zero init - {str(zero_init_list)})"
)
return False
if len(standard_list) == 0:
print("FAIL - Directory is empty!")
return False
success = True
for entry in standard_list:
if not compare_database(os.path.join(standard_dir, entry), os.path.join(zero_init_dir, entry)):
success = False
return success
def clear_directories(directories):
for dir in directories:
try:
shutil.rmtree(dir)
except FileNotFoundError as e:
pass
test_dirs = [args.standard_dir, args.zero_init_dir]
success = True
for test in test_list:
print(f"Running test {test}")
clear_directories(test_dirs)
standard_args = [args.unittest, '--test-temp-dir', args.standard_dir, '--one-initialize', '--single-threaded', test]
zero_init_args = [
args.unittest,
'--test-temp-dir',
args.zero_init_dir,
'--zero-initialize',
'--single-threaded',
test,
]
print(f"Running test in one-initialize mode")
run_test(standard_args)
print(f"Running test in zero-initialize mode")
run_test(zero_init_args)
if not compare_files(args.standard_dir, args.zero_init_dir):
success = False
clear_directories(test_dirs)
if not success:
exit(1)

48
external/duckdb/scripts/try_timeout.py vendored Normal file
View File

@@ -0,0 +1,48 @@
import os
import sys
import subprocess
import threading
if len(sys.argv) < 3:
print("Expected python3 scripts/try_timeout.py --timeout=[timeout] --retry=[retries] [cmd] [options...]")
print("Timeout should be given in seconds")
exit(1)
timeout = int(sys.argv[1].replace("--timeout=", ""))
retries = int(sys.argv[2].replace("--retry=", ""))
cmd = sys.argv[3:]
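# Command runs the given command in a background thread and terminates the process once the timeout expires;
# a non-zero return code (including a timeout) leads to another attempt, up to the requested number of retries.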
class Command(object):
def __init__(self, cmd):
self.cmd = cmd
self.process = None
def run(self, timeout):
self.process = None
def target():
self.process = subprocess.Popen(self.cmd)
self.process.communicate()
thread = threading.Thread(target=target)
thread.start()
thread.join(timeout)
if thread.is_alive():
print('Terminating process: process exceeded timeout of ' + str(timeout) + ' seconds')
self.process.terminate()
thread.join()
if self.process is None:
return 1
return self.process.returncode
for i in range(retries):
print("Attempting to run command \"" + ' '.join(cmd) + '"')
command = Command(cmd)
returncode = command.run(timeout)
if returncode == 0:
exit(0)
exit(1)

View File

@@ -0,0 +1,64 @@
#!/bin/bash
# Main extension uploading script
# Usage: ./scripts/upload-staging-asset.sh <folder> <file>*
# <folder> : Folder to upload to
# <file> : File to be uploaded
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Usage: ./scripts/upload-staging-asset.sh <folder> <file1> [... <fileN>]"
exit 1
fi
set -e
# skip if repo is not in duckdb organization
if [ "$GITHUB_REPOSITORY_OWNER" != "duckdb" ]; then
echo "Repository is $GITHUB_REPOSITORY_OWNER (not duckdb)"
exit 0
fi
FOLDER="$1"
DRY_RUN_PARAM=""
# dryrun if repo is not duckdb/duckdb
if [ "$GITHUB_REPOSITORY" != "duckdb/duckdb" ]; then
echo "Repository is $GITHUB_REPOSITORY (not duckdb/duckdb)"
DRY_RUN_PARAM="--dryrun"
fi
# dryrun if we are not in main
if [ "$GITHUB_REF" != "refs/heads/main" ]; then
echo "git ref is $GITHUB_REF (not refs/heads/main)"
DRY_RUN_PARAM="--dryrun"
fi
if [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then
echo "overriding DRY_RUN_PARAM, forcing upload"
DRY_RUN_PARAM=""
fi
# dryrun if AWS key is not set
if [ -z "$AWS_ACCESS_KEY_ID" ]; then
echo "No access key available"
DRY_RUN_PARAM="--dryrun"
fi
TARGET=$(git log -1 --format=%h)
if [ "$UPLOAD_ASSETS_TO_STAGING_TARGET" ]; then
TARGET="$UPLOAD_ASSETS_TO_STAGING_TARGET"
fi
# decide target for staging
if [ "$OVERRIDE_GIT_DESCRIBE" ]; then
TARGET="$TARGET/$OVERRIDE_GIT_DESCRIBE"
fi
python3 -m pip install awscli
for var in "${@: 2}"
do
aws s3 cp "$var" "s3://duckdb-staging/$TARGET/$GITHUB_REPOSITORY/$FOLDER/" $DRY_RUN_PARAM --region us-east-2
done

View File

@@ -0,0 +1,62 @@
from cxxheaderparser.parser import CxxParser, ParserOptions
from cxxheaderparser.visitor import CxxVisitor
from cxxheaderparser.preprocessor import make_pcpp_preprocessor
from cxxheaderparser.parserstate import NamespaceBlockState
from cxxheaderparser.types import EnumDecl
import textwrap
import os
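# Visitor is invoked by cxxheaderparser for every enum declaration (all other visitor callbacks are no-ops via
# __getattr__); it rejects anonymous enums, constants without an explicit value, and duplicate values within an enum.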
class Visitor:
def on_enum(self, state: NamespaceBlockState, cursor: EnumDecl) -> None:
enum_name = cursor.typename.segments[0].format()
if '<' in enum_name:
raise Exception(
"Enum '{}' is an anonymous enum, please name it\n".format(cursor.doxygen[3:] if cursor.doxygen else '')
)
enum_constants = dict()
for enum_const in cursor.values:
name = enum_const.name.format()
if enum_const.value is None:
raise Exception(f"Enum constant '{name}' in '{enum_name}' does not have an explicit value assignment.")
value = enum_const.value.format()
if value in enum_constants:
other_constant = enum_constants[value]
error = f"""
Enum '{enum_name}' contains a duplicate value:
Value {value} is defined for both '{other_constant}' and '{name}'
"""
error = textwrap.dedent(error)
raise Exception(error)
enum_constants[value] = name
print(f"Successfully verified the integrity of enum {enum_name} ({len(enum_constants)} entries)")
def __getattr__(self, name):
return lambda *args, **kwargs: True
def parse_enum(file_path):
# Create index
parser = CxxParser(
file_path,
None,
visitor=Visitor(),
options=ParserOptions(preprocessor=make_pcpp_preprocessor()),
)
parser.parse()
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Parse a C header file and check enum integrity.")
parser.add_argument("file_path", type=str, help="Path to the C header file")
args = parser.parse_args()
file_path = args.file_path
if not os.path.exists(file_path):
raise Exception(f"Error: file '{file_path}' does not exist")
parse_enum(file_path)

21
external/duckdb/scripts/windows_ci.py vendored Normal file
View File

@@ -0,0 +1,21 @@
import os
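# Patches duckdb/common/common.hpp for the Windows CI build so that every file in the main library pulls in
# duckdb/common/windows.hpp right after '#pragma once'.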
common_path = os.path.join('src', 'include', 'duckdb', 'common', 'common.hpp')
with open(common_path, 'r') as f:
text = f.read()
text = text.replace(
'#pragma once',
'''#pragma once
#ifdef _WIN32
#ifdef DUCKDB_MAIN_LIBRARY
#include "duckdb/common/windows.hpp"
#endif
#endif
''',
)
with open(common_path, 'w+') as f:
f.write(text)