should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion

external/duckdb/scripts/amalgamation.py vendored Normal file

@@ -0,0 +1,608 @@
# this script creates a single header + source file combination out of the DuckDB sources
import os
import re
import sys
import shutil
import subprocess
from python_helpers import open_utf8, normalize_path
amal_dir = os.path.join('src', 'amalgamation')
header_file = os.path.join(amal_dir, "duckdb.hpp")
source_file = os.path.join(amal_dir, "duckdb.cpp")
temp_header = 'duckdb.hpp.tmp'
temp_source = 'duckdb.cpp.tmp'
skip_duckdb_includes = False
src_dir = 'src'
include_dir = os.path.join('src', 'include')
# files included in the amalgamated "duckdb.hpp" file
main_header_files = [
os.path.join(include_dir, 'duckdb.hpp'),
os.path.join(include_dir, 'duckdb.h'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'date.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'adbc', 'adbc.h'),
os.path.join(include_dir, 'duckdb', 'common', 'adbc', 'adbc.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'arrow', 'arrow.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'arrow', 'arrow_converter.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'arrow', 'arrow_wrapper.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'blob.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'decimal.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'hugeint.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'uhugeint.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'uuid.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'interval.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'timestamp.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'time.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'buffered_file_writer.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'memory_stream.hpp'),
os.path.join(include_dir, 'duckdb', 'main', 'appender.hpp'),
os.path.join(include_dir, 'duckdb', 'main', 'client_context.hpp'),
os.path.join(include_dir, 'duckdb', 'main', 'extension', 'extension_loader.hpp'),
os.path.join(include_dir, 'duckdb', 'function', 'function.hpp'),
os.path.join(include_dir, 'duckdb', 'function', 'table_function.hpp'),
os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_table_function_info.hpp'),
os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_copy_function_info.hpp'),
]
extended_amalgamation = False
if '--extended' in sys.argv:
def add_include_dir(dirpath):
return [os.path.join(dirpath, x) for x in os.listdir(dirpath)]
extended_amalgamation = True
main_header_files += [
os.path.join(include_dir, x)
for x in [
'duckdb/planner/expression/bound_constant_expression.hpp',
'duckdb/planner/expression/bound_function_expression.hpp',
'duckdb/catalog/catalog_entry/scalar_function_catalog_entry.hpp',
'duckdb/parser/parsed_data/create_table_info.hpp',
'duckdb/planner/parsed_data/bound_create_table_info.hpp',
'duckdb/parser/constraints/not_null_constraint.hpp',
'duckdb/storage/data_table.hpp',
'duckdb/function/pragma_function.hpp',
'duckdb/parser/qualified_name.hpp',
'duckdb/parser/parser.hpp',
'duckdb/planner/binder.hpp',
'duckdb/storage/object_cache.hpp',
'duckdb/planner/table_filter.hpp',
"duckdb/storage/statistics/base_statistics.hpp",
"duckdb/planner/filter/conjunction_filter.hpp",
"duckdb/planner/filter/constant_filter.hpp",
"duckdb/common/types/vector_cache.hpp",
"duckdb/common/string_map_set.hpp",
"duckdb/planner/filter/null_filter.hpp",
"duckdb/common/arrow/arrow_wrapper.hpp",
"duckdb/common/hive_partitioning.hpp",
"duckdb/common/multi_file/union_by_name.hpp",
"duckdb/planner/operator/logical_get.hpp",
"duckdb/common/compressed_file_system.hpp",
]
]
main_header_files += add_include_dir(os.path.join(include_dir, 'duckdb/parser/expression'))
main_header_files += add_include_dir(os.path.join(include_dir, 'duckdb/parser/parsed_data'))
main_header_files += add_include_dir(os.path.join(include_dir, 'duckdb/parser/tableref'))
main_header_files = normalize_path(main_header_files)
import package_build
# include paths for where to search for include files during amalgamation
include_paths = [include_dir] + package_build.third_party_includes()
# paths of where to look for files to compile and include to the final amalgamation
compile_directories = [src_dir] + package_build.third_party_sources()
# files always excluded
always_excluded = normalize_path(
[
'src/amalgamation/duckdb.cpp',
'src/amalgamation/duckdb.hpp',
'src/amalgamation/parquet-amalgamation.cpp',
'src/amalgamation/parquet-amalgamation.hpp',
]
)
# files excluded from the amalgamation
excluded_files = ['grammar.cpp', 'grammar.hpp', 'symbols.cpp']
# files excluded from individual file compilation during test_compile
excluded_compilation_files = excluded_files + ['gram.hpp', 'kwlist.hpp', "duckdb-c.cpp"]
linenumbers = False
def get_includes(fpath, text):
# find all the includes referred to in the directory
regex_include_statements = re.findall("(^[\t ]*[#][\t ]*include[\t ]+[\"]([^\"]+)[\"])", text, flags=re.MULTILINE)
include_statements = []
include_files = []
# figure out where they are located
for x in regex_include_statements:
included_file = x[1]
if skip_duckdb_includes and 'duckdb' in included_file:
continue
if (
'extension_helper.cpp' in fpath
and (included_file.endswith('_extension.hpp'))
or included_file == 'generated_extension_loader.hpp'
or included_file == 'generated_extension_headers.hpp'
):
continue
if 'allocator.cpp' in fpath and included_file.endswith('jemalloc_extension.hpp'):
continue
if x[0] in include_statements:
raise Exception(f"duplicate include {x[0]} in file {fpath}")
include_statements.append(x[0])
included_file = os.sep.join(included_file.split('/'))
found = False
for include_path in include_paths:
ipath = os.path.join(include_path, included_file)
if os.path.isfile(ipath):
include_files.append(ipath)
found = True
break
if not found:
raise Exception('Could not find include file "' + included_file + '", included from file "' + fpath + '"')
return (include_statements, include_files)
def cleanup_file(text):
# remove all "#pragma once" notifications
text = re.sub('#pragma once', '', text)
return text
# recursively get all includes and write them
written_files = {}
# licenses
licenses = []
def need_to_write_file(current_file, ignore_excluded=False):
if amal_dir in current_file:
return False
if current_file in always_excluded:
return False
if current_file.split(os.sep)[-1] in excluded_files and not ignore_excluded:
# file is in ignored files set
return False
if current_file in written_files:
# file is already written
return False
return True
def find_license(original_file):
global licenses
file = original_file
license = ""
while True:
(file, end) = os.path.split(file)
if file == "":
break
potential_license = os.path.join(file, "LICENSE")
if os.path.exists(potential_license):
license = potential_license
if license == "":
raise "Could not find license for %s" % original_file
if license not in licenses:
licenses += [license]
return licenses.index(license)
def write_file(current_file, ignore_excluded=False):
global linenumbers
global written_files
if not need_to_write_file(current_file, ignore_excluded):
return ""
written_files[current_file] = True
# first read this file
with open_utf8(current_file, 'r') as f:
text = f.read()
if current_file.startswith("third_party") and not current_file.endswith("LICENSE"):
lic_idx = find_license(current_file)
text = (
"\n\n// LICENSE_CHANGE_BEGIN\n// The following code up to LICENSE_CHANGE_END is subject to THIRD PARTY LICENSE #%s\n// See the end of this file for a list\n\n"
% str(lic_idx + 1)
+ text
+ "\n\n// LICENSE_CHANGE_END\n"
)
(statements, includes) = get_includes(current_file, text)
# find the linenr of the final #include statement we parsed
if len(statements) > 0:
index = text.find(statements[-1])
linenr = len(text[:index].split('\n'))
# now write all the dependencies of this header first
for i in range(len(includes)):
include_text = write_file(includes[i])
if linenumbers and i == len(includes) - 1:
# for the last include statement, we also include a #line directive
include_text += '\n#line %d "%s"\n' % (linenr, current_file)
text = text.replace(statements[i], include_text)
# add the initial line here
if linenumbers:
text = '\n#line 1 "%s"\n' % (current_file,) + text
# print(current_file)
# now read the header and write it
return cleanup_file(text)
def write_dir(dir):
files = os.listdir(dir)
files.sort()
text = ""
for fname in files:
if fname in excluded_files:
continue
# print(fname)
fpath = os.path.join(dir, fname)
if os.path.isdir(fpath):
text += write_dir(fpath)
elif fname.endswith('.cpp') or fname.endswith('.c') or fname.endswith('.cc'):
text += write_file(fpath)
return text
def copy_if_different(src, dest):
if os.path.isfile(dest):
# dest exists, check if the files are different
with open_utf8(src, 'r') as f:
source_text = f.read()
with open_utf8(dest, 'r') as f:
dest_text = f.read()
if source_text == dest_text:
# print("Skipping copy of " + src + ", identical copy already exists at " + dest)
return
# print("Copying " + src + " to " + dest)
shutil.copyfile(src, dest)
def git_commit_hash():
git_describe = package_build.get_git_describe()
hash = git_describe.split('-')[2].lstrip('g')
return hash
######
# MAIN_BRANCH_VERSIONING default should be 'True' for main branch and feature branches
# MAIN_BRANCH_VERSIONING default should be 'False' for release branches
# MAIN_BRANCH_VERSIONING default value needs to keep in sync between:
# - CMakeLists.txt
# - scripts/amalgamation.py
# - scripts/package_build.py
######
MAIN_BRANCH_VERSIONING = True
if os.getenv('MAIN_BRANCH_VERSIONING') == "0":
MAIN_BRANCH_VERSIONING = False
if os.getenv('MAIN_BRANCH_VERSIONING') == "1":
MAIN_BRANCH_VERSIONING = True
def git_dev_version():
try:
long_version = package_build.get_git_describe()
version_splits = long_version.split('-')[0].lstrip('v').split('.')
dev_version = long_version.split('-')[1]
if int(dev_version) == 0:
# directly on a tag: emit the regular version
return "v" + '.'.join(version_splits)
else:
# not on a tag: increment the version by one and add a -devX suffix
# this needs to keep in sync with changes to CMakeLists.txt
if MAIN_BRANCH_VERSIONING == True:
# increment minor version
version_splits[1] = str(int(version_splits[1]) + 1)
else:
# increment patch version
version_splits[2] = str(int(version_splits[2]) + 1)
return "v" + '.'.join(version_splits) + "-dev" + dev_version
except:
return "v0.0.0"
def generate_duckdb_hpp(header_file):
print("-----------------------")
print("-- Writing " + header_file + " --")
print("-----------------------")
with open_utf8(temp_header, 'w+') as hfile:
hfile.write("/*\n")
hfile.write(write_file("LICENSE"))
hfile.write("*/\n\n")
hfile.write("#pragma once\n")
hfile.write("#define DUCKDB_AMALGAMATION 1\n")
if extended_amalgamation:
hfile.write("#define DUCKDB_AMALGAMATION_EXTENDED 1\n")
hfile.write("#define DUCKDB_SOURCE_ID \"%s\"\n" % git_commit_hash())
dev_version = git_dev_version()
dev_v_parts = dev_version.lstrip('v').split('.')
hfile.write("#define DUCKDB_VERSION \"%s\"\n" % dev_version)
hfile.write("#define DUCKDB_MAJOR_VERSION %d\n" % int(dev_v_parts[0]))
hfile.write("#define DUCKDB_MINOR_VERSION %d\n" % int(dev_v_parts[1]))
hfile.write("#define DUCKDB_PATCH_VERSION \"%s\"\n" % dev_v_parts[2])
for fpath in main_header_files:
hfile.write(write_file(fpath))
def generate_amalgamation(source_file, header_file):
# construct duckdb.hpp from these headers
generate_duckdb_hpp(header_file)
# now construct duckdb.cpp
print("------------------------")
print("-- Writing " + source_file + " --")
print("------------------------")
# scan all the .cpp files
with open_utf8(temp_source, 'w+') as sfile:
header_file_name = header_file.split(os.sep)[-1]
sfile.write('#include "' + header_file_name + '"\n\n')
sfile.write("#ifndef DUCKDB_AMALGAMATION\n#error header mismatch\n#endif\n\n")
sfile.write("#if (!defined(DEBUG) && !defined NDEBUG)\n#define NDEBUG\n#endif\n\n")
for compile_dir in compile_directories:
sfile.write(write_dir(compile_dir))
sfile.write('\n\n/*\n')
license_idx = 0
for license in licenses:
sfile.write("\n\n\n### THIRD PARTY LICENSE #%s ###\n\n" % str(license_idx + 1))
sfile.write(write_file(license))
license_idx += 1
sfile.write('\n\n*/\n')
copy_if_different(temp_header, header_file)
copy_if_different(temp_source, source_file)
try:
os.remove(temp_header)
os.remove(temp_source)
except:
pass
def list_files(dname, file_list):
files = os.listdir(dname)
files.sort()
for fname in files:
if fname in excluded_files:
continue
fpath = os.path.join(dname, fname)
if os.path.isdir(fpath):
list_files(fpath, file_list)
elif fname.endswith(('.cpp', '.c', '.cc')):
if need_to_write_file(fpath):
file_list.append(fpath)
def list_sources():
file_list = []
for compile_dir in compile_directories:
list_files(compile_dir, file_list)
return file_list
def list_include_files_recursive(dname, file_list):
files = os.listdir(dname)
files.sort()
for fname in files:
if fname in excluded_files:
continue
fpath = os.path.join(dname, fname)
if os.path.isdir(fpath):
list_include_files_recursive(fpath, file_list)
elif fname.endswith(('.hpp', '.ipp', '.h', '.hh', '.tcc', '.inc')):
file_list.append(fpath)
def list_includes_files(include_dirs):
file_list = []
for include_dir in include_dirs:
list_include_files_recursive(include_dir, file_list)
return file_list
def list_includes():
return list_includes_files(include_paths)
def gather_file(current_file, source_files, header_files):
global linenumbers
global written_files
if not need_to_write_file(current_file, False):
return ""
written_files[current_file] = True
# first read this file
with open_utf8(current_file, 'r') as f:
text = f.read()
(statements, includes) = get_includes(current_file, text)
# find the linenr of the final #include statement we parsed
if len(statements) > 0:
index = text.find(statements[-1])
linenr = len(text[:index].split('\n'))
# now write all the dependencies of this header first
for i in range(len(includes)):
# source file inclusions are inlined into the main text
include_text = write_file(includes[i])
if linenumbers and i == len(includes) - 1:
# for the last include statement, we also include a #line directive
include_text += '\n#line %d "%s"\n' % (linenr, current_file)
if includes[i].endswith('.cpp') or includes[i].endswith('.cc') or includes[i].endswith('.c'):
# source file inclusions are inlined into the main text
text = text.replace(statements[i], include_text)
else:
text = text.replace(statements[i], '')
header_files.append(include_text)
# add the initial line here
if linenumbers:
text = '\n#line 1 "%s"\n' % (current_file,) + text
source_files.append(cleanup_file(text))
def gather_files(dir, source_files, header_files):
files = os.listdir(dir)
files.sort()
for fname in files:
if fname in excluded_files:
continue
fpath = os.path.join(dir, fname)
if os.path.isdir(fpath):
gather_files(fpath, source_files, header_files)
elif fname.endswith('.cpp') or fname.endswith('.c') or fname.endswith('.cc'):
gather_file(fpath, source_files, header_files)
def write_license(hfile):
hfile.write("// See https://raw.githubusercontent.com/duckdb/duckdb/main/LICENSE for licensing information\n\n")
def generate_amalgamation_splits(source_file, header_file, nsplits):
# construct duckdb.hpp from these headers
generate_duckdb_hpp(header_file)
# gather all files to read and write
source_files = []
header_files = []
for compile_dir in compile_directories:
if compile_dir != src_dir:
continue
gather_files(compile_dir, source_files, header_files)
# write duckdb-internal.hpp
if '.hpp' in header_file:
internal_header_file = header_file.replace('.hpp', '-internal.hpp')
elif '.h' in header_file:
internal_header_file = header_file.replace('.h', '-internal.h')
else:
raise "Unknown extension of header file"
temp_internal_header = internal_header_file + '.tmp'
with open_utf8(temp_internal_header, 'w+') as f:
write_license(f)
for hfile in header_files:
f.write(hfile)
# count the total amount of bytes in the source files
total_bytes = 0
for sfile in source_files:
total_bytes += len(sfile)
# now write the individual splits
# we approximate the splitting up by making every file have roughly the same amount of bytes
split_bytes = total_bytes / nsplits
current_bytes = 0
partitions = []
partition_names = []
current_partition = []
current_partition_idx = 1
for sfile in source_files:
current_partition.append(sfile)
current_bytes += len(sfile)
if current_bytes >= split_bytes:
partition_names.append(str(current_partition_idx))
partitions.append(current_partition)
current_partition = []
current_bytes = 0
current_partition_idx += 1
if len(current_partition) > 0:
partition_names.append(str(current_partition_idx))
partitions.append(current_partition)
current_partition = []
current_bytes = 0
# generate partitions from the third party libraries
for compile_dir in compile_directories:
if compile_dir != src_dir:
partition_names.append(compile_dir.split(os.sep)[-1])
partitions.append(write_dir(compile_dir))
header_file_name = header_file.split(os.sep)[-1]
internal_header_file_name = internal_header_file.split(os.sep)[-1]
partition_fnames = []
current_partition = 0
for partition in partitions:
partition_name = source_file.replace('.cpp', '-%s.cpp' % (partition_names[current_partition],))
temp_partition_name = partition_name + '.tmp'
partition_fnames.append([partition_name, temp_partition_name])
with open_utf8(temp_partition_name, 'w+') as f:
write_license(f)
f.write('#include "%s"\n#include "%s"' % (header_file_name, internal_header_file_name))
f.write(
'''
#ifndef DUCKDB_AMALGAMATION
#error header mismatch
#endif
'''
)
for sfile in partition:
f.write(sfile)
current_partition += 1
copy_if_different(temp_header, header_file)
copy_if_different(temp_internal_header, internal_header_file)
try:
os.remove(temp_header)
os.remove(temp_internal_header)
except:
pass
for p in partition_fnames:
copy_if_different(p[1], p[0])
try:
os.remove(p[1])
except:
pass
def list_include_dirs():
return include_paths
if __name__ == "__main__":
nsplits = 1
for arg in sys.argv:
if arg == '--linenumbers':
linenumbers = True
elif arg == '--no-linenumbers':
linenumbers = False
elif arg.startswith('--header='):
header_file = os.path.join(*arg.split('=', 1)[1].split('/'))
elif arg.startswith('--source='):
source_file = os.path.join(*arg.split('=', 1)[1].split('/'))
elif arg.startswith('--splits='):
nsplits = int(arg.split('=', 1)[1])
elif arg.startswith('--list-sources'):
file_list = list_sources()
print('\n'.join(file_list))
exit(1)
elif arg.startswith('--list-objects'):
file_list = list_sources()
print(' '.join([x.rsplit('.', 1)[0] + '.o' for x in file_list]))
exit(1)
elif arg.startswith('--includes'):
include_dirs = list_include_dirs()
print(' '.join(['-I' + x for x in include_dirs]))
exit(1)
elif arg.startswith('--include-directories'):
include_dirs = list_include_dirs()
print('\n'.join(include_dirs))
exit(1)
if os.path.exists(amal_dir):
shutil.rmtree(amal_dir)
os.makedirs(amal_dir)
if nsplits > 1:
generate_amalgamation_splits(source_file, header_file, nsplits)
else:
generate_amalgamation(source_file, header_file)
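# Example invocations (argument values are illustrative; the flags are the ones parsed above):
#   python scripts/amalgamation.py --linenumbers
#   python scripts/amalgamation.py --extended --splits=8
#   python scripts/amalgamation.py --header=out/duckdb.hpp --source=out/duckdb.cpp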


@@ -0,0 +1,68 @@
cmake_minimum_required(VERSION 3.15...3.29)
# Usage: cmake -DEXTENSION=path/to/extension.duckdb_extension -DPLATFORM_FILE=README.md -DDUCKDB_VERSION=tag1 -DEXTENSION_VERSION=tag2 -P scripts/append_metadata.cmake
# Currently hardcoded to host up to 8 fields
# Example: ./scripts/append_metadata.sh file.duckdb_extension git_hash_duckdb_file git_hash_extension_file platform_file
set(EXTENSION "" CACHE PATH "Path to the extension where to add metadata")
set(NULL_FILE "" CACHE PATH "Path to file containing a single 0 byte")
set(META1 "4" CACHE STRING "Metadata field" FORCE)
set(PLATFORM_FILE "" CACHE PATH "Metadata field: path of file containing duckdb_platform")
set(VERSION_FIELD "" CACHE STRING "Metadata field: path of file containing duckdb_version")
set(EXTENSION_VERSION "" CACHE STRING "Metadata field: path of file containing extension_version")
set(ABI_TYPE "" CACHE STRING "Metadata field: the ABI type of the extension")
set(META6 "" CACHE STRING "Metadata field")
set(META7 "" CACHE STRING "Metadata field")
set(META8 "" CACHE STRING "Metadata field")
# null.txt should contain exactly 1 byte of value \x00
file(READ "${NULL_FILE}" EMPTY_BYTE)
string(REPEAT "${EMPTY_BYTE}" 32 EMPTY_32)
string(REPEAT "${EMPTY_BYTE}" 256 EMPTY_256)
# 0 for custom section
string(APPEND CUSTOM_SECTION "${EMPTY_BYTE}")
# 213 in hex = 531 in decimal, total length of what follows (1 + 16 + 2 + 8x32 + 256)
# [1(continuation) + 0010011(payload) = \x93 -> 147, 0(continuation) + 0000100(payload) = \x04 -> 4]
# 10 in hex = 16 in decimal, length of name, 1 byte
string(ASCII 147 4 16 CUSTOM_SECTION_2)
string(APPEND CUSTOM_SECTION "${CUSTOM_SECTION_2}")
# the name of the WebAssembly custom section, 16 bytes
string(APPEND CUSTOM_SECTION "duckdb_signature")
# 200 in hex = 512 in decimal, length of the remaining payload (8x32 + 256)
# [1(continuation) + 0000000(payload) -> 128, 0(continuation) + 0000100(payload) -> 4],
# for a grand total of 2 bytes
string(ASCII 128 4 CUSTOM_SECTION_3)
string(APPEND CUSTOM_SECTION "${CUSTOM_SECTION_3}")
# Second metadata-field is special, since content comes from a file
file(READ "${PLATFORM_FILE}" META2)
# Build each METADATAx variable by padding with \x00 bytes and truncating to 32 bytes
string(SUBSTRING "${META1}${EMPTY_32}" 0 32 METADATA1)
string(SUBSTRING "${META2}${EMPTY_32}" 0 32 METADATA2)
string(SUBSTRING "${VERSION_FIELD}${EMPTY_32}" 0 32 METADATA3)
string(SUBSTRING "${EXTENSION_VERSION}${EMPTY_32}" 0 32 METADATA4)
string(SUBSTRING "${ABI_TYPE}${EMPTY_32}" 0 32 METADATA5)
string(SUBSTRING "${META6}${EMPTY_32}" 0 32 METADATA6)
string(SUBSTRING "${META7}${EMPTY_32}" 0 32 METADATA7)
string(SUBSTRING "${META8}${EMPTY_32}" 0 32 METADATA8)
# Append metadata fields, backwards
string(APPEND CUSTOM_SECTION "${METADATA8}")
string(APPEND CUSTOM_SECTION "${METADATA7}")
string(APPEND CUSTOM_SECTION "${METADATA6}")
string(APPEND CUSTOM_SECTION "${METADATA5}")
string(APPEND CUSTOM_SECTION "${METADATA4}")
string(APPEND CUSTOM_SECTION "${METADATA3}")
string(APPEND CUSTOM_SECTION "${METADATA2}")
string(APPEND CUSTOM_SECTION "${METADATA1}")
# Append signature (yet to be computed)
string(APPEND CUSTOM_SECTION "${EMPTY_256}")
# Append generated custom section to the extension
file(APPEND "${EXTENSION}" "${CUSTOM_SECTION}")
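# Resulting layout of the appended blob (sizes in bytes, as constructed above):
#   1    custom section id (0)
#   2    LEB128 length of the section contents (531)
#   1    length of the section name (16)
#   16   section name "duckdb_signature"
#   2    LEB128 length of the remaining payload (512)
#   256  metadata fields 8..1, 32 bytes each, \x00-padded
#   256  signature placeholder (yet to be computed)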


@@ -0,0 +1,99 @@
#!/usr/bin/env python3
import sys
import glob
import subprocess
import os
import tempfile
# Get the directory and construct the patch file pattern
directory = sys.argv[1]
patch_pattern = f"{directory}*.patch"
# Find patch files matching the pattern
patches = glob.glob(patch_pattern)
def raise_error(error_msg):
sys.stderr.write(error_msg + '\n')
sys.exit(1)
patches = sorted(os.listdir(directory))
for patch in patches:
if not patch.endswith('.patch'):
raise_error(
f'Patch file {patch} found in directory {directory} does not end in ".patch" - rename the patch file'
)
# Exit if no patches are found
if not patches:
error_message = (
f"\nERROR: Extension patching enabled, but no patches found in '{directory}'. "
"Please make sure APPLY_PATCHES is only enabled when there are actually patches present. "
"See .github/patches/extensions/README.md for more details."
)
raise_error(error_message)
current_dir = os.getcwd()
print(f"Applying patches at '{current_dir}'")
print(f"Resetting patches in {directory}\n")
# capture the current diff
diff_proc = subprocess.run(["git", "diff"], capture_output=True, check=True)
prev_diff = diff_proc.stdout
output_proc = subprocess.run(["git", "diff", "--numstat"], capture_output=True, check=True)
prev_output_lines = output_proc.stdout.decode('utf8').split('\n')
prev_output_lines.sort()
subprocess.run(["git", "clean", "-f"], check=True)
subprocess.run(["git", "reset", "--hard", "HEAD"], check=True)
def apply_patch(patch_file):
ARGUMENTS = ["patch", "-p1", "--forward", "-i"]
arguments = []
arguments.extend(ARGUMENTS)
arguments.append(patch_file)
try:
subprocess.run(arguments, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except subprocess.CalledProcessError as e:
arguments[1:1] = ['-d', current_dir]
command = " ".join(arguments)
print(f"Failed to apply patch, command to reproduce locally:\n{command}")
print("\nError output:")
print(e.stderr.decode('utf-8'))
print("\nStandard output:")
print(e.stdout.decode('utf-8'))
print("Exiting")
exit(1)
# Apply each patch file using patch
for patch in patches:
print(f"Applying patch: {patch}\n")
apply_patch(os.path.join(directory, patch))
# all patches have applied - check the current diff
output_proc = subprocess.run(["git", "diff", "--numstat"], capture_output=True, check=True)
output_lines = output_proc.stdout.decode('utf8').split('\n')
output_lines.sort()
if len(output_lines) <= len(prev_output_lines) and prev_output_lines != output_lines:
print("Detected local changes - rolling back patch application")
subprocess.run(["git", "clean", "-f"], check=True)
subprocess.run(["git", "reset", "--hard", "HEAD"], check=True)
with tempfile.NamedTemporaryFile() as f:
f.write(prev_diff)
apply_patch(f.name)
print("--------------------------------------------------")
print("Generate a patch file using the following command:")
print("--------------------------------------------------")
print(f"(cd {os.getcwd()} && git diff > {os.path.join(directory, 'fix.patch')})")
print("--------------------------------------------------")
exit(1)
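# Example invocation (the script path and extension name are placeholders; note the
# trailing slash, because the patch glob above is built as f"{directory}*.patch"):
#   python3 scripts/<this_script>.py .github/patches/extensions/<extension_name>/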


@@ -0,0 +1,126 @@
import json
import os
import sys
import glob
import time
import urllib.request
api_url = 'https://api.github.com/repos/duckdb/duckdb/'
if len(sys.argv) < 2:
print("Usage: [filename1] [filename2] ... ")
exit(1)
# this essentially should run on release tag builds to fill up release assets and master
repo = os.getenv("GITHUB_REPOSITORY", "")
if repo != "duckdb/duckdb":
print("Not running on forks. Exiting.")
exit(0)
ref = os.getenv("GITHUB_REF", '') # this env var is always present just not always used
if ref == 'refs/heads/main':
print("Not running on main branch. Exiting.")
exit(0)
elif ref.startswith('refs/tags/'):
tag = ref.replace('refs/tags/', '')
else:
print("Not running on branches. Exiting.")
exit(0)
print("Running on tag %s" % tag)
token = os.getenv("GH_TOKEN", "")
if token == "":
raise ValueError('need a GitHub token in GH_TOKEN')
def internal_gh_api(suburl, filename='', method='GET'):
url = api_url + suburl
headers = {"Content-Type": "application/json", 'Authorization': 'token ' + token}
body_data = b''
raw_resp = None
if len(filename) > 0:
method = 'POST'
body_data = open(filename, 'rb')
headers["Content-Type"] = "binary/octet-stream"
headers["Content-Length"] = os.path.getsize(local_filename)
url = suburl # cough
req = urllib.request.Request(url, body_data, headers)
req.get_method = lambda: method
print(f'GH API URL: "{url}" Filename: "{filename}" Method: "{method}"')
raw_resp = urllib.request.urlopen(req).read().decode()
if method != 'DELETE':
return json.loads(raw_resp)
else:
return {}
def gh_api(suburl, filename='', method='GET'):
timeout = 1
nretries = 10
success = False
for i in range(nretries + 1):
try:
response = internal_gh_api(suburl, filename, method)
success = True
except urllib.error.HTTPError as e:
print(e.read().decode()) # gah
except Exception as e:
print(e)
if success:
break
print(f"Failed upload, retrying in {timeout} seconds... ({i}/{nretries})")
time.sleep(timeout)
timeout = timeout * 2
if not success:
raise Exception("Failed to open URL " + suburl)
return response
# check if tag exists
resp = gh_api('git/ref/tags/%s' % tag)
if 'object' not in resp or 'sha' not in resp['object']: # or resp['object']['sha'] != sha
raise ValueError('tag %s not found' % tag)
resp = gh_api('releases/tags/%s' % tag)
if 'id' not in resp or 'upload_url' not in resp:
raise ValueError('release does not exist for tag %s' % tag)
# double-check that release exists and has correct sha
# disabled to not spam people watching releases
# if 'id' not in resp or 'upload_url' not in resp or 'target_commitish' not in resp or resp['target_commitish'] != sha:
# raise ValueError('release does not point to requested commit %s' % sha)
# TODO this could be a paged response!
assets = gh_api('releases/%s/assets' % resp['id'])
upload_url = resp['upload_url'].split('{')[0] # gah
files = sys.argv[1:]
for filename in files:
if '=' in filename:
parts = filename.split("=")
asset_filename = parts[0]
paths = glob.glob(parts[1])
if len(paths) != 1:
raise ValueError("Could not find file for pattern %s" % parts[1])
local_filename = paths[0]
else:
asset_filename = os.path.basename(filename)
local_filename = filename
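# e.g. an argument of the form "duckdb-binaries-linux.zip=build/release/duckdb-*.zip"
# (names are illustrative) uploads the single file matched by the glob under the asset
# name to the left of '='; a plain path is uploaded under its own basename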
# delete if present
for asset in assets:
if asset['name'] == asset_filename:
gh_api('releases/assets/%s' % asset['id'], method='DELETE')
resp = gh_api(f'{upload_url}?name={asset_filename}', filename=local_filename)
if 'id' not in resp:
raise ValueError('upload failed :/ ' + str(resp))
print("%s -> %s" % (local_filename, resp['browser_download_url']))

external/duckdb/scripts/asset-upload.py vendored Normal file

@@ -0,0 +1,106 @@
import json
import os
import sys
import glob
import mimetypes
import urllib.request
api_url = 'https://api.github.com/repos/duckdb/duckdb/'
if len(sys.argv) < 2:
print("Usage: [filename1] [filename2] ... ")
exit(1)
# this essentially should run on release tag builds to fill up release assets and main
pr = os.getenv("TRAVIS_PULL_REQUEST", "")
if pr != "false":
print("Not running on PRs. Exiting.")
exit(0)
tag = os.getenv("TRAVIS_TAG", '') # this env var is always present just not always used
if tag == '':
tag = 'main-builds'
print("Running on tag %s" % tag)
if tag == "main-builds" and os.getenv("TRAVIS_BRANCH", "") != "main":
print("Only running on main branch for %s tag. Exiting." % tag)
exit(0)
token = os.getenv("GH_TOKEN", "")
if token == "":
raise ValueError('need a GitHub token in GH_TOKEN')
def gh_api(suburl, filename='', method='GET'):
url = api_url + suburl
headers = {"Content-Type": "application/json", 'Authorization': 'token ' + token}
body_data = b''
if len(filename) > 0:
method = 'POST'
body_data = open(filename, 'rb')
mime_type = mimetypes.guess_type(filename)[0]
if mime_type is None:
mime_type = "application/octet-stream"
headers["Content-Type"] = mime_type
headers["Content-Length"] = os.path.getsize(local_filename)
url = suburl # cough
req = urllib.request.Request(url, body_data, headers)
req.get_method = lambda: method
try:
raw_resp = urllib.request.urlopen(req).read().decode()
except urllib.error.HTTPError as e:
raw_resp = e.read().decode() # gah
if method != 'DELETE':
return json.loads(raw_resp)
else:
return {}
# check if tag exists
resp = gh_api('git/ref/tags/%s' % tag)
if 'object' not in resp or 'sha' not in resp['object']: # or resp['object']['sha'] != sha
raise ValueError('tag %s not found' % tag)
resp = gh_api('releases/tags/%s' % tag)
if 'id' not in resp or 'upload_url' not in resp:
raise ValueError('release does not exist for tag %s' % tag)
# double-check that release exists and has correct sha
# disabled to not spam people watching releases
# if 'id' not in resp or 'upload_url' not in resp or 'target_commitish' not in resp or resp['target_commitish'] != sha:
# raise ValueError('release does not point to requested commit %s' % sha)
# TODO this could be a paged response!
assets = gh_api('releases/%s/assets' % resp['id'])
upload_url = resp['upload_url'].split('{')[0] # gah
files = sys.argv[1:]
for filename in files:
if '=' in filename:
parts = filename.split("=")
asset_filename = parts[0]
paths = glob.glob(parts[1])
if len(paths) != 1:
raise ValueError("Could not find file for pattern %s" % local_filename)
local_filename = paths[0]
else:
asset_filename = os.path.basename(filename)
local_filename = filename
# delete if present
for asset in assets:
if asset['name'] == asset_filename:
gh_api('releases/assets/%s' % asset['id'], method='DELETE')
resp = gh_api(upload_url + '?name=%s' % asset_filename, filename=local_filename)
if 'id' not in resp:
raise ValueError('upload failed :/ ' + str(resp))
print("%s -> %s" % (local_filename, resp['browser_download_url']))

external/duckdb/scripts/build_peg_grammar.sh vendored Executable file

@@ -0,0 +1,28 @@
#!/usr/bin/env bash
set -euo pipefail
# Print each command before executing (optional, for debug)
# set -x
# Activate virtual environment
if [[ -d ".venv" ]]; then
source .venv/bin/activate
else
echo "Error: .venv directory not found"
exit 1
fi
# Run grammar inlining with and without argument
GRAMMAR_FILE="extension/autocomplete/inline_grammar.py"
if [[ ! -f "$GRAMMAR_FILE" ]]; then
echo "Error: $GRAMMAR_FILE not found"
deactivate
exit 1
fi
python "$GRAMMAR_FILE" --grammar-file
python "$GRAMMAR_FILE"
echo "Successfully build PEG grammar files"
# Deactivate virtual environment
deactivate


@@ -0,0 +1,63 @@
import subprocess
import duckdb
import os
import pandas as pd
import argparse
from io import StringIO
parser = argparse.ArgumentParser(description='Cancel all workflows related to a PR.')
parser.add_argument(
'--title',
dest='title',
action='store',
help='The title of the PR for which we want to cancel workflows (or part of the title) - or "master" for all pushes',
required=True,
)
parser.add_argument(
'--repo', dest='repo', action='store', help='The repository to run this workflow on', default='duckdb/duckdb'
)
parser.add_argument(
'--max_workflows',
dest='max_workflows',
action='store',
help='The maximum number of workflows to look at (starting from the latest)',
default=200,
)
args = parser.parse_args()
nlimit = args.max_workflows
query = args.title
proc = subprocess.Popen(
[
'gh',
'run',
'-R',
args.repo,
'list',
'--json',
'displayTitle,databaseId,status,conclusion,headSha,event',
f'--limit={nlimit}',
],
stdout=subprocess.PIPE,
)
text = proc.stdout.read().decode('utf8')
df = pd.read_json(StringIO(text))
if query == 'master':
result = duckdb.query(
f"select databaseId from df WHERE status IN ('queued', 'in_progress') AND event='push'"
).fetchall()
else:
result = duckdb.query(
f"select databaseId from df WHERE status IN ('queued', 'in_progress') AND displayTitle LIKE '%{query}%'"
).fetchall()
if len(result) == 0:
print(
f"No workflows found in the latest {nlimit} workflows that contain the text {query}.\nPerhaps try running with a higher --max_workflows parameter?"
)
exit(1)
for databaseId in [x[0] for x in result]:
os.system(f'gh run -R {args.repo} cancel {databaseId}')
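# Example invocations (the title is illustrative; the script file name is a placeholder):
#   python3 <this_script>.py --title "Fix vector size regression" --repo duckdb/duckdb
#   python3 <this_script>.py --title master --max_workflows 500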


@@ -0,0 +1,33 @@
import re
import sys
post_text = sys.stdin.read()
sql_keyword_list = ["select", "from", "where", "join", "group by", "order by", "having", "with recursive", "union"]
sql_keyword_regex = f"({'|'.join(sql_keyword_list)})"
sql_keywords = len(re.findall(rf"{sql_keyword_regex}", post_text, flags=re.MULTILINE | re.IGNORECASE))
backticked_code_blocks = len(re.findall(r"^```", post_text))
indented_sql_code_lines = len(re.findall(rf"^ {sql_keyword_regex}", post_text, flags=re.MULTILINE | re.IGNORECASE))
indented_python_code_lines = len(re.findall(r"^ (import|duckdb)", post_text, flags=re.MULTILINE | re.IGNORECASE))
indented_r_code_lines = len(re.findall(r"^ (library|dbExecute)", post_text, flags=re.MULTILINE | re.IGNORECASE))
indented_hashbang_code_lines = len(re.findall(r"^ #!", post_text, flags=re.MULTILINE | re.IGNORECASE))
indented_code_lines = indented_sql_code_lines + indented_python_code_lines + indented_r_code_lines
inline_code_snippets = len(re.findall(r"`", post_text)) // 2
print("Metrics computed by 'check-issue-for-code-formatting.py':")
print(f"- {sql_keywords} SQL keyword(s)")
print(f"- {backticked_code_blocks} backticked code block(s)")
print(
f"- {indented_code_lines} indented code line(s): {indented_sql_code_lines} SQL, {indented_python_code_lines} Python, {indented_r_code_lines} R, {indented_hashbang_code_lines} hashbangs"
)
print(f"- {inline_code_snippets} inline code snippet(s)")
if sql_keywords > 2 and backticked_code_blocks == 0 and indented_code_lines == 0 and inline_code_snippets == 0:
print("The post is likely not properly formatted.")
exit(1)
else:
print("The post is likely properly formatted.")


@@ -0,0 +1,129 @@
import argparse
import os
import math
import re
parser = argparse.ArgumentParser(description='Check code coverage results')
parser.add_argument(
'--uncovered_files',
action='store',
help='Set of files that are not 100% covered',
default=os.path.join(".github", "config", "uncovered_files.csv"),
)
parser.add_argument('--directory', help='Directory of generated HTML files', action='store', default='coverage_html')
parser.add_argument('--fix', help='Fill up the uncovered_files.csv with all files', action='store_true', default=False)
args = parser.parse_args()
if not os.path.exists(args.directory):
print(f"The provided directory ({args.directory}) does not exist, please create it first")
exit(1)
covered_regex = (
r'<a name="(\d+)">[ \t\n]*<span class="lineNum">[ \t\n0-9]+</span><span class="{COVERED_CLASS}">[ \t\n0-9]+:([^<]+)'
)
def get_original_path(path):
return (
path.replace('.gcov.html', '')
.replace(os.getcwd(), '')
.replace('coverage_html' + os.path.sep, '')
.replace('home/runner/work/duckdb/duckdb/', '')
)
def cleanup_line(line):
return line.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"')
partial_coverage_dict = {}
with open(args.uncovered_files, 'r') as f:
for line in f.readlines():
splits = line.split('\t')
partial_coverage_dict[splits[0]] = int(splits[1].strip())
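# each line of the uncovered-files list is expected to be "<path><TAB><count>", e.g.
# "src/common/some_file.cpp<TAB>12" (path and count are illustrative), meaning up to
# 12 uncovered lines are tolerated for that file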
if args.fix:
uncovered_file = open(args.uncovered_files, 'w+')
DASH_COUNT = 80
total_difference = 0
allowed_difference = 0
def check_file(path, partial_coverage_dict):
global any_failed
global total_difference
if '.cpp' not in path and '.hpp' not in path:
# files are named [path].[ch]pp
return
if '.html' not in path:
return
with open(path, 'r') as f:
text = f.read()
original_path = get_original_path(path)
uncovered_lines = re.findall(covered_regex.replace('{COVERED_CLASS}', 'lineNoCov'), text)
covered_lines = re.findall(covered_regex.replace('{COVERED_CLASS}', 'lineCov'), text)
total_lines = len(uncovered_lines) + len(covered_lines)
if total_lines == 0:
# no lines to cover - skip
return
coverage_percentage = round(len(covered_lines) / (total_lines) * 100, 2)
expected_uncovered_lines = 0
if original_path in partial_coverage_dict:
expected_uncovered_lines = partial_coverage_dict[original_path]
if args.fix:
if expected_uncovered_lines == 0 and len(uncovered_lines) == 0:
return
expected_uncovered = max(expected_uncovered_lines, len(uncovered_lines) + 1)
uncovered_file.write(f'{original_path}\t{expected_uncovered}\n')
return
if len(uncovered_lines) > expected_uncovered_lines:
total_difference += len(uncovered_lines) - expected_uncovered_lines
print("-" * DASH_COUNT)
print(f"Coverage failure in file {original_path}")
print("-" * DASH_COUNT)
print(f"Coverage percentage: {coverage_percentage}%")
print(f"Uncovered lines: {len(uncovered_lines)}")
print(f"Covered lines: {len(covered_lines)}")
print("-" * DASH_COUNT)
print(f"Expected uncovered lines: {expected_uncovered_lines}")
print("-" * DASH_COUNT)
print("Uncovered lines")
print("-" * DASH_COUNT)
for e in uncovered_lines:
print(e[0] + ' ' * 8 + cleanup_line(e[1]))
def scan_directory(path):
file_list = []
if os.path.isfile(path):
file_list.append(path)
else:
files = os.listdir(path)
for file in files:
file_list += scan_directory(os.path.join(path, file))
return file_list
files = scan_directory(args.directory)
files.sort()
for file in files:
check_file(file, partial_coverage_dict)
if args.fix:
uncovered_file.close()
if total_difference > allowed_difference:
exit(1)
elif total_difference > 0:
print("-" * DASH_COUNT)
print("SUCCESS-ish")
print("-" * DASH_COUNT)
print(f"{total_difference} lines were uncovered but this falls within the margin of {allowed_difference}")


@@ -0,0 +1,369 @@
#!/usr/bin/env python3
#
# ===- clang-tidy-diff.py - ClangTidy Diff Checker -----------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===-----------------------------------------------------------------------===#
r"""
ClangTidy Diff Checker
======================
This script reads input from a unified diff, runs clang-tidy on all changed
files and outputs clang-tidy warnings in changed lines only. This is useful to
detect clang-tidy regressions in the lines touched by a specific patch.
Example usage for git/svn users:
git diff -U0 HEAD^ | clang-tidy-diff.py -p1
svn diff --diff-cmd=diff -x-U0 | \
clang-tidy-diff.py -fix -checks=-*,modernize-use-override
"""
import argparse
import glob
import json
import multiprocessing
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
import traceback
try:
import yaml
except ImportError:
yaml = None
is_py2 = sys.version[0] == "2"
if is_py2:
import Queue as queue
else:
import queue as queue
def run_tidy(task_queue, lock, timeout, failed_files):
watchdog = None
while True:
command = task_queue.get()
try:
proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if timeout is not None:
watchdog = threading.Timer(timeout, proc.kill)
watchdog.start()
stdout, stderr = proc.communicate()
if proc.returncode != 0:
if proc.returncode < 0:
msg = "Terminated by signal %d : %s\n" % (
-proc.returncode,
" ".join(command),
)
stderr += msg.encode("utf-8")
failed_files.append(command)
with lock:
sys.stdout.write(stdout.decode("utf-8") + "\n")
sys.stdout.flush()
if stderr:
sys.stderr.write(stderr.decode("utf-8") + "\n")
sys.stderr.flush()
except Exception as e:
with lock:
sys.stderr.write("Failed: " + str(e) + ": ".join(command) + "\n")
finally:
with lock:
if not (timeout is None or watchdog is None):
if not watchdog.is_alive():
sys.stderr.write("Terminated by timeout: " + " ".join(command) + "\n")
watchdog.cancel()
task_queue.task_done()
def start_workers(max_tasks, tidy_caller, arguments):
for _ in range(max_tasks):
t = threading.Thread(target=tidy_caller, args=arguments)
t.daemon = True
t.start()
def merge_replacement_files(tmpdir, mergefile):
"""Merge all replacement files in a directory into a single file"""
# The fixes suggested by clang-tidy >= 4.0.0 are given under
# the top level key 'Diagnostics' in the output yaml files
mergekey = "Diagnostics"
merged = []
for replacefile in glob.iglob(os.path.join(tmpdir, "*.yaml")):
content = yaml.safe_load(open(replacefile, "r"))
if not content:
continue # Skip empty files.
merged.extend(content.get(mergekey, []))
if merged:
# MainSourceFile: The key is required by the definition inside
# include/clang/Tooling/ReplacementsYaml.h, but the value
# is actually never used inside clang-apply-replacements,
# so we set it to '' here.
output = {"MainSourceFile": "", mergekey: merged}
with open(mergefile, "w") as out:
yaml.safe_dump(output, out)
else:
# Empty the file:
open(mergefile, "w").close()
def main():
parser = argparse.ArgumentParser(
description="Run clang-tidy against changed files, and " "output diagnostics only for modified " "lines."
)
parser.add_argument(
"-clang-tidy-binary",
metavar="PATH",
default="clang-tidy",
help="path to clang-tidy binary",
)
parser.add_argument(
"-p",
metavar="NUM",
default=0,
help="strip the smallest prefix containing P slashes",
)
parser.add_argument(
"-regex",
metavar="PATTERN",
default=None,
help="custom pattern selecting file paths to check " "(case sensitive, overrides -iregex)",
)
parser.add_argument(
"-iregex",
metavar="PATTERN",
default=r".*\.(cpp|cc|c\+\+|cxx|c|cl|h|hpp|m|mm|inc)",
help="custom pattern selecting file paths to check " "(case insensitive, overridden by -regex)",
)
parser.add_argument(
"-j",
type=int,
default=1,
help="number of tidy instances to be run in parallel.",
)
parser.add_argument("-timeout", type=int, default=None, help="timeout per each file in seconds.")
parser.add_argument("-fix", action="store_true", default=False, help="apply suggested fixes")
parser.add_argument(
"-checks",
help="checks filter, when not specified, use clang-tidy " "default",
default="",
)
parser.add_argument(
"-config-file",
dest="config_file",
help="Specify the path of .clang-tidy or custom config file",
default="",
)
parser.add_argument("-use-color", action="store_true", help="Use colors in output")
parser.add_argument("-path", dest="build_path", help="Path used to read a compile command database.")
if yaml:
parser.add_argument(
"-export-fixes",
metavar="FILE_OR_DIRECTORY",
dest="export_fixes",
help="A directory or a yaml file to store suggested fixes in, "
"which can be applied with clang-apply-replacements. If the "
"parameter is a directory, the fixes of each compilation unit are "
"stored in individual yaml files in the directory.",
)
else:
parser.add_argument(
"-export-fixes",
metavar="DIRECTORY",
dest="export_fixes",
help="A directory to store suggested fixes in, which can be applied "
"with clang-apply-replacements. The fixes of each compilation unit are "
"stored in individual yaml files in the directory.",
)
parser.add_argument(
"-extra-arg",
dest="extra_arg",
action="append",
default=[],
help="Additional argument to append to the compiler " "command line.",
)
parser.add_argument(
"-extra-arg-before",
dest="extra_arg_before",
action="append",
default=[],
help="Additional argument to prepend to the compiler " "command line.",
)
parser.add_argument(
"-quiet",
action="store_true",
default=False,
help="Run clang-tidy in quiet mode",
)
parser.add_argument(
"-load",
dest="plugins",
action="append",
default=[],
help="Load the specified plugin in clang-tidy.",
)
parser.add_argument(
"-allow-no-checks",
action="store_true",
help="Allow empty enabled checks.",
)
clang_tidy_args = []
argv = sys.argv[1:]
if "--" in argv:
clang_tidy_args.extend(argv[argv.index("--") :])
argv = argv[: argv.index("--")]
args = parser.parse_args(argv)
# Extract changed lines for each file.
filename = None
lines_by_file = {}
for line in sys.stdin:
match = re.search('^\\+\\+\\+\\ "?(.*?/){%s}([^ \t\n"]*)' % args.p, line)
if match:
filename = match.group(2)
if filename is None:
continue
if args.regex is not None:
if not re.match("^%s$" % args.regex, filename):
continue
else:
if not re.match("^%s$" % args.iregex, filename, re.IGNORECASE):
continue
match = re.search(r"^@@.*\+(\d+)(,(\d+))?", line)
if match:
start_line = int(match.group(1))
line_count = 1
if match.group(3):
line_count = int(match.group(3))
if line_count == 0:
continue
end_line = start_line + line_count - 1
lines_by_file.setdefault(filename, []).append([start_line, end_line])
if not any(lines_by_file):
print("No relevant changes found.")
sys.exit(0)
max_task_count = args.j
if max_task_count == 0:
max_task_count = multiprocessing.cpu_count()
max_task_count = min(len(lines_by_file), max_task_count)
combine_fixes = False
export_fixes_dir = None
delete_fixes_dir = False
if args.export_fixes is not None:
# if a directory is given, create it if it does not exist
if args.export_fixes.endswith(os.path.sep) and not os.path.isdir(args.export_fixes):
os.makedirs(args.export_fixes)
if not os.path.isdir(args.export_fixes):
if not yaml:
raise RuntimeError(
"Cannot combine fixes in one yaml file. Either install PyYAML or specify an output directory."
)
combine_fixes = True
if os.path.isdir(args.export_fixes):
export_fixes_dir = args.export_fixes
if combine_fixes:
export_fixes_dir = tempfile.mkdtemp()
delete_fixes_dir = True
# Tasks for clang-tidy.
task_queue = queue.Queue(max_task_count)
# A lock for console output.
lock = threading.Lock()
# List of files with a non-zero return code.
failed_files = []
# Run a pool of clang-tidy workers.
start_workers(max_task_count, run_tidy, (task_queue, lock, args.timeout, failed_files))
# Form the common args list.
common_clang_tidy_args = []
if args.fix:
common_clang_tidy_args.append("-fix")
if args.checks != "":
common_clang_tidy_args.append("-checks=" + args.checks)
if args.config_file != "":
common_clang_tidy_args.append("-config-file=" + args.config_file)
if args.quiet:
common_clang_tidy_args.append("-quiet")
if args.build_path is not None:
common_clang_tidy_args.append("-p=%s" % args.build_path)
if args.use_color:
common_clang_tidy_args.append("--use-color")
if args.allow_no_checks:
common_clang_tidy_args.append("--allow-no-checks")
for arg in args.extra_arg:
common_clang_tidy_args.append("-extra-arg=%s" % arg)
for arg in args.extra_arg_before:
common_clang_tidy_args.append("-extra-arg-before=%s" % arg)
for plugin in args.plugins:
common_clang_tidy_args.append("-load=%s" % plugin)
for name in lines_by_file:
line_filter_json = json.dumps([{"name": name, "lines": lines_by_file[name]}], separators=(",", ":"))
# Run clang-tidy on files containing changes.
command = [args.clang_tidy_binary]
command.append("-line-filter=" + line_filter_json)
if args.export_fixes is not None:
# Get a temporary file. We immediately close the handle so clang-tidy can
# overwrite it.
(handle, tmp_name) = tempfile.mkstemp(suffix=".yaml", dir=export_fixes_dir)
os.close(handle)
command.append("-export-fixes=" + tmp_name)
command.extend(common_clang_tidy_args)
command.append(name)
command.extend(clang_tidy_args)
task_queue.put(command)
# Wait for all threads to be done.
task_queue.join()
# Application return code
return_code = 0
if failed_files:
return_code = 1
if combine_fixes:
print("Writing fixes to " + args.export_fixes + " ...")
try:
merge_replacement_files(export_fixes_dir, args.export_fixes)
except:
sys.stderr.write("Error exporting fixes.\n")
traceback.print_exc()
return_code = 1
if delete_fixes_dir:
shutil.rmtree(export_fixes_dir)
sys.exit(return_code)
if __name__ == "__main__":
main()


@@ -0,0 +1,19 @@
#!/bin/bash
rm -f hash_concats
touch hash_concats
split -b 1M $1
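# split names the 1 MiB chunks xaa, xab, ... (default prefix "x"), which the glob below picks up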
FILES="x*"
for f in $FILES
do
# sha256 a segment
openssl dgst -binary -sha256 $f >> hash_concats
rm $f
done
# sha256 the concatenation
openssl dgst -binary -sha256 hash_concats > hash_composite
cat hash_composite

external/duckdb/scripts/coverage_check.sh vendored Executable file

@@ -0,0 +1,27 @@
#!/bin/bash
set -e
# prepare coverage file
lcov --config-file .github/workflows/lcovrc --zerocounters --directory .
lcov --config-file .github/workflows/lcovrc --capture --initial --directory . --base-directory . --no-external --output-file coverage.info
# build with coverage enabled
mkdir -p build/coverage
(cd build/coverage && cmake -E env CXXFLAGS="--coverage" cmake -DBUILD_EXTENSIONS="parquet;json;jemalloc;autocomplete;icu" -DENABLE_SANITIZER=0 -DCMAKE_BUILD_TYPE=Debug ../.. && cmake --build .)
# run tests
build/coverage/test/unittest
build/coverage/test/unittest "[detailed_profiler]"
build/coverage/test/unittest test/sql/tpch/tpch_sf01.test_slow
python3 -m pytest --shell-binary build/coverage/duckdb tools/shell/tests/
# finalize coverage file
lcov --config-file .github/workflows/lcovrc --directory . --base-directory . --no-external --capture --output-file coverage.info
lcov --config-file .github/workflows/lcovrc --remove coverage.info $(< .github/workflows/lcov_exclude) -o lcov.info
# generate coverage html
genhtml -o coverage_html lcov.info
# check that coverage passes threshold
# python3 scripts/check_coverage.py


@@ -0,0 +1,63 @@
import json, os, sys, glob, mimetypes, urllib.request, re
api_url = 'https://api.github.com/repos/duckdb/duckdb/'
if len(sys.argv) < 2:
print("Usage: [last_tag] ")
exit(1)
token = os.getenv("GH_TOKEN", "")
if token == "":
raise ValueError('need a GitHub token in GH_TOKEN')
# amazingly this is the entire code of the PyPI package `linkheader-parser`
def extract(link_header):
"""Extract links and their relations from a Link Header Field."""
links = [l.strip() for l in link_header.split(',')]
rels = {}
pattern = r'<(?P<url>.*)>;\s*rel="(?P<rel>.*)"'
for link in links:
group_dict = re.match(pattern, link).groupdict()
rels[group_dict['rel']] = group_dict['url']
return rels
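# e.g. a header like '<https://api.github.com/repos/duckdb/duckdb/pulls?page=2>; rel="next",
# <https://api.github.com/repos/duckdb/duckdb/pulls?page=9>; rel="last"' (URLs illustrative)
# is parsed into {'next': '...page=2', 'last': '...page=9'}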
def gh_api(suburl, full_url=''):
if full_url == '':
url = api_url + suburl
else:
url = full_url
headers = {"Content-Type": "application/json", 'Authorization': 'token ' + token}
req = urllib.request.Request(url, b'', headers)
req.get_method = lambda: 'GET'
next_link = None
try:
resp = urllib.request.urlopen(req)
if not resp.getheader("Link") is None:
link_data = extract(resp.getheader("Link"))
if "next" in link_data:
next_link = link_data["next"]
raw_resp = resp.read().decode()
except urllib.error.HTTPError as e:
raw_resp = e.read().decode() # gah
ret_json = json.loads(raw_resp)
if next_link is not None:
return ret_json + gh_api('', full_url=next_link)
return ret_json
# get time of tag
old_release = gh_api('releases/tags/%s' % sys.argv[1])
print(old_release["published_at"])
pulls = gh_api('pulls?base=main&state=closed')
for p in pulls:
if p["merged_at"] is None:
continue
if p["merged_at"] < old_release["published_at"]:
continue
print(" - #%s: %s" % (p["number"], p["title"]))


@@ -0,0 +1,43 @@
###
# This script copies all extensions in a build folder from their cmake-produced structure into the extension repository
# structure of ./<duckdb_version>/<build_architecture>/<extension_name>.duckdb_extension
# Note that it requires the duckdb_platform_out file to be populated with the platform
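# e.g. an extension ends up at <local_repo>/v1.4.0/linux_amd64/httpfs.duckdb_extension
# (version, platform and extension name are illustrative)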
import os
import sys
import subprocess
import glob
import shutil
if len(sys.argv) != 6:
print(
"Usage: scripts/create_local_extension_repo.py <duckdb_version> <duckdb_platform_out> <path/to/duckdb/build> <path/to/local_repo> <postfix>"
)
exit(1)
duckdb_version = sys.argv[1]
duckdb_platform_out = sys.argv[2]
extension_path = sys.argv[3]
dst_path = sys.argv[4]
postfix = sys.argv[5]
if os.name == 'nt':
duckdb_platform_out = duckdb_platform_out.replace("/", "\\")
extension_path = extension_path.replace("/", "\\")
dst_path = dst_path.replace("/", "\\")
with open(duckdb_platform_out, 'r') as f:
lines = f.readlines()
duckdb_platform = lines[0].strip()
# Create destination path
dest_path = os.path.join(dst_path, duckdb_version, duckdb_platform)
if not os.path.exists(dest_path):
os.makedirs(dest_path)
# Now copy over the extensions to the correct path
glob_string = os.path.join(extension_path, 'extension', '*', '*.' + postfix)
for file in glob.glob(glob_string):
dest_file = os.path.join(dest_path, os.path.basename(file))
shutil.copy(file, dest_file)

external/duckdb/scripts/create_patch.py vendored Normal file

@@ -0,0 +1,147 @@
import os
import argparse
import sys
import re
import subprocess
from typing import List, Dict
from pathlib import Path
SCRIPT_DIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(description="Generate a patch file for a DuckDB extension.")
parser.add_argument(
"repository_path",
type=str,
help="Path to the repository where the changes live that should be turned into a patch.",
)
parser.add_argument(
"extension_name",
type=str,
help="Name of the extension to patch, should match the name in `.github/config/extensions/<extension_name>.cmake`.",
)
parser.add_argument("patch_name", type=str, help="Name for the patch file to create.")
parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the patch file if it already exists.")
args = parser.parse_args()
def verify_git_tag():
# Locate the cmake file to extract the GIT_TAG from
cmake_path = Path(SCRIPT_DIR) / '..' / ".github" / "config" / "extensions" / f"{args.extension_name}.cmake"
if not cmake_path.is_file():
print(f"Error: Extension CMake file not found: {cmake_path}")
sys.exit(1)
cmake_content = cmake_path.read_text()
# Extract GIT_TAG from the cmake file
match = re.search(r"\bGIT_TAG\s+([^\s\)]+)", cmake_content)
if not match:
print(f"Error: Could not find GIT_TAG in {cmake_path}")
sys.exit(1)
git_tag_in_cmake = match.group(1)
# Get the current commit hash in repository_path
try:
result = subprocess.run(
["git", "rev-parse", "HEAD"],
cwd=args.repository_path,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True,
)
current_commit = result.stdout.strip()
except subprocess.CalledProcessError as e:
print(f"Error: Failed to run git in {args.repository_path}{e.stderr.strip()}")
sys.exit(1)
# Compare the tags
if git_tag_in_cmake != current_commit:
print(
f"Error: GIT_TAG in {cmake_path} is {git_tag_in_cmake}, "
f"but repository {args.repository_path} is checked out at {current_commit}."
)
sys.exit(1)
def create_patch():
# Collect changes with git diff
try:
diff_result = subprocess.run(
["git", "diff", "--ignore-submodules"],
cwd=args.repository_path,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True,
)
except subprocess.CalledProcessError as e:
print(f"Error: Failed to run git diff — {e.stderr.strip()}")
sys.exit(1)
new_patch_content = diff_result.stdout
if not new_patch_content.strip():
print("⚠️ No changes detected in repository; no patch will be created.")
sys.exit(0)
def parse_patch_files_and_lines(patch_text):
changes = {}
current_file = None
for line in patch_text.splitlines():
if line.startswith("diff --git"):
parts = line.split()
if len(parts) >= 3:
# Format: diff --git a/file b/file
current_file = parts[2][2:] # remove 'a/'
changes.setdefault(current_file, set())
elif line.startswith("@@") and current_file:
# Format: @@ -old_start,old_count +new_start,new_count @@
m = re.match(r"@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@", line)
if m:
start = int(m.group(1))
length = int(m.group(2) or "1")
for l in range(start, start + length):
changes[current_file].add(l)
return changes
new_changes = parse_patch_files_and_lines(new_patch_content)
# Check conflicts with existing patches
patch_dir = (Path(SCRIPT_DIR) / ".." / ".github" / "patches" / "extensions" / args.extension_name).resolve()
patch_dir.mkdir(parents=True, exist_ok=True)
for existing_patch in patch_dir.glob("*.patch"):
if existing_patch.name == f"{args.patch_name}.patch":
if not args.overwrite:
print(f"A patch by the name '{args.patch_name}.patch' already exists, failed to create patch")
sys.exit(1)
else:
continue
existing_changes = parse_patch_files_and_lines(existing_patch.read_text())
for file, lines in new_changes.items():
if file in existing_changes:
overlap = lines & existing_changes[file]
if overlap:
print(f"❌ Conflict detected with existing patch: {existing_patch.name}")
print(f" File: {file}")
print(f" Overlapping lines: {sorted(overlap)}")
sys.exit(1)
# Save patch file
patch_dir = (Path(SCRIPT_DIR) / ".." / ".github" / "patches" / "extensions" / args.extension_name).resolve()
patch_dir.mkdir(parents=True, exist_ok=True)
patch_path = patch_dir / f"{args.patch_name}.patch"
patch_path.write_text(diff_result.stdout)
verify_git_tag()
create_patch()
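
For clarity, a small standalone example of the hunk-header parsing used for conflict detection above; the sample hunk header is invented:

import re

hunk = "@@ -10,3 +12,4 @@ void SomeFunction() {"  # hypothetical hunk header
m = re.match(r"@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@", hunk)
start = int(m.group(1))           # 12: first changed line in the new file
length = int(m.group(2) or "1")   # 4: number of lines in the new-file hunk
print(set(range(start, start + length)))  # {12, 13, 14, 15}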

View File

@@ -0,0 +1,71 @@
import subprocess
import sys
import os
if len(sys.argv) < 2 or not os.path.isfile(sys.argv[1]):
print("Usage: [libduckdb dynamic library file, release build]")
exit(1)
res = subprocess.run('nm -g -C -P'.split(' ') + [sys.argv[1]], check=True, capture_output=True)
if res.returncode != 0:
raise ValueError('Failed to run `nm`')
culprits = []
whitelist = [
'@GLIBC',
'@CXXABI',
'__gnu_cxx::',
'std::',
'N6duckdb',
'duckdb::',
'duckdb_miniz::',
'duckdb_fmt::',
'duckdb_hll::',
'duckdb_moodycamel::',
'duckdb_yyjson::',
'duckdb_',
'RefCounter',
'registerTMCloneTable',
'RegisterClasses',
'Unwind_Resume',
'__gmon_start',
'_fini',
'_init',
'_version',
'_end',
'_edata',
'__bss_start',
'__udivti3',
'__popcount',
'Adbc',
'ErrorArrayStream',
'ErrorFromArrayStream',
]
for symbol in res.stdout.decode('utf-8').split('\n'):
if len(symbol.strip()) == 0:
continue
if symbol.endswith(' U'): # undefined because dynamic linker
continue
if symbol.endswith(' U 0 0') and "random_device" not in symbol: # undefined because dynamic linker
continue
is_whitelisted = False
for entry in whitelist:
if entry in symbol and "random_device" not in symbol:
is_whitelisted = True
if is_whitelisted:
continue
culprits.append(symbol)
if len(culprits) > 0:
print("Found leaked symbols. Either white-list above or change visibility:")
for symbol in culprits:
print(symbol)
sys.exit(1)
sys.exit(0)
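
For reference, `nm -g -C -P` prints one symbol per line in the POSIX "name type value size" layout; a hypothetical sketch of how the filtering above treats such lines (symbol names and addresses are invented):

# Hypothetical nm -g -C -P style output, for illustration only.
sample = [
    "duckdb::DuckDB::DuckDB(duckdb::DBConfig*) T 0000000000401000 0000000000000080",  # whitelisted via 'duckdb::'
    "malloc U",  # undefined, resolved by the dynamic linker
    "my_leaked_helper(int) T 0000000000402000 0000000000000010",  # neither whitelisted nor undefined
]
whitelist = ['duckdb::', 'std::', '@GLIBC']
culprits = [
    s for s in sample
    if s.strip() and not s.endswith(' U') and not any(w in s for w in whitelist)
]
print(culprits)  # only the leaked helper remains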

View File

@@ -0,0 +1,51 @@
#!/bin/bash
# Uploads all extensions found in <base_dir_glob> (default: build/release/extension/*)
# this script is used by DuckDB CI to upload all extensions at once
# Usage: ./extension-upload-all.sh <architecture> <duckdb_version> [<base_dir_glob>]
# The directory that the script lives in, thanks @Tishj
script_dir="$(dirname "$(readlink -f "$0")")"
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Usage: ./extension-upload-all.sh <architecture> <duckdb_version> [<base_dir_glob>]"
exit 1
fi
if [ -z "$3" ]; then
BASE_DIR="build/release/extension/*"
else
BASE_DIR="$3"
fi
set -e
# Ensure we do nothing on failed globs
shopt -s nullglob
# Print dry run / real run
if [ "$DUCKDB_DEPLOY_SCRIPT_MODE" == "for_real" ]; then
echo "Deploying extensions.."
else
echo "Deploying extensions.. (DRY RUN)"
fi
if [[ $1 == wasm* ]]; then
FILES="$BASE_DIR/*.duckdb_extension.wasm"
else
FILES="$BASE_DIR/*.duckdb_extension"
fi
for f in $FILES
do
if [[ $1 == wasm* ]]; then
ext_name=`basename $f .duckdb_extension.wasm`
else
ext_name=`basename $f .duckdb_extension`
fi
echo "found extension: '$ext_name'"
# args: <name> <extension_version> <duckdb_version> <architecture> <s3_bucket> <copy_to_latest> <copy_to_versioned> [<path_to_ext>]
$script_dir/extension-upload-single.sh $ext_name "" "$2" "$1" "duckdb-core-extensions" true false "$(dirname "$f")"
done

View File

@@ -0,0 +1,109 @@
#!/bin/bash
# This script deploys the extension binaries that are currently deployed to the nightly bucket to the main bucket
# WARNING: don't use this script if you don't know exactly what you're doing. To deploy a binary:
# - Run the script with ./extension-upload-from-nightly.sh <extension_name> <duckdb_version> (<nightly_commit>)
# - CHECK the output of the dry run thoroughly
# - If successful, set the DUCKDB_DEPLOY_SCRIPT_MODE env variable to the correct value
# - run the script again now deploying for real
# - check the output
# - unset the DUCKDB_DEPLOY_SCRIPT_MODE env var
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Usage: ./extension-upload-from-nightly.sh <extension_name> <duckdb_version> (<nightly_commit>)"
exit 1
fi
if [ -z "$3" ]; then
BASE_NIGHTLY_DIR="$2"
else
BASE_NIGHTLY_DIR="$1/$3/$2"
fi
# CONFIG
FROM_BUCKET=duckdb-extensions-nightly
TO_BUCKET=duckdb-core-extensions
CLOUDFRONT_DISTRIBUTION_ID=E2Z28NDMI4PVXP
### COPY THE FILES
## REAL_RUN is to be used to move non-Wasm extensions
REAL_RUN="aws s3 cp s3://$FROM_BUCKET/$BASE_NIGHTLY_DIR s3://$TO_BUCKET/$2 --recursive --exclude '*' --include '*/$1.duckdb_extension.gz' --acl public-read --region us-east-2"
DRY_RUN="$REAL_RUN --dryrun"
## REAL_RUN_WASM is to be used to move Wasm extensions to new style path (no extra duckdb-wasm)
REAL_RUN_WASM="aws s3 cp s3://$FROM_BUCKET/$BASE_NIGHTLY_DIR s3://$TO_BUCKET/$2 --recursive --exclude '*' --include '*/$1.duckdb_extension.wasm' --acl public-read --content-encoding br --content-type='application/wasm' --region us-east-2"
DRY_RUN_WASM="$REAL_RUN_WASM --dryrun"
if [ "$DUCKDB_DEPLOY_SCRIPT_MODE" == "for_real" ]; then
echo "DEPLOYING"
echo "> FROM: $FROM_BUCKET"
echo "> TO : $TO_BUCKET"
echo "> AWS CLI deploy: "
eval "$REAL_RUN"
eval "$REAL_RUN_WASM"
else
echo "DEPLOYING (DRY RUN)"
echo "> FROM: $FROM_BUCKET"
echo "> TO : $TO_BUCKET"
echo "> AWS CLI Dry run: "
eval "$DRY_RUN"
eval "$DRY_RUN_WASM"
fi
echo ""
### INVALIDATE THE CLOUDFRONT CACHE AND CLOUDFLARE
# For double checking we are invalidating the correct domain
CLOUDFRONT_ORIGINS=`aws cloudfront get-distribution --id $CLOUDFRONT_DISTRIBUTION_ID --query 'Distribution.DistributionConfig.Origins.Items[*].DomainName' --output text`
# Parse the dry run output
output=$(eval "$DRY_RUN" && eval "$DRY_RUN_WASM" && eval "$DRY_RUN_WASM_OLD_STYLE")
s3_paths=()
while IFS= read -r line; do
if [[ $line == *"copy:"* ]]; then
s3_path=$(echo $line | grep -o 's3://[^ ]*' | awk 'NR%2==0' | awk -F "s3://$TO_BUCKET" '{print $2}' | cut -d' ' -f1)
s3_paths+=("$s3_path")
fi
done <<< "$output"
if [ "$DUCKDB_DEPLOY_SCRIPT_MODE" == "for_real" ]; then
echo "CLOUDFRONT INVALIDATION"
echo "> Total files: ${#s3_paths[@]}"
echo "> Domain: $CLOUDFRONT_ORIGINS"
for path in "${s3_paths[@]}"; do
aws cloudfront create-invalidation --distribution-id "$CLOUDFRONT_DISTRIBUTION_ID" --paths "$path"
done
else
echo "INVALIDATION (DRY RUN)"
echo "> Total files: ${#s3_paths[@]}"
echo "> Domain: $CLOUDFRONT_ORIGINS"
echo "> Paths:"
for path in "${s3_paths[@]}"; do
echo " $path"
done
fi
echo ""
if [ ! -z "$CLOUDFLARE_CACHE_PURGE_TOKEN" ]; then
if [ "$DUCKDB_DEPLOY_SCRIPT_MODE" == "for_real" ]; then
echo "CLOUDFLARE INVALIDATION"
echo "> Total files: ${#s3_paths[@]}"
for path in "${s3_paths[@]}"; do
curl --request POST --url https://api.cloudflare.com/client/v4/zones/84f631c38b77d4631b561207f2477332/purge_cache --header 'Content-Type: application/json' --header "Authorization: Bearer $CLOUDFLARE_CACHE_PURGE_TOKEN" --data "{\"files\": [\"http://extensions.duckdb.org$path\"]}"
echo ""
done
else
echo "CLOUDFLARE INVALIDATION (DRY RUN)"
echo "> Total files: ${#s3_paths[@]}"
echo "> Domain: $CLOUDFRONT_ORIGINS"
echo "> Paths:"
for path in "${s3_paths[@]}"; do
echo " http://extensions.duckdb.org$path"
done
fi
else
echo "##########################################"
echo "WARNING! CLOUDFLARE INVALIDATION DISABLED!"
echo "##########################################"
fi

View File

@@ -0,0 +1,56 @@
#!/bin/bash
# Uploads all extensions found in <base_dir_glob> (default: build/release/extension/*)
# this script is used by DuckDB CI to upload all extensions at once
# Usage: ./extension-upload-all.sh <base_dir_glob>
# Expected directory structure: <base_dir_glob>/<duckdb_version>/<architecture>/
# The directory that the script lives in, thanks @Tishj
script_dir="$(dirname "$(readlink -f "$0")")"
if [ -z "$1" ]; then
BASE_DIR="build/release/repository/*"
else
BASE_DIR="$1"
fi
echo $BASE_DIR
set -e
# Ensure we do nothing on failed globs
shopt -s nullglob
if [ "$DUCKDB_DEPLOY_SCRIPT_MODE" == "for_real" ]; then
echo "Deploying extensions.."
else
echo "Deploying extensions.. (DRY RUN)"
fi
for version_dir in $BASE_DIR/*; do
duckdb_version=$(basename "$version_dir")
for arch_dir in "$version_dir"/*; do
architecture=$(basename "$arch_dir")
if [[ $architecture == wasm* ]]; then
FILES="$arch_dir/*.duckdb_extension.wasm"
else
FILES="$arch_dir/*.duckdb_extension"
fi
for f in $FILES; do
if [[ $architecture == wasm* ]]; then
ext_name=`basename $f .duckdb_extension.wasm`
else
ext_name=`basename $f .duckdb_extension`
fi
echo "Processing extension: $ext_name (architecture: $architecture, version: $duckdb_version, path: $f)"
# args: <name> <extension_version> <duckdb_version> <architecture> <s3_bucket> <copy_to_latest> <copy_to_versioned> [<path_to_ext>]
$script_dir/extension-upload-single.sh $ext_name "" "$duckdb_version" "$architecture" "duckdb-core-extensions" true false "$(dirname "$f")"
done
echo ""
done
done

View File

@@ -0,0 +1,94 @@
#!/bin/bash
# Main extension uploading script
# Note: use the DUCKDB_DEPLOY_SCRIPT_MODE variable to disable dryrun mode
# Usage: ./extension-upload-single.sh <name> <extension_version> <duckdb_version> <architecture> <s3_bucket> <copy_to_latest> <copy_to_versioned> [<path_to_ext>]
# <name> : Name of the extension
# <extension_version> : Version (commit / version tag) of the extension
# <duckdb_version> : Version (commit / version tag) of DuckDB
# <architecture> : Architecture target of the extension binary
# <s3_bucket> : S3 bucket to upload to
# <copy_to_latest> : Set this as the latest version ("true" / "false", default: "false")
# <copy_to_versioned> : Set this as a versioned version that will not be overwritten
# <path_to_ext> : (optional) Search this path for the extension
set -e
if [ -z "$8" ]; then
BASE_EXT_DIR="/tmp/extension"
else
BASE_EXT_DIR="$8"
fi
if [[ $4 == wasm* ]]; then
ext="$BASE_EXT_DIR/$1.duckdb_extension.wasm"
else
ext="$BASE_EXT_DIR/$1.duckdb_extension"
fi
script_dir="$(dirname "$(readlink -f "$0")")"
# calculate SHA256 hash of extension binary
cat $ext > $ext.append
( command -v truncate && truncate -s -256 $ext.append ) || ( command -v gtruncate && gtruncate -s -256 $ext.append ) || exit 1
# (Optionally) Sign binary
if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then
echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem
$script_dir/compute-extension-hash.sh $ext.append > $ext.hash
openssl pkeyutl -sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign
rm -f private.pem
else
# Default to 256 zeros
dd if=/dev/zero of=$ext.sign bs=256 count=1
fi
# append signature to extension binary
cat $ext.sign >> $ext.append
# compress extension binary
if [[ $4 == wasm_* ]]; then
brotli < $ext.append > "$ext.compressed"
else
gzip < $ext.append > "$ext.compressed"
fi
set -e
# Abort if AWS key is not set
if [ -z "$AWS_ACCESS_KEY_ID" ]; then
echo "No AWS key found, skipping.."
exit 0
fi
# Set dry run unless guard var is set
DRY_RUN_PARAM="--dryrun"
if [ "$DUCKDB_DEPLOY_SCRIPT_MODE" == "for_real" ]; then
DRY_RUN_PARAM=""
fi
# upload versioned version
if [[ $7 = 'true' ]]; then
if [ -z "$3" ]; then
echo "extension-upload-single.sh called with upload_versioned=true but no extension version was passed"
exit 1
fi
if [[ $4 == wasm* ]]; then
aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm $DRY_RUN_PARAM --acl public-read --content-encoding br --content-type="application/wasm"
else
aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz $DRY_RUN_PARAM --acl public-read
fi
fi
# upload to latest version
if [[ $6 = 'true' ]]; then
if [[ $4 == wasm* ]]; then
aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm $DRY_RUN_PARAM --acl public-read --content-encoding br --content-type="application/wasm"
else
aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz $DRY_RUN_PARAM --acl public-read
fi
fi

View File

@@ -0,0 +1,64 @@
#!/bin/bash
set -e
set -x
CMAKE_CONFIG=Release
EXT_BASE_PATH=build/release
if [ "${FORCE_32_BIT:0}" == "1" ]; then
FORCE_32_BIT_FLAG="-DFORCE_32_BIT=1"
else
FORCE_32_BIT_FLAG=""
fi
FILES="${EXT_BASE_PATH}/extension/*/*.duckdb_extension"
EXTENSION_LIST=""
for f in $FILES
do
ext=`basename $f .duckdb_extension`
EXTENSION_LIST="${EXTENSION_LIST}-$ext"
done
mkdir -p testext
cd testext
if [ "$2" = "oote" ]; then
CMAKE_ROOT="../duckdb"
else
CMAKE_ROOT=".."
fi
cmake -DCMAKE_BUILD_TYPE=${CMAKE_CONFIG} ${FORCE_32_BIT_FLAG} -DEXTENSION_TESTS_ONLY=1 -DDUCKDB_EXTENSION_CONFIGS=".github/config/in_tree_extensions.cmake;.github/config/out_of_tree_extensions.cmake" ${CMAKE_ROOT}
cmake --build . --config ${CMAKE_CONFIG}
cd ..
duckdb_path="testext/duckdb"
unittest_path="testext/test/unittest"
if [ ! -f "${duckdb_path}" ]; then
duckdb_path="testext/${CMAKE_CONFIG}/duckdb.exe"
unittest_path="testext/test/${CMAKE_CONFIG}/unittest.exe"
fi
${duckdb_path} -c "FROM duckdb_extensions()"
for f in $FILES
do
ext=`basename $f .duckdb_extension`
install_path=${ext}
unsigned_flag=
if [ "$1" = "local" ]
then
install_path=${f}
unsigned_flag=-unsigned
fi
echo ${install_path}
${duckdb_path} ${unsigned_flag} -c "FORCE INSTALL '${install_path}'"
${duckdb_path} ${unsigned_flag} -c "LOAD '${ext}'"
done
# Only run tests for non-local, we have tested in enough other ways
if [ "$1" != "local" ]
then
${unittest_path} --autoloading all --skip-compiled
fi

View File

@@ -0,0 +1,59 @@
#!/bin/bash
# Usage: ./extension-upload-wasm.sh <architecture> <commithash or version_tag>
set -e
# Ensure we do nothing on failed globs
shopt -s nullglob
if [[ -z "${DUCKDB_EXTENSION_SIGNING_PK}" ]]; then
# no private key provided, use the test private key (NOT SAFE)
# this is made so private.pem at the end of the block will be in
# a valid state, and the rest of the signing process can be tested
# even without providing the key
cp test/mbedtls/private.pem private.pem
else
# actual private key provided
echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem
fi
FILES="build/to_be_deployed/$2/$1/*.duckdb_extension.wasm"
for f in $FILES
do
ext=`basename $f .duckdb_extension.wasm`
echo $ext
# calculate SHA256 hash of extension binary
cat $f > $f.append
# 0 for custom section
# 113 in hex = 275 in decimal, total length of what follows (1 + 16 + 2 + 256)
# [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02]
echo -n -e '\x00' >> $f.append
echo -n -e '\x93\x02' >> $f.append
# 10 in hex = 16 in decimal, length of name, 1 byte
echo -n -e '\x10' >> $f.append
echo -n -e 'duckdb_signature' >> $f.append
# the name of the WebAssembly custom section, 16 bytes
# 100 in hex, 256 in decimal
# [1(continuation) + 0000000(payload) = \x80, 0(continuation) + 10(payload) = \x02],
# for a grand total of 2 bytes
echo -n -e '\x80\x02' >> $f.append
# the actual payload, 256 bytes, to be added later
scripts/compute-extension-hash.sh $f.append > $f.hash
# encrypt hash with extension signing private key to create signature
openssl pkeyutl -sign -in $f.hash -inkey private.pem -pkeyopt digest:sha256 -out $f.sign
# append signature to extension binary
cat $f.sign >> $f.append
# compress extension binary
brotli < $f.append > "$f.brotli"
# upload compressed extension binary to S3
if [[ -z "${AWS_SECRET_ACCESS_KEY}" ]]; then
#AWS_SECRET_ACCESS_KEY is empty -> dry run
aws s3 cp $f.brotli s3://duckdb-core-extensions/$2/$1/$ext.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" --dryrun
else
aws s3 cp $f.brotli s3://duckdb-core-extensions/$2/$1/$ext.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm"
fi
done
# remove private key
rm private.pem
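
The two-byte length prefixes in the comments above are unsigned LEB128 encodings; a small standalone check of those byte values:

def uleb128(n):
    # Unsigned LEB128: 7 payload bits per byte, high bit set on every byte except the last.
    out = bytearray()
    while True:
        byte = n & 0x7F
        n >>= 7
        if n:
            out.append(byte | 0x80)
        else:
            out.append(byte)
            return bytes(out)

assert uleb128(275) == b'\x93\x02'  # 1 + 16 + 2 + 256: total custom-section payload length
assert uleb128(16) == b'\x10'       # length of the section name "duckdb_signature"
assert uleb128(256) == b'\x80\x02'  # length of the 256-byte signature payload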

461
external/duckdb/scripts/format.py vendored Normal file
View File

@@ -0,0 +1,461 @@
#!/usr/bin/python
# this script is used to format the source directory
import os
import time
import sys
import inspect
import subprocess
import difflib
import re
import tempfile
import uuid
import concurrent.futures
import argparse
import shutil
import traceback
from python_helpers import open_utf8
try:
ver = subprocess.check_output(('black', '--version'), text=True)
if int(ver.split(' ')[1].split('.')[0]) < 24:
print('you need to run `pip install "black>=24"`', ver)
exit(-1)
except Exception as e:
print('you need to run `pip install "black>=24"`', e)
exit(-1)
try:
ver = subprocess.check_output(('clang-format', '--version'), text=True)
if '11.' not in ver:
print('you need to run `pip install clang_format==11.0.1 - `', ver)
exit(-1)
except Exception as e:
print('you need to run `pip install clang_format==11.0.1 - `', e)
exit(-1)
cpp_format_command = 'clang-format --sort-includes=0 -style=file'
cmake_format_command = 'cmake-format'
try:
subprocess.check_output(('cmake-format', '--version'), text=True)
except Exception as e:
print('you need to run `pip install cmake-format`', e)
exit(-1)
extensions = [
'.cpp',
'.ipp',
'.c',
'.hpp',
'.h',
'.cc',
'.hh',
'CMakeLists.txt',
'.test',
'.test_slow',
'.test_coverage',
'.benchmark',
'.py',
'.java',
]
formatted_directories = ['src', 'benchmark', 'test', 'tools', 'examples', 'extension', 'scripts']
ignored_files = [
'tpch_constants.hpp',
'tpcds_constants.hpp',
'_generated',
'tpce_flat_input.hpp',
'test_csv_header.hpp',
'duckdb.cpp',
'duckdb.hpp',
'json.hpp',
'sqlite3.h',
'shell.c',
'termcolor.hpp',
'test_insert_invalid.test',
'httplib.hpp',
'os_win.c',
'glob.c',
'printf.c',
'helper.hpp',
'single_thread_ptr.hpp',
'types.hpp',
'default_views.cpp',
'default_functions.cpp',
'release.h',
'genrand.cpp',
'address.cpp',
'visualizer_constants.hpp',
'icu-collate.cpp',
'icu-collate.hpp',
'yyjson.cpp',
'yyjson.hpp',
'duckdb_pdqsort.hpp',
'pdqsort.h',
'stubdata.cpp',
'nf_calendar.cpp',
'nf_calendar.h',
'nf_localedata.cpp',
'nf_localedata.h',
'nf_zformat.cpp',
'nf_zformat.h',
'expr.cc',
'function_list.cpp',
'inlined_grammar.hpp',
]
ignored_directories = [
'.eggs',
'__pycache__',
'dbgen',
os.path.join('tools', 'rpkg', 'src', 'duckdb'),
os.path.join('tools', 'rpkg', 'inst', 'include', 'cpp11'),
os.path.join('extension', 'tpcds', 'dsdgen'),
os.path.join('extension', 'jemalloc', 'jemalloc'),
os.path.join('extension', 'icu', 'third_party'),
os.path.join('tools', 'nodejs', 'src', 'duckdb'),
]
format_all = False
check_only = True
confirm = True
silent = False
force = False
parser = argparse.ArgumentParser(prog='python scripts/format.py', description='Format source directory files')
parser.add_argument(
'revision', nargs='?', default='HEAD', help='Revision number or --all to format all files (default: HEAD)'
)
parser.add_argument('--check', action='store_true', help='Only print differences (default)')
parser.add_argument('--fix', action='store_true', help='Fix the files')
parser.add_argument('-a', '--all', action='store_true', help='Format all files')
parser.add_argument('-d', '--directories', nargs='*', default=[], help='Format specified directories')
parser.add_argument('-y', '--noconfirm', action='store_true', help='Skip confirmation prompt')
parser.add_argument('-q', '--silent', action='store_true', help='Suppress output')
parser.add_argument('-f', '--force', action='store_true', help='Force formatting')
args = parser.parse_args()
revision = args.revision
if args.check and args.fix:
parser.print_usage()
exit(1)
check_only = not args.fix
confirm = not args.noconfirm
silent = args.silent
force = args.force
format_all = args.all
if args.directories:
formatted_directories = args.directories
def file_is_ignored(full_path):
if os.path.basename(full_path) in ignored_files:
return True
dirnames = os.path.sep.join(full_path.split(os.path.sep)[:-1])
for ignored_directory in ignored_directories:
if ignored_directory in dirnames:
return True
return False
def can_format_file(full_path):
global extensions, formatted_directories, ignored_files
if not os.path.isfile(full_path):
return False
fname = full_path.split(os.path.sep)[-1]
found = False
# check file extension
for ext in extensions:
if full_path.endswith(ext):
found = True
break
if not found:
return False
# check ignored files
if file_is_ignored(full_path):
return False
# now check file directory
for dname in formatted_directories:
if full_path.startswith(dname):
return True
return False
action = "Formatting"
if check_only:
action = "Checking"
def get_changed_files(revision):
proc = subprocess.Popen(['git', 'diff', '--name-only', revision], stdout=subprocess.PIPE)
files = proc.stdout.read().decode('utf8').split('\n')
changed_files = []
for f in files:
if not can_format_file(f):
continue
if file_is_ignored(f):
continue
changed_files.append(f)
return changed_files
if os.path.isfile(revision):
print(action + " individual file: " + revision)
changed_files = [revision]
elif os.path.isdir(revision):
print(action + " files in directory: " + revision)
changed_files = [os.path.join(revision, x) for x in os.listdir(revision)]
print("Changeset:")
for fname in changed_files:
print(fname)
elif not format_all:
if revision == 'main':
# fetch new changes when comparing to main
os.system("git fetch origin main:main")
print(action + " since branch or revision: " + revision)
changed_files = get_changed_files(revision)
if len(changed_files) == 0:
print("No changed files found!")
exit(0)
print("Changeset:")
for fname in changed_files:
print(fname)
else:
print(action + " all files")
if confirm and not check_only:
print("The files listed above will be reformatted.")
result = input("Continue with changes (y/n)?\n")
if result != 'y':
print("Aborting.")
exit(0)
format_commands = {
'.cpp': cpp_format_command,
'.ipp': cpp_format_command,
'.c': cpp_format_command,
'.hpp': cpp_format_command,
'.h': cpp_format_command,
'.hh': cpp_format_command,
'.cc': cpp_format_command,
'.txt': cmake_format_command,
'.py': 'black --quiet - --skip-string-normalization --line-length 120 --stdin-filename',
'.java': cpp_format_command,
}
difference_files = []
header_top = "//===----------------------------------------------------------------------===//\n"
header_top += "// DuckDB\n" + "//\n"
header_bottom = "//\n" + "//\n"
header_bottom += "//===----------------------------------------------------------------------===//\n\n"
base_dir = os.path.join(os.getcwd(), 'src/include')
def get_formatted_text(f, full_path, directory, ext):
if not can_format_file(full_path):
if not force:
print(
"File "
+ full_path
+ " is not normally formatted - but attempted to format anyway. Use --force if formatting is desirable"
)
exit(1)
if f == 'list.hpp':
# fill in list file
file_list = [
os.path.join(dp, f)
for dp, dn, filenames in os.walk(directory)
for f in filenames
if os.path.splitext(f)[1] == '.hpp' and not f.endswith("list.hpp")
]
file_list = [x.replace('src/include/', '') for x in file_list]
file_list.sort()
result = ""
for x in file_list:
result += '#include "%s"\n' % (x)
return result
if ext == ".hpp" and directory.startswith("src/include"):
with open_utf8(full_path, 'r') as f:
lines = f.readlines()
# format header in files
header_middle = "// " + os.path.relpath(full_path, base_dir) + "\n"
text = header_top + header_middle + header_bottom
is_old_header = True
for line in lines:
if not (line.startswith("//") or line.startswith("\n")) and is_old_header:
is_old_header = False
if not is_old_header:
text += line
if ext == '.test' or ext == '.test_slow' or ext == '.test_coverage' or ext == '.benchmark':
f = open_utf8(full_path, 'r')
lines = f.readlines()
f.close()
found_name = False
found_group = False
group_name = full_path.split('/')[-2]
new_path_line = '# name: ' + full_path + '\n'
new_group_line = '# group: [' + group_name + ']' + '\n'
found_diff = False
# Find description.
found_description = False
for line in lines:
if line.lower().startswith('# description:') or line.lower().startswith('#description:'):
if found_description:
print("Error formatting file " + full_path + ", multiple lines starting with # description found")
exit(1)
found_description = True
new_description_line = '# description: ' + line.split(':', 1)[1].strip() + '\n'
# Filter old meta.
meta = ['#name:', '# name:', '#description:', '# description:', '#group:', '# group:']
lines = [line for line in lines if not any(line.lower().startswith(m) for m in meta)]
# Clean up empty leading lines.
while lines and not lines[0].strip():
lines.pop(0)
# Ensure header is prepended.
header = [new_path_line]
if found_description:
header.append(new_description_line)
header.append(new_group_line)
header.append('\n')
return ''.join(header + lines)
proc_command = format_commands[ext].split(' ') + [full_path]
proc = subprocess.Popen(
proc_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=open(full_path) if ext == '.py' else None
)
new_text = proc.stdout.read().decode('utf8')
stderr = proc.stderr.read().decode('utf8')
if len(stderr) > 0:
print(os.getcwd())
print("Failed to format file " + full_path)
print(' '.join(proc_command))
print(stderr)
exit(1)
new_text = new_text.replace('\r', '')
new_text = re.sub(r'\n*$', '', new_text)
return new_text + '\n'
def file_is_generated(text):
if '// This file is automatically generated by scripts/' in text:
return True
return False
def format_file(f, full_path, directory, ext):
global difference_files
with open_utf8(full_path, 'r') as f:
old_text = f.read()
# do not format auto-generated files
if file_is_generated(old_text) and ext != '.py':
return
old_lines = old_text.split('\n')
new_text = get_formatted_text(f, full_path, directory, ext)
if ext in ('.cpp', '.hpp'):
new_text = new_text.replace('ARGS &&...args', 'ARGS &&... args')
if check_only:
new_lines = new_text.split('\n')
old_lines = [x for x in old_lines if '...' not in x]
new_lines = [x for x in new_lines if '...' not in x]
diff_result = difflib.unified_diff(old_lines, new_lines)
total_diff = ""
for diff_line in diff_result:
total_diff += diff_line + "\n"
total_diff = total_diff.strip()
if len(total_diff) > 0:
print("----------------------------------------")
print("----------------------------------------")
print("Found differences in file " + full_path)
print("----------------------------------------")
print("----------------------------------------")
print(total_diff)
difference_files.append(full_path)
else:
tmpfile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
with open_utf8(tmpfile, 'w+') as f:
f.write(new_text)
shutil.move(tmpfile, full_path)
class ToFormatFile:
def __init__(self, filename, full_path, directory):
self.filename = filename
self.full_path = full_path
self.directory = directory
self.ext = '.' + filename.split('.')[-1]
def format_directory(directory):
files = os.listdir(directory)
files.sort()
result = []
for f in files:
full_path = os.path.join(directory, f)
if os.path.isdir(full_path):
if f in ignored_directories or full_path in ignored_directories:
continue
result += format_directory(full_path)
elif can_format_file(full_path):
result += [ToFormatFile(f, full_path, directory)]
return result
files = []
if format_all:
try:
os.system(cmake_format_command.replace("${FILE}", "CMakeLists.txt"))
except:
pass
for direct in formatted_directories:
files += format_directory(direct)
else:
for full_path in changed_files:
splits = full_path.split(os.path.sep)
fname = splits[-1]
dirname = os.path.sep.join(splits[:-1])
files.append(ToFormatFile(fname, full_path, dirname))
def process_file(f):
if not silent:
print(f.full_path)
try:
format_file(f.filename, f.full_path, f.directory, f.ext)
except:
print(traceback.format_exc())
sys.exit(1)
# Create thread for each file
with concurrent.futures.ThreadPoolExecutor() as executor:
try:
threads = [executor.submit(process_file, f) for f in files]
# Wait for all tasks to complete
concurrent.futures.wait(threads)
except KeyboardInterrupt:
executor.shutdown(wait=True, cancel_futures=True)
raise
if check_only:
if len(difference_files) > 0:
print("")
print("")
print("")
print("Failed format-check: differences were found in the following files:")
for fname in difference_files:
print("- " + fname)
print('Run "make format-fix" to fix these differences automatically')
exit(1)
else:
print("Passed format-check")
exit(0)

View File

@@ -0,0 +1,81 @@
import os
import re
import json
header = '''//===----------------------------------------------------------------------===//
// DuckDB
//
// duckdb/catalog/default/builtin_types/types.hpp
//
//
//===----------------------------------------------------------------------===//
// This file is generated by scripts/generate_builtin_types.py
#pragma once
#include "duckdb/common/types.hpp"
#include "duckdb/common/array.hpp"
namespace duckdb {
'''
footer = '''} // namespace duckdb
'''
def normalize_path_separators(x):
return os.path.sep.join(x.split('/'))
def legal_struct_name(name):
return name.isalnum()
def get_struct_name(function_name):
return function_name.replace('_', ' ').title().replace(' ', '') + 'Fun'
def sanitize_string(text):
return text.replace('"', '\\"')
new_text = header
type_entries = []
json_path = normalize_path_separators(f'src/include/duckdb/catalog/default/builtin_types/types.json')
with open(json_path, 'r') as f:
parsed_json = json.load(f)
# Extract all the types from the json
for type in parsed_json:
names = type['names']
type_id = type['id']
type_entries += ['\t{' + f'''"{name}", LogicalTypeId::{type_id}''' + '}' for name in names]
TYPE_COUNT = len(type_entries)
new_text += '''
struct DefaultType {
const char *name;
LogicalTypeId type;
};
'''
new_text += f'''
using builtin_type_array = std::array<DefaultType, {TYPE_COUNT}>;
'''
new_text += '''
static constexpr const builtin_type_array BUILTIN_TYPES{{
'''
type_text = ",\n".join(type_entries)
new_text += type_text
new_text += '''
}};
'''
new_text += footer
with open('src/include/duckdb/catalog/default/builtin_types/types.hpp', 'w+') as f:
f.write(new_text)
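
For illustration, a minimal sketch of the JSON shape the script consumes and the entry text it produces; the "BIGINT"/"bigint"/"int8" values are placeholders rather than the actual contents of types.json:

import json

parsed_json = json.loads('[{"id": "BIGINT", "names": ["bigint", "int8"]}]')  # hypothetical input

type_entries = []
for entry in parsed_json:
    type_entries += ['\t{' + f'"{name}", LogicalTypeId::{entry["id"]}' + '}' for name in entry['names']]
print(",\n".join(type_entries))
# 	{"bigint", LogicalTypeId::BIGINT},
# 	{"int8", LogicalTypeId::BIGINT}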

1002
external/duckdb/scripts/generate_c_api.py vendored Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,98 @@
# this script generates data for the TPC-H dbgen
import os
from python_helpers import open_utf8
def get_csv_text(fpath, add_null_terminator=False):
with open(fpath, 'rb') as f:
text = bytearray(f.read())
result_text = ""
first = True
for byte in text:
if first:
result_text += str(byte)
else:
result_text += ", " + str(byte)
first = False
if add_null_terminator:
result_text += ", 0"
return result_text
def write_dir(dirname, varname):
files = os.listdir(dirname)
files.sort()
result = ""
aggregated_result = "const char *%s[] = {\n" % (varname,)
for fname in files:
file_varname = "%s_%s" % (varname, fname.split('.')[0])
result += "const uint8_t %s[] = {" % (file_varname,) + get_csv_text(os.path.join(dirname, fname), True) + "};\n"
aggregated_result += "\t(const char*) %s,\n" % (file_varname,)
aggregated_result = aggregated_result[:-2] + "\n};\n"
return result + aggregated_result
# ------------------------------------------- #
# ------------------------------------------- #
# ------------- TPC-H ------------ #
# ------------------------------------------- #
# ------------------------------------------- #
tpch_dir = 'extension/tpch/dbgen'
tpch_queries = os.path.join(tpch_dir, 'queries')
tpch_answers_sf001 = os.path.join(tpch_dir, 'answers', 'sf0.01')
tpch_answers_sf01 = os.path.join(tpch_dir, 'answers', 'sf0.1')
tpch_answers_sf1 = os.path.join(tpch_dir, 'answers', 'sf1')
tpch_header = os.path.join(tpch_dir, 'include', 'tpch_constants.hpp')
def create_tpch_header(tpch_dir):
result = """/* THIS FILE WAS AUTOMATICALLY GENERATED BY generate_csv_header.py */
#pragma once
const int TPCH_QUERIES_COUNT = 22;
"""
# write the queries
result += write_dir(tpch_queries, "TPCH_QUERIES")
result += write_dir(tpch_answers_sf001, "TPCH_ANSWERS_SF0_01")
result += write_dir(tpch_answers_sf01, "TPCH_ANSWERS_SF0_1")
result += write_dir(tpch_answers_sf1, "TPCH_ANSWERS_SF1")
with open_utf8(tpch_header, 'w+') as f:
f.write(result)
print(tpch_header)
create_tpch_header(tpch_dir)
# ------------------------------------------- #
# ------------------------------------------- #
# ------------- TPC-DS ------------ #
# ------------------------------------------- #
# ------------------------------------------- #
tpcds_dir = 'extension/tpcds/dsdgen'
tpcds_queries = os.path.join(tpcds_dir, 'queries')
tpcds_answers_sf001 = os.path.join(tpcds_dir, 'answers', 'sf0.01')
tpcds_answers_sf1 = os.path.join(tpcds_dir, 'answers', 'sf1')
tpcds_header = os.path.join(tpcds_dir, 'include', 'tpcds_constants.hpp')
def create_tpcds_header(tpch_dir):
result = """/* THIS FILE WAS AUTOMATICALLY GENERATED BY generate_csv_header.py */
#pragma once
const int TPCDS_QUERIES_COUNT = 99;
const int TPCDS_TABLE_COUNT = 24;
"""
# write the queries
result += write_dir(tpcds_queries, "TPCDS_QUERIES")
result += write_dir(tpcds_answers_sf001, "TPCDS_ANSWERS_SF0_01")
result += write_dir(tpcds_answers_sf1, "TPCDS_ANSWERS_SF1")
with open_utf8(tpcds_header, 'w+') as f:
f.write(result)
print(tpcds_header)
create_tpcds_header(tpcds_dir)
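
As a small illustration of the embedding format used by get_csv_text (the query text is invented), each file becomes a comma-separated list of its bytes, optionally followed by a NUL terminator:

# Hypothetical example of what get_csv_text produces for a tiny query file.
data = bytearray(b"SELECT 1;")
print(", ".join(str(b) for b in data) + ", 0")
# 83, 69, 76, 69, 67, 84, 32, 49, 59, 0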

View File

@@ -0,0 +1,245 @@
import os
import csv
import re
import argparse
import glob
os.chdir(os.path.dirname(__file__))
# Don't generate serialization for these enums
blacklist = [
"RegexOptions",
"Flags",
"ContainerType",
"Type",
"DictionaryAppendState",
"DictFSSTMode",
"ComplexJSONType",
]
enum_util_header_file = os.path.join("..", "src", "include", "duckdb", "common", "enum_util.hpp")
enum_util_source_file = os.path.join("..", "src", "common", "enum_util.cpp")
# Overrides conversions for the following enums:
overrides = {
"LogicalTypeId": {
"SQLNULL": "NULL",
"TIMESTAMP_TZ": "TIMESTAMP WITH TIME ZONE",
"TIME_TZ": "TIME WITH TIME ZONE",
"TIMESTAMP_SEC": "TIMESTAMP_S",
},
"JoinType": {"OUTER": "FULL"},
"OrderType": {
"ORDER_DEFAULT": ["ORDER_DEFAULT", "DEFAULT"],
"DESCENDING": ["DESCENDING", "DESC"],
"ASCENDING": ["ASCENDING", "ASC"],
},
"OrderByNullType": {
"ORDER_DEFAULT": ["ORDER_DEFAULT", "DEFAULT"],
"NULLS_FIRST": ["NULLS FIRST", "NULLS_FIRST"],
"NULLS_LAST": ["NULLS LAST", "NULLS_LAST"],
},
"CheckpointAbort": {
"NO_ABORT": "NONE",
"DEBUG_ABORT_BEFORE_TRUNCATE": "BEFORE_TRUNCATE",
"DEBUG_ABORT_BEFORE_HEADER": "BEFORE_HEADER",
"DEBUG_ABORT_AFTER_FREE_LIST_WRITE": "AFTER_FREE_LIST_WRITE",
},
"SampleMethod": {"SYSTEM_SAMPLE": "System", "BERNOULLI_SAMPLE": "Bernoulli", "RESERVOIR_SAMPLE": "Reservoir"},
"TableReferenceType": {"EMPTY_FROM": "EMPTY"},
"LogLevel": {
"LOG_TRACE": "TRACE",
"LOG_DEBUG": "DEBUG",
"LOG_INFO": "INFO",
"LOG_WARN": "WARN",
"LOG_ERROR": "ERROR",
"LOG_FATAL": "FATAL",
},
"RequestType": {
"GET_REQUEST": "GET",
"PUT_REQUEST": "PUT",
"HEAD_REQUEST": "HEAD",
"DELETE_REQUEST": "DELETE",
"POST_REQUEST": "POST",
},
"ArrowFormatVersion": {"V1_0": "1.0", "V1_1": "1.1", "V1_2": "1.2", "V1_3": "1.3", "V1_4": "1.4", "V1_5": "1.5"},
}
# get all the headers
hpp_files = []
for root, dirs, files in os.walk(os.path.join("..", "src")):
for file in files:
# Don't include the generated header itself recursively
if file == "enum_util.hpp":
continue
if 'amalgamation' in root:
continue
if file.endswith(".hpp"):
hpp_files.append(os.path.join(root, file))
def remove_prefix(str, prefix):
if str.startswith(prefix):
return str[len(prefix) :]
return str
# get all the enum classes
enums = []
enum_paths = []
enum_path_set = set()
for hpp_file in hpp_files:
with open(hpp_file, "r") as f:
text = f.read()
for res in re.finditer(r"enum class (\w*)\s*:\s*(\w*)\s*{((?:\s*[^}])*)}", text, re.MULTILINE):
file_path = remove_prefix(os.path.relpath(hpp_file, os.path.join("..", "src")), "include/")
enum_name = res.group(1)
if enum_name in blacklist:
print(f"Skipping {enum_name} because it is blacklisted")
continue
enum_type = res.group(2)
enum_members = []
# Capture All members: \w+(\s*\=\s*-?\w*)?
# group one is the member name
# group two is the member value
# First clean group from comments
s = res.group(3)
s = re.sub(r"\/\/.*", "", s)
s = re.sub(r"\/\*.*\*\/", "", s)
enum_values = {}
for member in re.finditer(r"(\w+)(\s*\=\s*-?\w*)?", s):
key = member.group(1)
strings = [key]
if enum_name in overrides and key in overrides[enum_name]:
override = overrides[enum_name][key]
if isinstance(override, list):
print(f"Overriding {enum_name}::{key} to one of {override}")
strings = override
else:
print(f"Overriding {enum_name}::{key} to {override}")
strings = [override]
if member.group(2):
# If the member has a value, make sure it isn't already covered by another member
# If it is, we can't do anything other than ignore it
value = remove_prefix(member.group(2).strip(), "=").strip()
if value not in enum_values and value not in dict(enum_members):
enum_members.append((key, strings))
else:
print(f"Skipping {enum_name}::{key} because it has a duplicate value {value}")
else:
enum_members.append((key, strings))
if not file_path in enum_path_set:
enum_path_set.add(file_path)
enum_paths.append(file_path)
enums.append((enum_name, enum_type, enum_members))
enum_paths.sort()
enums.sort(key=lambda x: x[0])
header = """//-------------------------------------------------------------------------
// This file is automatically generated by scripts/generate_enum_util.py
// Do not edit this file manually, your changes will be overwritten
// If you want to exclude an enum from serialization, add it to the blacklist in the script
//
// Note: The generated code will only work properly if the enum is a top level item in the duckdb namespace
// If the enum is nested in a class, or in another namespace, the generated code will not compile.
// You should move the enum to the duckdb namespace, manually write a specialization or add it to the blacklist
//-------------------------------------------------------------------------\n\n
"""
# Write the enum util header
with open(enum_util_header_file, "w") as f:
f.write(header)
f.write('#pragma once\n\n')
f.write('#include <stdint.h>\n')
f.write('#include "duckdb/common/string.hpp"\n\n')
f.write("namespace duckdb {\n\n")
f.write(
"""struct EnumUtil {
// String -> Enum
template <class T>
static T FromString(const char *value) = delete;
template <class T>
static T FromString(const string &value) { return FromString<T>(value.c_str()); }
// Enum -> String
template <class T>
static const char *ToChars(T value) = delete;
template <class T>
static string ToString(T value) { return string(ToChars<T>(value)); }
};\n\n"""
)
# Forward declare all enums
for enum_name, enum_type, _ in enums:
f.write(f"enum class {enum_name} : {enum_type};\n\n")
f.write("\n")
# Forward declare all enum serialization functions
for enum_name, enum_type, _ in enums:
f.write(f"template<>\nconst char* EnumUtil::ToChars<{enum_name}>({enum_name} value);\n\n")
f.write("\n")
# Forward declare all enum deserialization functions
for enum_name, enum_type, _ in enums:
f.write(f"template<>\n{enum_name} EnumUtil::FromString<{enum_name}>(const char *value);\n\n")
f.write("\n")
f.write("}\n")
with open(enum_util_source_file, "w") as f:
f.write(header)
f.write('#include "duckdb/common/enum_util.hpp"\n')
# Write the includes
for enum_path in enum_paths:
f.write(f'#include "{enum_path}"\n')
f.write("\n")
f.write("namespace duckdb {\n\n")
for enum_name, enum_type, enum_members in enums:
enum_string_array = "Get" + enum_name + "Values()"
# Write the enum from string
f.write(f"const StringUtil::EnumStringLiteral *{enum_string_array} {{\n")
f.write(f"\tstatic constexpr StringUtil::EnumStringLiteral values[] {{\n")
member_count = 0
for key, strings in enum_members:
for str_val in strings:
if member_count != 0:
f.write(",\n")
f.write(f"\t\t{{ static_cast<uint32_t>({enum_name}::{key}), \"{str_val}\" }}")
member_count += 1
f.write("\n\t};")
f.write("\n\treturn values;")
f.write("\n}\n\n")
f.write(f"template<>\nconst char* EnumUtil::ToChars<{enum_name}>({enum_name} value) {{\n")
f.write(
f"\treturn StringUtil::EnumToString({enum_string_array}, {member_count}, \"{enum_name}\", static_cast<uint32_t>(value));\n"
)
f.write("}\n\n")
# Write the string to enum
f.write(f"template<>\n{enum_name} EnumUtil::FromString<{enum_name}>(const char *value) {{\n")
f.write(
f"\treturn static_cast<{enum_name}>(StringUtil::StringToEnum({enum_string_array}, {member_count}, \"{enum_name}\", value));"
)
f.write("\n}\n\n")
f.write("}\n\n")

View File

@@ -0,0 +1,161 @@
import os
import json
import re
targets = [{'source': 'extension/json/include/', 'target': 'extension/json'}]
file_list = []
for target in targets:
source_base = os.path.sep.join(target['source'].split('/'))
target_base = os.path.sep.join(target['target'].split('/'))
for fname in os.listdir(source_base):
if '_enums.json' not in fname:
continue
file_list.append(
{
'source': os.path.join(source_base, fname),
'include_path': fname.replace('.json', '.hpp'),
'target_hpp': os.path.join(source_base, fname.replace('.json', '.hpp')),
'target_cpp': os.path.join(target_base, fname.replace('.json', '.cpp')),
}
)
header = '''//===----------------------------------------------------------------------===//
// This file is automatically generated by scripts/generate_enums.py
// Do not edit this file manually, your changes will be overwritten
//===----------------------------------------------------------------------===//
${INCLUDE_LIST}
namespace duckdb {
'''
footer = '''
} // namespace duckdb
'''
include_base = '#include "${FILENAME}"\n'
enum_header = '\nenum class ${ENUM_NAME} : ${ENUM_TYPE} {\n'
enum_footer = '};'
enum_value = '\t${ENUM_MEMBER} = ${ENUM_VALUE},\n'
enum_util_header = '''
template<>
const char* EnumUtil::ToChars<${ENUM_NAME}>(${ENUM_NAME} value);
template<>
${ENUM_NAME} EnumUtil::FromString<${ENUM_NAME}>(const char *value);
'''
enum_util_conversion_begin = '''
template<>
const char* EnumUtil::ToChars<${ENUM_NAME}>(${ENUM_NAME} value) {
switch(value) {
'''
enum_util_switch = '\tcase ${ENUM_NAME}::${ENUM_MEMBER}:\n\t\treturn "${ENUM_MEMBER}";\n'
enum_util_conversion_end = ''' default:
throw NotImplementedException(StringUtil::Format("Enum value of type ${ENUM_NAME}: '%d' not implemented", value));
}
}
'''
from_string_begin = '''
template<>
${ENUM_NAME} EnumUtil::FromString<${ENUM_NAME}>(const char *value) {
'''
from_string_comparison = ''' if (StringUtil::Equals(value, "${ENUM_MEMBER}")) {
return ${ENUM_NAME}::${ENUM_MEMBER};
}
'''
from_string_end = ''' throw NotImplementedException(StringUtil::Format("Enum value of type ${ENUM_NAME}: '%s' not implemented", value));
}
'''
class EnumMember:
def __init__(self, entry, index):
self.comment = None
self.index = index
if type(entry) == str:
self.name = entry
else:
self.name = entry['name']
if 'comment' in entry:
self.comment = entry['comment']
if 'index' in entry:
self.index = int(entry['index'])
class EnumClass:
def __init__(self, entry):
self.name = entry['name']
self.type = 'uint8_t'
self.values = []
index = 0
for value_entry in entry['values']:
self.values.append(EnumMember(value_entry, index))
index += 1
for entry in file_list:
source_path = entry['source']
target_header = entry['target_hpp']
target_source = entry['target_cpp']
include_path = entry['include_path']
with open(source_path, 'r') as f:
json_data = json.load(f)
include_list = ['duckdb/common/constants.hpp', 'duckdb/common/enum_util.hpp']
enums = []
for entry in json_data:
if 'includes' in entry:
include_list += entry['includes']
enums.append(EnumClass(entry))
with open(target_header, 'w+') as f:
include_text = '#pragma once\n\n'
include_text += ''.join([include_base.replace('${FILENAME}', x) for x in include_list])
f.write(header.replace('${INCLUDE_LIST}', include_text))
for enum in enums:
f.write(enum_header.replace('${ENUM_NAME}', enum.name).replace('${ENUM_TYPE}', enum.type))
for value in enum.values:
if value.comment is not None:
f.write('\t//! ' + value.comment + '\n')
f.write(enum_value.replace('${ENUM_MEMBER}', value.name).replace('${ENUM_VALUE}', str(value.index)))
f.write(enum_footer)
f.write('\n')
for enum in enums:
f.write(enum_util_header.replace('${ENUM_NAME}', enum.name))
f.write(footer)
with open(target_source, 'w+') as f:
source_include_list = [include_path, 'duckdb/common/string_util.hpp']
f.write(
header.replace(
'${INCLUDE_LIST}', ''.join([include_base.replace('${FILENAME}', x) for x in source_include_list])
)
)
for enum in enums:
f.write(enum_util_conversion_begin.replace('${ENUM_NAME}', enum.name))
for value in enum.values:
f.write(enum_util_switch.replace('${ENUM_MEMBER}', value.name).replace('${ENUM_NAME}', enum.name))
f.write(enum_util_conversion_end.replace('${ENUM_NAME}', enum.name))
f.write(from_string_begin.replace('${ENUM_NAME}', enum.name))
for value in enum.values:
f.write(from_string_comparison.replace('${ENUM_MEMBER}', value.name).replace('${ENUM_NAME}', enum.name))
f.write(from_string_end.replace('${ENUM_NAME}', enum.name))
f.write(footer)

View File

@@ -0,0 +1,972 @@
import os
import csv
import re
import argparse
import glob
from typing import Set, Tuple, cast
import pathlib
from typing import NamedTuple
from typing import List, Dict
import json
os.chdir(os.path.join(os.path.dirname(__file__), '..'))
# Example usage:
parser = argparse.ArgumentParser(description='Generates/Validates extension_functions.hpp file')
parser.add_argument(
'--validate',
action=argparse.BooleanOptionalAction,
help='If set will validate that extension_entries.hpp is up to date, otherwise it generates the extension_functions.hpp file.',
)
parser.add_argument(
'--extension_repository',
action='store',
help="The repository to look for the '**/<extension>.duckdb_extension' files",
default='build/release/repository',
)
parser.add_argument(
'--shell',
action='store',
help="Path to the DuckDB shell",
default='build/release/duckdb',
)
parser.add_argument(
'--extensions',
action='store',
help="Comma separated list of extensions - if not provided this is read from the extension configuration",
default='',
)
args = parser.parse_args()
EXTENSIONS_PATH = os.path.join("build", "extension_configuration", "extensions.csv")
DUCKDB_PATH = os.path.join(*args.shell.split('/'))
HEADER_PATH = os.path.join("src", "include", "duckdb", "main", "extension_entries.hpp")
EXTENSION_DEPENDENCIES = {
'iceberg': [
'avro',
'parquet',
]
}
from enum import Enum
class CatalogType(str, Enum):
SCALAR = "CatalogType::SCALAR_FUNCTION_ENTRY"
TABLE = "CatalogType::TABLE_FUNCTION_ENTRY"
AGGREGATE = "CatalogType::AGGREGATE_FUNCTION_ENTRY"
PRAGMA = "CatalogType::PRAGMA_FUNCTION_ENTRY"
MACRO = "CatalogType::MACRO_ENTRY"
TABLE_MACRO = "CatalogType::TABLE_MACRO_ENTRY"
parameter_type_map = {"TIMESTAMP WITH TIME ZONE": "TIMESTAMPTZ", "TIME WITH TIME ZONE": "TIMETZ"}
def catalog_type_from_type(catalog_type: str) -> CatalogType:
TYPE_MAP = {
CatalogType.SCALAR.value: CatalogType.SCALAR,
CatalogType.TABLE.value: CatalogType.TABLE,
CatalogType.AGGREGATE.value: CatalogType.AGGREGATE,
CatalogType.PRAGMA.value: CatalogType.PRAGMA,
CatalogType.MACRO.value: CatalogType.MACRO,
CatalogType.TABLE_MACRO.value: CatalogType.TABLE_MACRO,
}
if catalog_type not in TYPE_MAP:
raise Exception(f"Unrecognized function type: '{catalog_type}'")
return TYPE_MAP[catalog_type]
def catalog_type_from_string(catalog_type: str) -> CatalogType:
TYPE_MAP = {
CatalogType.SCALAR.name.lower(): CatalogType.SCALAR,
CatalogType.TABLE.name.lower(): CatalogType.TABLE,
CatalogType.AGGREGATE.name.lower(): CatalogType.AGGREGATE,
CatalogType.PRAGMA.name.lower(): CatalogType.PRAGMA,
CatalogType.MACRO.name.lower(): CatalogType.MACRO,
CatalogType.TABLE_MACRO.name.lower(): CatalogType.TABLE_MACRO,
}
if catalog_type not in TYPE_MAP:
raise Exception(f"Unrecognized function type: '{catalog_type}'")
return TYPE_MAP[catalog_type]
def parse_records(text):
records = [] # Will hold all parsed records
current_record = [] # Holds items for the current record
current_item = [] # Accumulates characters for the current item
in_quote = False # True if we're inside a double-quoted string
inside_braces = False # True if we're inside a { ... } block
for char in text:
if char == '"':
# Toggle the quote state and always include the quote.
in_quote = not in_quote
elif char == '{' and not in_quote:
# Start of a new record.
inside_braces = True
# Reset any previous record state.
current_record = []
current_item = []
elif char == '}' and not in_quote and inside_braces:
# End of the current record.
token = ''.join(current_item).strip()
if token:
current_record.append(token)
records.append(current_record)
# Reset state for subsequent records.
current_record = []
current_item = []
inside_braces = False
elif char == ',' and not in_quote and inside_braces:
# A comma outside quotes indicates the end of the current item.
token = ''.join(current_item).strip()
if token:
current_record.append(token)
current_item = []
else:
# Otherwise, just add the character if we're inside braces.
if inside_braces:
current_item.append(char)
return records
class LogicalType(NamedTuple):
type: str
class Function(NamedTuple):
name: str
type: CatalogType
class FunctionOverload(NamedTuple):
name: str
type: CatalogType
parameters: Tuple
return_type: LogicalType
class ExtensionFunctionOverload(NamedTuple):
extension: str
name: str
type: CatalogType
parameters: Tuple
return_type: LogicalType
@staticmethod
def create_map(input: List[Tuple[str, str, str, str]]) -> Dict[Function, List["ExtensionFunctionOverload"]]:
output: Dict[Function, List["ExtensionFunctionOverload"]] = {}
for x in input:
function = Function(x[0], catalog_type_from_type(x[2]))
# parse the signature
signature = x[3]
splits = signature.split('>')
return_type = LogicalType(splits[1])
parameters = [LogicalType(param) for param in splits[0][1:-1].split(',')]
extension_function = ExtensionFunctionOverload(x[1], function.name, function.type, parameters, return_type)
if function not in output:
output[function] = []
output[function].append(extension_function)
return output
class ExtensionFunction(NamedTuple):
extension: str
name: str
type: CatalogType
@staticmethod
def create_map(input: List[Tuple[str, str, str]]) -> Dict[Function, "ExtensionFunction"]:
output: Dict[Function, "ExtensionFunction"] = {}
for x in input:
key = Function(x[0], catalog_type_from_type(x[2]))
output[key] = ExtensionFunction(x[1], key.name, key.type)
return output
class ExtensionSetting(NamedTuple):
extension: str
name: str
@staticmethod
def create_map(input: List[Tuple[str, str]]) -> Dict[str, "ExtensionSetting"]:
output: Dict[str, "ExtensionSetting"] = {}
for x in input:
output[x[0]] = ExtensionSetting(x[1], x[0])
return output
class ExtensionSecretType(NamedTuple):
extension: str
name: str
@staticmethod
def create_map(input: List[Tuple[str, str]]) -> Dict[str, "ExtensionSecretType"]:
output: Dict[str, "ExtensionSecretType"] = {}
for x in input:
output[x[0]] = ExtensionSecretType(x[1], x[0])
return output
class ExtensionCopyFunction(NamedTuple):
extension: str
name: str
@staticmethod
def create_map(input: List[Tuple[str, str]]) -> Dict[str, "ExtensionCopyFunction"]:
output: Dict[str, "ExtensionCopyFunction"] = {}
for x in input:
output[x[0]] = ExtensionCopyFunction(x[1], x[0])
return output
class ExtensionType(NamedTuple):
extension: str
name: str
@staticmethod
def create_map(input: List[Tuple[str, str]]) -> Dict[str, "ExtensionType"]:
output: Dict[str, "ExtensionType"] = {}
for x in input:
output[x[0]] = ExtensionType(x[1], x[0])
return output
class ParsedEntries:
def __init__(self, file_path):
self.path = file_path
self.functions = {}
self.function_overloads = {}
self.settings = {}
self.secret_types = {}
self.types = {}
self.copy_functions = {}
file = open(file_path, 'r')
file_blob = file.read()
# Get the extension functions
ext_functions_file_blob = get_slice_of_file("EXTENSION_FUNCTIONS", file_blob)
res = parse_records(ext_functions_file_blob)
res = [(x[0], x[1], x[2]) for x in res]
self.functions = ExtensionFunction.create_map(res)
# Get the extension function overloads
ext_function_overloads_file_blob = get_slice_of_file("EXTENSION_FUNCTION_OVERLOADS", file_blob)
res = parse_records(ext_function_overloads_file_blob)
res = [(x[0], x[1], x[2], x[3]) for x in res]
self.function_overloads = ExtensionFunctionOverload.create_map(res)
# Get the extension settings
ext_settings_file_blob = get_slice_of_file("EXTENSION_SETTINGS", file_blob)
res = parse_records(ext_settings_file_blob)
res = [(x[0], x[1]) for x in res]
self.settings = ExtensionSetting.create_map(res)
# Get the extension secret types
ext_secret_types_file_blob = get_slice_of_file("EXTENSION_SECRET_TYPES", file_blob)
res = parse_records(ext_secret_types_file_blob)
res = [(x[0], x[1]) for x in res]
self.secret_types = ExtensionSecretType.create_map(res)
# Get the extension types
ext_copy_functions_blob = get_slice_of_file("EXTENSION_COPY_FUNCTIONS", file_blob)
res = parse_records(ext_copy_functions_blob)
res = [(x[0], x[1]) for x in res]
self.copy_functions = ExtensionCopyFunction.create_map(res)
# Get the extension types
ext_types_file_blob = get_slice_of_file("EXTENSION_TYPES", file_blob)
res = parse_records(ext_types_file_blob)
res = [(x[0], x[1]) for x in res]
self.types = ExtensionType.create_map(res)
def strip_unloaded_extensions(self, extensions: List[str], functions):
return [x for x in functions if x.extension not in extensions]
def filter_entries(self, extensions: List[str]):
self.functions = {k: v for k, v in self.functions.items() if v.extension not in extensions}
self.function_overloads = {
k: self.strip_unloaded_extensions(extensions, v)
for k, v in self.function_overloads.items()
if len(self.strip_unloaded_extensions(extensions, v)) > 0
}
self.copy_functions = {k: v for k, v in self.copy_functions.items() if v.extension not in extensions}
self.settings = {k: v for k, v in self.settings.items() if v.extension not in extensions}
self.secret_types = {k: v for k, v in self.secret_types.items() if v.extension not in extensions}
self.types = {k: v for k, v in self.types.items() if v.extension not in extensions}
def check_prerequisites():
if not os.path.isfile(DUCKDB_PATH):
print(f"{DUCKDB_PATH} not found")
print(
"please run 'GENERATE_EXTENSION_ENTRIES=1 BUILD_ALL_EXT=1 make release', you might have to manually add DONT_LINK to all extension_configs"
)
exit(1)
if len(args.extensions) == 0 and not os.path.isfile(EXTENSIONS_PATH):
print(f"{EXTENSIONS_PATH} not found and --extensions it not set")
print("Either:")
print(
"* run 'GENERATE_EXTENSION_ENTRIES=1 BUILD_ALL_EXT=1 make release', you might have to manually add DONT_LINK to all extension_configs"
)
print("* Specify a comma separated list of extensions using --extensions")
exit(1)
if not os.path.isdir(args.extension_repository):
print(f"provided --extension_repository '{args.extension_repository}' is not a valid directory")
exit(1)
# Parses the extension config files for which extension names there are to be expected
def get_extension_names() -> List[str]:
if len(args.extensions) > 0:
return args.extensions.split(',')
extension_names = []
with open(EXTENSIONS_PATH) as f:
# Skip the csv header
next(f)
for line in f:
extension_name = line.split(',')[0].rstrip()
if "jemalloc" in extension_name:
# We skip jemalloc as it doesn't produce a loadable extension but is in the config
continue
extension_names.append(extension_name)
return extension_names
def get_query(sql_query, load_query) -> list:
# Optionally perform a LOAD of an extension
# Then perform a SQL query, fetch the output
query = f'{DUCKDB_PATH} -json -unsigned -c "{load_query}{sql_query}" '
query_result = os.popen(query).read()
result = [x for x in query_result[1:-2].split("\n") if x != '']
return result
def transform_parameter(parameter) -> LogicalType:
parameter = parameter.upper()
if parameter.endswith('[]'):
return LogicalType(transform_parameter(parameter[0 : len(parameter) - 2]).type + '[]')
if parameter in parameter_type_map:
return LogicalType(parameter_type_map[parameter])
return LogicalType(parameter)
def transform_parameters(parameters) -> FunctionOverload:
parameters = parameters[1:-1].split(', ')
return tuple(transform_parameter(param) for param in parameters)
def get_functions(load="") -> (Set[Function], Dict[Function, List[FunctionOverload]]):
GET_FUNCTIONS_QUERY = """
select distinct
function_name,
function_type,
parameter_types,
return_type
from duckdb_functions()
ORDER BY function_name, function_type;
"""
# ['name_1,type_1', ..., 'name_n,type_n']
results = set(get_query(GET_FUNCTIONS_QUERY, load))
functions = set()
function_overloads = {}
for x in results:
if x[-1] == ',':
# Remove the trailing comma
x = x[:-1]
function_name, function_type, parameter_types, return_type = [
x.lower() if x else "null" for x in json.loads(x).values()
]
function_parameters = transform_parameters(parameter_types)
function_return = transform_parameter(return_type)
function = Function(function_name, catalog_type_from_string(function_type))
function_overload = FunctionOverload(
function_name, catalog_type_from_string(function_type), function_parameters, function_return
)
if function not in functions:
functions.add(function)
function_overloads[function] = [function_overload]
else:
function_overloads[function].append(function_overload)
return (functions, function_overloads)
def get_settings(load="") -> Set[str]:
GET_SETTINGS_QUERY = """
select distinct
name
from duckdb_settings();
"""
settings = set(get_query(GET_SETTINGS_QUERY, load))
res = set()
for x in settings:
if x[-1] == ',':
# Remove the trailing comma
x = x[:-1]
name = json.loads(x)['name']
res.add(name)
return res
def get_secret_types(load="") -> Set[str]:
GET_SECRET_TYPES_QUERY = """
select distinct
type
from duckdb_secret_types();
"""
secret_types = set(get_query(GET_SECRET_TYPES_QUERY, load))
res = set()
for x in secret_types:
if x[-1] == ',':
# Remove the trailing comma
x = x[:-1]
type = json.loads(x)['type']
res.add(type)
return res
class ExtensionData:
def __init__(self):
# Map of function -> ExtensionFunction
self.function_map: Dict[Function, ExtensionFunction] = {}
# Map of setting name -> ExtensionSetting
self.settings_map: Dict[str, ExtensionSetting] = {}
# Map of secret type name -> ExtensionSecretType
self.secret_types_map: Dict[str, ExtensionSecretType] = {}
# Map of function -> extension function overloads
self.function_overloads: Dict[Function, List[ExtensionFunctionOverload]] = {}
# All function overloads (also ones that will not be written to the file)
self.all_function_overloads: Dict[Function, List[ExtensionFunctionOverload]] = {}
self.base_settings: Set[str] = set()
self.base_secret_types: Set[str] = set()
self.base_functions: Set[Function] = set()
self.extension_settings: Dict[str, Set[str]] = {}
self.extension_secret_types: Dict[str, Set[str]] = {}
self.extension_functions: Dict[str, Set[Function]] = {}
self.added_extensions: Set[str] = set()
# Map of extension -> extension_path
self.extensions: Dict[str, str] = get_extension_path_map()
self.stored_functions: Dict[str, List[Function]] = {
'arrow': [Function("scan_arrow_ipc", CatalogType.TABLE), Function("to_arrow_ipc", CatalogType.TABLE)],
'spatial': [],
}
self.stored_settings: Dict[str, List[str]] = {'arrow': [], 'spatial': []}
self.stored_secret_types: Dict[str, List[str]] = {'arrow': [], 'spatial': []}
def set_base(self):
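# Snapshot the functions, settings, and secret types that exist before any extension is loaded;
# add_functions/add_settings/add_secret_types subtract this baseline to find what an extension adds.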
(functions, function_overloads) = get_functions()
self.base_functions: Set[Function] = functions
self.base_settings: Set[str] = get_settings()
self.base_secret_types: Set[str] = get_secret_types()
def add_entries(self, entries: ParsedEntries):
self.function_map.update(entries.functions)
self.function_overloads.update(entries.function_overloads)
self.settings_map.update(entries.settings)
self.secret_types_map.update(entries.secret_types)
def load_dependencies(self, extension_name: str) -> str:
if extension_name not in EXTENSION_DEPENDENCIES:
return ''
res = ''
dependencies = EXTENSION_DEPENDENCIES[extension_name]
for item in dependencies:
if item not in self.extensions:
print(f"Could not load extension '{extension_name}', dependency '{item}' is missing")
exit(1)
extension_path = self.extensions[item]
print(f"Load {item} at {extension_path}")
res += f"LOAD '{extension_path}';"
return res
def add_extension(self, extension_name: str):
if extension_name in EXTENSION_DEPENDENCIES:
for item in EXTENSION_DEPENDENCIES[extension_name]:
if item not in self.added_extensions:
self.add_extension(item)
if extension_name in self.extensions:
# Perform a LOAD and add the added settings/functions/secret_types
extension_path = self.extensions[extension_name]
print(f"Load {extension_name} at {extension_path}")
load = self.load_dependencies(extension_name)
load += f"LOAD '{extension_path}';"
(functions, function_overloads) = get_functions(load)
extension_functions = list(functions)
extension_settings = list(get_settings(load))
extension_secret_types = list(get_secret_types(load))
self.add_settings(extension_name, extension_settings)
self.add_secret_types(extension_name, extension_secret_types)
self.add_functions(extension_name, extension_functions, function_overloads)
elif extension_name in self.stored_functions or extension_name in self.stored_settings:
# Retrieve the list of settings/functions from our hardcoded list
extension_functions = self.stored_functions[extension_name]
extension_settings = self.stored_settings[extension_name]
extension_secret_types = self.stored_secret_types[extension_name]
print(f"Loading {extension_name} from stored functions: {extension_functions}")
self.add_settings(extension_name, extension_settings)
self.add_secret_types(extension_name, extension_secret_types)
self.add_functions(extension_name, extension_functions, {})
else:
error = f"""Missing extension {extension_name} and not found in stored_functions/stored_settings/stored_secret_types
Please double check if '{args.extension_repository}' is the right location to look for ./**/*.duckdb_extension files"""
print(error)
exit(1)
self.added_extensions.add(extension_name)
def add_settings(self, extension_name: str, settings_list: List[str]):
extension_name = extension_name.lower()
base_settings = set()
base_settings.update(self.base_settings)
if extension_name in EXTENSION_DEPENDENCIES:
dependencies = EXTENSION_DEPENDENCIES[extension_name]
for item in dependencies:
assert item in self.extension_settings
base_settings.update(self.extension_settings[item])
added_settings: Set[str] = set(settings_list) - base_settings
self.extension_settings[extension_name] = added_settings
settings_to_add: Dict[str, ExtensionSetting] = {}
for setting in added_settings:
setting_name = setting.lower()
settings_to_add[setting_name] = ExtensionSetting(extension_name, setting_name)
self.settings_map.update(settings_to_add)
def add_secret_types(self, extension_name: str, secret_types_list: List[str]):
extension_name = extension_name.lower()
base_secret_types = set()
base_secret_types.update(self.base_secret_types)
if extension_name in EXTENSION_DEPENDENCIES:
dependencies = EXTENSION_DEPENDENCIES[extension_name]
for item in dependencies:
assert item in self.extension_secret_types
base_secret_types.update(self.extension_secret_types[item])
added_secret_types: Set[str] = set(secret_types_list) - base_secret_types
self.extension_secret_types[extension_name] = added_secret_types
secret_types_to_add: Dict[str, ExtensionSecretType] = {}
for secret_type in added_secret_types:
secret_type_name = secret_type.lower()
secret_types_to_add[secret_type_name] = ExtensionSecretType(extension_name, secret_type_name)
self.secret_types_map.update(secret_types_to_add)
def get_extension_overloads(
self, extension_name: str, overloads: Dict[Function, List[FunctionOverload]]
) -> Dict[Function, List[ExtensionFunctionOverload]]:
result = {}
for function, function_overloads in overloads.items():
extension_overloads = []
for overload in function_overloads:
extension_overloads.append(
ExtensionFunctionOverload(
extension_name, overload.name, overload.type, overload.parameters, overload.return_type
)
)
result[function] = extension_overloads
return result
def add_functions(
self, extension_name: str, function_list: List[Function], overloads: Dict[Function, List[FunctionOverload]]
):
extension_name = extension_name.lower()
base_functions = set()
base_functions.update(self.base_functions)
if extension_name in EXTENSION_DEPENDENCIES:
dependencies = EXTENSION_DEPENDENCIES[extension_name]
for item in dependencies:
assert item in self.extension_functions
base_functions.update(self.extension_functions[item])
overloads = self.get_extension_overloads(extension_name, overloads)
added_functions: Set[Function] = set(function_list) - base_functions
self.extension_functions[extension_name] = added_functions
functions_to_add: Dict[Function, ExtensionFunction] = {}
for function in added_functions:
if function in self.function_overloads:
# function is in overload map - add overloads
self.function_overloads[function] += overloads[function]
elif function in self.function_map:
# function is in function map and we are trying to add it again
# this means the function is present in multiple extensions
# remove from function map, and add to overload map
self.function_overloads[function] = self.all_function_overloads[function] + overloads[function]
del self.function_map[function]
else:
functions_to_add[function] = ExtensionFunction(extension_name, function.name, function.type)
self.all_function_overloads.update(overloads)
self.function_map.update(functions_to_add)
def validate(self):
parsed_entries = ParsedEntries(HEADER_PATH)
if self.function_map != parsed_entries.functions:
print("Function map mismatches:")
print_map_diff(self.function_map, parsed_entries.functions)
exit(1)
if self.settings_map != parsed_entries.settings:
print("Settings map mismatches:")
print_map_diff(self.settings_map, parsed_entries.settings)
exit(1)
if self.secret_types_map != parsed_entries.secret_types:
print("SecretTypes map mismatches:")
print_map_diff(self.secret_types_map, parsed_entries.secret_types)
exit(1)
print("All entries found: ")
print(" > functions: " + str(len(parsed_entries.functions)))
print(" > settings: " + str(len(parsed_entries.settings)))
print(" > secret_types: " + str(len(parsed_entries.secret_types)))
def verify_export(self):
if len(self.function_map) == 0 or len(self.settings_map) == 0 or len(self.secret_types_map) == 0:
print(
"""
The provided configuration produced an empty function map or empty settings map or empty secret types map
This is likely caused by building DuckDB with extensions linked in
"""
)
exit(1)
def export_functions(self) -> str:
result = """
static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = {\n"""
sorted_function = sorted(self.function_map)
for func in sorted_function:
function: ExtensionFunction = self.function_map[func]
result += "\t{"
result += f'"{function.name}", "{function.extension}", {function.type.value}'
result += "},\n"
result += "}; // END_OF_EXTENSION_FUNCTIONS\n"
return result
def export_function_overloads(self) -> str:
result = """
static constexpr ExtensionFunctionOverloadEntry EXTENSION_FUNCTION_OVERLOADS[] = {\n"""
sorted_function = sorted(self.function_overloads)
for func in sorted_function:
overloads: List[ExtensionFunctionOverload] = sorted(self.function_overloads[func])
for overload in overloads:
result += "\t{"
result += f'"{overload.name}", "{overload.extension}", {overload.type.value}, "'
signature = "["
signature += ",".join([parameter.type for parameter in overload.parameters])
signature += "]>" + overload.return_type.type
result += signature
result += '"},\n'
result += "}; // END_OF_EXTENSION_FUNCTION_OVERLOADS\n"
return result
def export_settings(self) -> str:
result = """
static constexpr ExtensionEntry EXTENSION_SETTINGS[] = {\n"""
sorted_settings = sorted(self.settings_map)
for settings_name in sorted_settings:
setting: ExtensionSetting = self.settings_map[settings_name]
result += "\t{"
result += f'"{settings_name.lower()}", "{setting.extension}"'
result += "},\n"
result += "}; // END_OF_EXTENSION_SETTINGS\n"
return result
def export_secret_types(self) -> str:
result = """
static constexpr ExtensionEntry EXTENSION_SECRET_TYPES[] = {\n"""
sorted_secret_types = sorted(self.secret_types_map)
for secret_types_name in sorted_secret_types:
secret_type: ExtensionSecretType = self.secret_types_map[secret_types_name]
result += "\t{"
result += f'"{secret_types_name.lower()}", "{secret_type.extension}"'
result += "},\n"
result += "}; // END_OF_EXTENSION_SECRET_TYPES\n"
return result
# Get the slice of the file containing the var (assumes // END_OF_<varname> comment after var)
def get_slice_of_file(var_name, file_str):
begin = file_str.find(var_name)
end = file_str.find("END_OF_" + var_name)
return file_str[begin:end]
def print_map_diff(d1, d2):
s1 = sorted(set(d1.items()))
s2 = sorted(set(d2.items()))
diff1 = str(set(s1) - set(s2))
diff2 = str(set(s2) - set(s1))
print("Diff between maps: " + diff1 + "\n")
print("Diff between maps: " + diff2 + "\n")
def get_extension_path_map() -> Dict[str, str]:
extension_paths: Dict[str, str] = {}
# extension_repository = pathlib.Path('../build/release/repository')
extension_repository = args.extension_repository
for location in glob.iglob(extension_repository + '/**/*.duckdb_extension', recursive=True):
name, _ = os.path.splitext(os.path.basename(location))
print(f"Located extension: {name} in path: '{location}'")
extension_paths[name] = location
return extension_paths
def write_header(data: ExtensionData):
INCLUDE_HEADER = """//===----------------------------------------------------------------------===//
// DuckDB
//
// duckdb/main/extension_entries.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include \"duckdb/common/unordered_map.hpp\"
#include \"duckdb/common/enums/catalog_type.hpp\"
// NOTE: this file is generated by scripts/generate_extensions_function.py.
// Example usage to refresh one extension (replace "icu" with the desired extension):
// GENERATE_EXTENSION_ENTRIES=1 make debug
// python3 scripts/generate_extensions_function.py --extensions icu --shell build/debug/duckdb --extension_repository build/debug/repository
// Check out the check-load-install-extensions job in .github/workflows/LinuxRelease.yml for more details
namespace duckdb {
struct ExtensionEntry {
char name[48];
char extension[48];
};
struct ExtensionFunctionEntry {
char name[48];
char extension[48];
CatalogType type;
};
struct ExtensionFunctionOverloadEntry {
char name[48];
char extension[48];
CatalogType type;
char signature[96];
};
"""
INCLUDE_FOOTER = """
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_COPY_FUNCTIONS[] = {
{"parquet", "parquet"},
{"json", "json"},
{"avro", "avro"}
}; // END_OF_EXTENSION_COPY_FUNCTIONS
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_TYPES[] = {
{"json", "json"},
{"inet", "inet"},
{"geometry", "spatial"}
}; // END_OF_EXTENSION_TYPES
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_COLLATIONS[] = {
{"af", "icu"}, {"am", "icu"}, {"ar", "icu"}, {"ar_sa", "icu"}, {"as", "icu"}, {"az", "icu"},
{"be", "icu"}, {"bg", "icu"}, {"bn", "icu"}, {"bo", "icu"}, {"br", "icu"}, {"bs", "icu"},
{"ca", "icu"}, {"ceb", "icu"}, {"chr", "icu"}, {"cs", "icu"}, {"cy", "icu"}, {"da", "icu"},
{"de", "icu"}, {"de_at", "icu"}, {"dsb", "icu"}, {"dz", "icu"}, {"ee", "icu"}, {"el", "icu"},
{"en", "icu"}, {"en_us", "icu"}, {"eo", "icu"}, {"es", "icu"}, {"et", "icu"}, {"fa", "icu"},
{"fa_af", "icu"}, {"ff", "icu"}, {"fi", "icu"}, {"fil", "icu"}, {"fo", "icu"}, {"fr", "icu"},
{"fr_ca", "icu"}, {"fy", "icu"}, {"ga", "icu"}, {"gl", "icu"}, {"gu", "icu"}, {"ha", "icu"},
{"haw", "icu"}, {"he", "icu"}, {"he_il", "icu"}, {"hi", "icu"}, {"hr", "icu"}, {"hsb", "icu"},
{"hu", "icu"}, {"hy", "icu"}, {"id", "icu"}, {"id_id", "icu"}, {"ig", "icu"}, {"is", "icu"},
{"it", "icu"}, {"ja", "icu"}, {"ka", "icu"}, {"kk", "icu"}, {"kl", "icu"}, {"km", "icu"},
{"kn", "icu"}, {"ko", "icu"}, {"kok", "icu"}, {"ku", "icu"}, {"ky", "icu"}, {"lb", "icu"},
{"lkt", "icu"}, {"ln", "icu"}, {"lo", "icu"}, {"lt", "icu"}, {"lv", "icu"}, {"mk", "icu"},
{"ml", "icu"}, {"mn", "icu"}, {"mr", "icu"}, {"ms", "icu"}, {"mt", "icu"}, {"my", "icu"},
{"nb", "icu"}, {"nb_no", "icu"}, {"ne", "icu"}, {"nl", "icu"}, {"nn", "icu"}, {"om", "icu"},
{"or", "icu"}, {"pa", "icu"}, {"pa_in", "icu"}, {"pl", "icu"}, {"ps", "icu"}, {"pt", "icu"},
{"ro", "icu"}, {"ru", "icu"}, {"sa", "icu"}, {"se", "icu"}, {"si", "icu"}, {"sk", "icu"},
{"sl", "icu"}, {"smn", "icu"}, {"sq", "icu"}, {"sr", "icu"}, {"sr_ba", "icu"}, {"sr_me", "icu"},
{"sr_rs", "icu"}, {"sv", "icu"}, {"sw", "icu"}, {"ta", "icu"}, {"te", "icu"}, {"th", "icu"},
{"tk", "icu"}, {"to", "icu"}, {"tr", "icu"}, {"ug", "icu"}, {"uk", "icu"}, {"ur", "icu"},
{"uz", "icu"}, {"vi", "icu"}, {"wae", "icu"}, {"wo", "icu"}, {"xh", "icu"}, {"yi", "icu"},
{"yo", "icu"}, {"yue", "icu"}, {"yue_cn", "icu"}, {"zh", "icu"}, {"zh_cn", "icu"}, {"zh_hk", "icu"},
{"zh_mo", "icu"}, {"zh_sg", "icu"}, {"zh_tw", "icu"}, {"zu", "icu"}}; // END_OF_EXTENSION_COLLATIONS
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_FILE_PREFIXES[] = {
{"http://", "httpfs"}, {"https://", "httpfs"}, {"s3://", "httpfs"}, {"s3a://", "httpfs"}, {"s3n://", "httpfs"},
{"gcs://", "httpfs"}, {"gs://", "httpfs"}, {"r2://", "httpfs"}, {"azure://", "azure"}, {"az://", "azure"},
{"abfss://", "azure"}, {"hf://", "httpfs"}
}; // END_OF_EXTENSION_FILE_PREFIXES
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_FILE_POSTFIXES[] = {
{".parquet", "parquet"},
{".json", "json"},
{".jsonl", "json"},
{".ndjson", "json"},
{".shp", "spatial"},
{".gpkg", "spatial"},
{".fgb", "spatial"},
{".xlsx", "excel"},
{".avro", "avro"},
}; // END_OF_EXTENSION_FILE_POSTFIXES
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_FILE_CONTAINS[] = {
{".parquet?", "parquet"},
{".json?", "json"},
{".ndjson?", ".jsonl?"},
{".jsonl?", ".ndjson?"}
}; // EXTENSION_FILE_CONTAINS
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
// TODO: automate by passing through to script via duckdb
static constexpr ExtensionEntry EXTENSION_SECRET_PROVIDERS[] = {{"s3/config", "httpfs"},
{"gcs/config", "httpfs"},
{"r2/config", "httpfs"},
{"s3/credential_chain", "aws"},
{"gcs/credential_chain", "aws"},
{"r2/credential_chain", "aws"},
{"aws/credential_chain", "aws"},
{"azure/access_token", "azure"},
{"azure/config", "azure"},
{"azure/credential_chain", "azure"},
{"azure/service_principal", "azure"},
{"huggingface/config", "httfps"},
{"huggingface/credential_chain", "httpfs"},
{"bearer/config", "httpfs"},
{"mysql/config", "mysql_scanner"},
{"postgres/config", "postgres_scanner"}
}; // EXTENSION_SECRET_PROVIDERS
static constexpr const char *AUTOLOADABLE_EXTENSIONS[] = {
"avro",
"aws",
"azure",
"autocomplete",
"core_functions",
"delta",
"ducklake",
"encodings",
"excel",
"fts",
"httpfs",
"iceberg",
"inet",
"icu",
"json",
"motherduck",
"mysql_scanner",
"parquet",
"sqlite_scanner",
"sqlsmith",
"postgres_scanner",
"tpcds",
"tpch",
"uc_catalog",
"ui"
}; // END_OF_AUTOLOADABLE_EXTENSIONS
} // namespace duckdb"""
data.verify_export()
file = open(HEADER_PATH, 'w')
file.write(INCLUDE_HEADER)
exported_functions = data.export_functions()
file.write(exported_functions)
exported_overloads = data.export_function_overloads()
file.write(exported_overloads)
exported_settings = data.export_settings()
file.write(exported_settings)
exported_secret_types = data.export_secret_types()
file.write(exported_secret_types)
file.write(INCLUDE_FOOTER)
file.close()
# Extensions that can be autoloaded, but are not buildable by DuckDB CI
HARDCODED_EXTENSION_FUNCTIONS = ExtensionFunction.create_map(
[
("delta_scan", "delta", "CatalogType::TABLE_FUNCTION_ENTRY"),
]
)
def main():
check_prerequisites()
extension_names: List[str] = get_extension_names()
extension_data = ExtensionData()
# Collect the list of functions/settings without any extensions loaded
extension_data.set_base()
# TODO: add 'purge' option to ignore existing entries ??
parsed_entries = ParsedEntries(HEADER_PATH)
parsed_entries.filter_entries(extension_names)
# Add the entries we parsed from the HEADER_PATH
extension_data.add_entries(parsed_entries)
for extension_name in extension_names:
print(extension_name)
# For every extension, add the functions/settings added by the extension
extension_data.add_extension(extension_name)
# Add hardcoded extension entries
for key, value in HARDCODED_EXTENSION_FUNCTIONS.items():
extension_data.function_map[key] = value
if args.validate:
extension_data.validate()
return
write_header(extension_data)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,93 @@
# use flex to generate the scanner file for the parser
# the following version of flex is used:
# flex 2.5.35 Apple(flex-32)
import os
import subprocess
import re
from sys import platform
import sys
from python_helpers import open_utf8
flex_bin = 'flex'
pg_path = os.path.join('third_party', 'libpg_query')
namespace = 'duckdb_libpgquery'
for arg in sys.argv[1:]:
if arg.startswith("--flex="):
flex_bin = arg.replace("--flex=", "")
elif arg.startswith("--custom_dir_prefix"):
pg_path = arg.split("=")[1] + pg_path
elif arg.startswith("--namespace"):
namespace = arg.split("=")[1]
else:
raise Exception("Unrecognized argument: " + arg + ", expected --flex, --custom_dir_prefix, --namespace")
flex_file_path = os.path.join(pg_path, 'scan.l')
target_file = os.path.join(pg_path, 'src_backend_parser_scan.cpp')
proc = subprocess.Popen(
[flex_bin, '--nounistd', '-o', target_file, flex_file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
stdout = proc.stdout.read().decode('utf8')
stderr = proc.stderr.read().decode('utf8')
proc.wait()
if proc.returncode != 0 or len(stderr) > 0:
print("Flex failed")
print("stdout: ", stdout)
print("stderr: ", stderr)
exit(1)
with open_utf8(target_file, 'r') as f:
text = f.read()
# convert this from 'int' to 'yy_size_t' to avoid triggering a warning
text = text.replace('int yy_buf_size;\n', 'yy_size_t yy_buf_size;\n')
# add the libpg_query namespace
text = text.replace(
'''
#ifndef FLEXINT_H
#define FLEXINT_H
''',
'''
#ifndef FLEXINT_H
#define FLEXINT_H
namespace '''
+ namespace
+ ''' {
''',
)
text = text.replace('register ', '')
text = text + "\n} /* " + namespace + " */\n"
text = re.sub('(?:[(]void[)][ ]*)?fprintf', '//', text)
text = re.sub('exit[(]', 'throw std::runtime_error(msg); //', text)
text = re.sub(r'\n\s*if\s*[(]\s*!\s*yyin\s*[)]\s*\n\s*yyin\s*=\s*stdin;\s*\n', '\n', text)
text = re.sub(r'\n\s*if\s*[(]\s*!\s*yyout\s*[)]\s*\n\s*yyout\s*=\s*stdout;\s*\n', '\n', text)
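# flex's generated initializer points yyin/yyout at stdin/stdout; rewrite those defaults to null
# FILE pointers so the embedded scanner never touches the process's standard streams.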
file_null = 'NULL' if platform == 'linux' else '[(]FILE [*][)] 0'
text = re.sub(
rf'[#]ifdef\s*YY_STDINIT\n\s*yyin = stdin;\n\s*yyout = stdout;\n[#]else\n\s*yyin = {file_null};\n\s*yyout = {file_null};\n[#]endif',
' yyin = (FILE *) 0;\n yyout = (FILE *) 0;',
text,
)
if 'stdin;' in text:
print("STDIN not removed!")
# exit(1)
if 'stdout' in text:
print("STDOUT not removed!")
# exit(1)
if 'fprintf(' in text:
print("PRINTF not removed!")
# exit(1)
if 'exit(' in text:
print("EXIT not removed!")
# exit(1)
with open_utf8(target_file, 'w+') as f:
f.write(text)

View File

@@ -0,0 +1,259 @@
import os
import json
from pathlib import Path
function_groups = {
('src', 'include/duckdb', 'function'): ['scalar', 'aggregate'],
('extension', 'core_functions/include', 'core_functions'): ['scalar', 'aggregate'],
}
def get_header():
return '''//===----------------------------------------------------------------------===//
// DuckDB
//
// {HEADER}_functions.hpp
//
//
//===----------------------------------------------------------------------===//
// This file is automatically generated by scripts/generate_functions.py
// Do not edit this file manually, your changes will be overwritten
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb/function/function_set.hpp"
namespace duckdb {
'''
def get_footer():
return '''} // namespace duckdb
'''
def main():
function_type_set = {}
for (root, include_dir, group), function_types in sorted(function_groups.items()):
all_functions_group = []
group_dir = Path(group)
for function_type in function_types:
type_dir = Path(root).joinpath(group_dir.joinpath(function_type))
relative_function_paths = sorted(
[f'{group}/{function_type}/{f.name}' for f in type_dir.iterdir() if f.is_dir()]
)
for function_path in relative_function_paths:
if Path(normalize_path_separators(f'{root}/{function_path}/functions.json')).exists():
create_header_file(root, include_dir, function_path, all_functions_group, function_type_set)
create_function_list_file(root, group, all_functions_group)
def normalize_path_separators(x):
return os.path.sep.join(x.split('/'))
def legal_struct_name(name):
return name.isalnum()
def get_struct_name(function_name):
return function_name.replace('_', ' ').title().replace(' ', '') + 'Fun'
def get_parameter_line(variants):
if not all(
isinstance(variant['parameters'], list)
and all(isinstance(param, dict) for param in variant['parameters'])
and all('name' in param.keys() for param in variant['parameters'])
for variant in variants
):
raise ValueError(
f"invalid parameters for variants {variants}\nParameters should have format: \"parameters\": [{{\"name\": <param_name>, \"type\": <param_type>}}, ...]"
)
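# Every variant's parameter list is flattened into one string: parameters within a variant are joined
# with ',' and variants are separated by the \001 control character (\002 separates multiple examples below).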
return "\\001".join(
",".join(
param['name'] + "::" + param['type'] if ('type' in param) else param['name']
for param in variant['parameters']
)
for variant in variants
)
def get_description_line(variants):
return "\\001".join([variant['description'] for variant in variants])
def get_example_line(variants):
return "\\001".join([example_from_json(variant) for variant in variants])
def example_from_json(json_record):
if 'example' in json_record:
example_line = sanitize_string(json_record['example'])
elif 'examples' in json_record:
example_line = examples_to_line(json_record['examples'])
else:
example_line = ''
return example_line
def examples_to_line(example_list):
return "\\002".join([sanitize_string(example) for example in example_list])
def get_category_line(variants):
return "\\001".join([categories_from_json(variant) for variant in variants])
def categories_from_json(json_record):
if 'categories' in json_record:
category_line = ','.join([category.strip() for category in json_record['categories']])
else:
category_line = ''
return category_line
def sanitize_string(text):
return text.replace('\\', '\\\\').replace('"', '\\"')
def create_header_file(root, include_dir, path, all_function_list, function_type_set):
header_path = normalize_path_separators(f'{root}/{include_dir}/{path}_functions.hpp')
json_path = normalize_path_separators(f'{root}/{path}/functions.json')
with open(json_path, 'r') as f:
parsed_json = json.load(f)
new_text = get_header().replace('{HEADER}', path)
for entry in parsed_json:
function_text = ''
if 'struct' in entry:
struct_name = entry['struct']
else:
struct_name = get_struct_name(entry['name'])
if not legal_struct_name(struct_name):
print(f'Struct name {struct_name} is not a valid struct name!')
exit(1)
if struct_name in function_type_set:
raise Exception("Duplicate entry " + struct_name)
function_type_set[struct_name] = entry['type']
if entry['type'] == 'scalar_function':
function_text = 'static ScalarFunction GetFunction();'
all_function_list.append([entry['name'], f"DUCKDB_SCALAR_FUNCTION({struct_name})"])
elif entry['type'] == 'scalar_function_set':
function_text = 'static ScalarFunctionSet GetFunctions();'
all_function_list.append([entry['name'], f"DUCKDB_SCALAR_FUNCTION_SET({struct_name})"])
elif entry['type'] == 'aggregate_function':
function_text = 'static AggregateFunction GetFunction();'
all_function_list.append([entry['name'], f"DUCKDB_AGGREGATE_FUNCTION({struct_name})"])
elif entry['type'] == 'aggregate_function_set':
function_text = 'static AggregateFunctionSet GetFunctions();'
all_function_list.append([entry['name'], f"DUCKDB_AGGREGATE_FUNCTION_SET({struct_name})"])
else:
print("Unknown entry type " + entry['type'] + ' for entry ' + struct_name)
exit(1)
if 'variants' in entry:
parameter_line = get_parameter_line(entry['variants'])
description_line = get_description_line(entry['variants'])
example_line = get_example_line(entry['variants'])
category_line = get_category_line(entry['variants'])
else:
parameter_line = entry['parameters'].replace(' ', '') if 'parameters' in entry else ''
description_line = sanitize_string(entry['description'])
example_line = example_from_json(entry)
category_line = categories_from_json(entry)
if 'extra_functions' in entry:
for func_text in entry['extra_functions']:
function_text += '\n ' + func_text
new_text += (
'''struct {STRUCT} {
static constexpr const char *Name = "{NAME}";
static constexpr const char *Parameters = "{PARAMETERS}";
static constexpr const char *Description = "{DESCRIPTION}";
static constexpr const char *Example = "{EXAMPLE}";
static constexpr const char *Categories = "{CATEGORIES}";
{FUNCTION}
};
'''.replace(
'{STRUCT}', struct_name
)
.replace('{NAME}', entry['name'])
.replace('{PARAMETERS}', parameter_line)
.replace('{DESCRIPTION}', description_line)
.replace('{EXAMPLE}', example_line)
.replace('{CATEGORIES}', category_line)
.replace('{FUNCTION}', function_text)
)
alias_count = 1
if 'aliases' in entry:
for alias in entry['aliases']:
alias_struct_name = get_struct_name(alias)
if not legal_struct_name(alias_struct_name):
alias_struct_name = struct_name + 'Alias'
if alias_count > 1:
alias_struct_name += str(alias_count)
alias_count += 1
aliased_type = entry['type']
if aliased_type == 'scalar_function':
all_function_list.append([alias, f"DUCKDB_SCALAR_FUNCTION_ALIAS({alias_struct_name})"])
elif aliased_type == 'scalar_function_set':
all_function_list.append([alias, f"DUCKDB_SCALAR_FUNCTION_SET_ALIAS({alias_struct_name})"])
elif aliased_type == 'aggregate_function':
all_function_list.append([alias, f"DUCKDB_AGGREGATE_FUNCTION_ALIAS({alias_struct_name})"])
elif aliased_type == 'aggregate_function_set':
all_function_list.append([alias, f"DUCKDB_AGGREGATE_FUNCTION_SET_ALIAS({alias_struct_name})"])
else:
print("Unknown entry type " + aliased_type + ' for entry ' + struct_name)
exit(1)
function_type_set[alias_struct_name] = aliased_type
new_text += (
'''struct {STRUCT} {
using ALIAS = {ALIAS};
static constexpr const char *Name = "{NAME}";
};
'''.replace(
'{STRUCT}', alias_struct_name
)
.replace('{NAME}', alias)
.replace('{ALIAS}', struct_name)
)
new_text += get_footer()
with open(header_path, 'w+') as f:
f.write(new_text)
def create_function_list_file(root, group, all_function_list):
function_list_file = normalize_path_separators(f'{root}/{group}/function_list.cpp')
with open(function_list_file, 'r') as f:
text = f.read()
static_function = f'static const StaticFunctionDefinition {group}[]' ' = {'
pos = text.find(static_function)
header = text[:pos]
footer_lines = text[pos:].split('\n')
footer = ''
for i in range(len(footer_lines)):
if len(footer_lines[i]) == 0:
footer = '\n'.join(footer_lines[i:])
break
new_text = header
new_text += static_function + '\n'
all_function_list = sorted(all_function_list, key=lambda x: x[0])
for entry in all_function_list:
new_text += '\t' + entry[1] + ',\n'
new_text += '\tFINAL_FUNCTION\n};\n'
new_text += footer
with open(function_list_file, 'w+') as f:
f.write(new_text)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,309 @@
# use bison to generate the parser files
# the following version of bison is used:
# bison (GNU Bison) 2.3
import os
import subprocess
import re
import sys
from python_helpers import open_utf8
bison_location = "bison"
base_dir = 'third_party/libpg_query/grammar'
pg_dir = 'third_party/libpg_query'
namespace = 'duckdb_libpgquery'
counterexamples = False
run_update = False
verbose = False
for arg in sys.argv[1:]:
if arg.startswith("--bison="):
bison_location = arg.replace("--bison=", "")
elif arg.startswith("--counterexamples"):
counterexamples = True
elif arg.startswith("--update"):
run_update = True
# allow a prefix to the source and target directories
elif arg.startswith("--custom_dir_prefix"):
base_dir = arg.split("=")[1] + base_dir
pg_dir = arg.split("=")[1] + pg_dir
elif arg.startswith("--namespace"):
namespace = arg.split("=")[1]
elif arg.startswith("--verbose"):
verbose = True
else:
raise Exception(
"Unrecognized argument: "
+ arg
+ ", expected --counterexamples, --bison=/loc/to/bison, --custom_dir_prefix, --namespace, --verbose"
)
template_file = os.path.join(base_dir, 'grammar.y')
target_file = os.path.join(base_dir, 'grammar.y.tmp')
header_file = os.path.join(base_dir, 'grammar.hpp')
source_file = os.path.join(base_dir, 'grammar.cpp')
type_dir = os.path.join(base_dir, 'types')
rule_dir = os.path.join(base_dir, 'statements')
result_source = os.path.join(base_dir, 'grammar_out.cpp')
result_header = os.path.join(base_dir, 'grammar_out.hpp')
target_source_loc = os.path.join(pg_dir, 'src_backend_parser_gram.cpp')
target_header_loc = os.path.join(pg_dir, 'include/parser/gram.hpp')
kwlist_header = os.path.join(pg_dir, 'include/parser/kwlist.hpp')
# parse the keyword lists
def read_list_from_file(fname):
with open_utf8(fname, 'r') as f:
return [x.strip() for x in f.read().split('\n') if len(x.strip()) > 0]
kwdir = os.path.join(base_dir, 'keywords')
unreserved_keywords = read_list_from_file(os.path.join(kwdir, 'unreserved_keywords.list'))
colname_keywords = read_list_from_file(os.path.join(kwdir, 'column_name_keywords.list'))
func_name_keywords = read_list_from_file(os.path.join(kwdir, 'func_name_keywords.list'))
type_name_keywords = read_list_from_file(os.path.join(kwdir, 'type_name_keywords.list'))
reserved_keywords = read_list_from_file(os.path.join(kwdir, 'reserved_keywords.list'))
def strip_p(x):
if x.endswith("_P"):
return x[:-2]
else:
return x
unreserved_keywords.sort(key=lambda x: strip_p(x))
colname_keywords.sort(key=lambda x: strip_p(x))
func_name_keywords.sort(key=lambda x: strip_p(x))
type_name_keywords.sort(key=lambda x: strip_p(x))
reserved_keywords.sort(key=lambda x: strip_p(x))
statements = read_list_from_file(os.path.join(base_dir, 'statements.list'))
statements.sort()
if len(statements) == 0:
print("Need at least one statement")
exit(1)
# verify there are no duplicate keywords and create big sorted list of keywords
kwdict = {}
for kw in unreserved_keywords:
kwdict[kw] = 'UNRESERVED_KEYWORD'
for kw in colname_keywords:
kwdict[kw] = 'COL_NAME_KEYWORD'
for kw in func_name_keywords:
kwdict[kw] = 'TYPE_FUNC_NAME_KEYWORD'
for kw in type_name_keywords:
kwdict[kw] = 'TYPE_FUNC_NAME_KEYWORD'
for kw in reserved_keywords:
kwdict[kw] = 'RESERVED_KEYWORD'
kwlist = [(x, kwdict[x]) for x in kwdict.keys()]
kwlist.sort(key=lambda x: strip_p(x[0]))
# now generate kwlist.h
# PG_KEYWORD("abort", ABORT_P, UNRESERVED_KEYWORD)
kwtext = (
"""
namespace """
+ namespace
+ """ {
#define PG_KEYWORD(a,b,c) {a,b,c},
const PGScanKeyword ScanKeywords[] = {
"""
)
for tpl in kwlist:
kwtext += 'PG_KEYWORD("%s", %s, %s)\n' % (strip_p(tpl[0]).lower(), tpl[0], tpl[1])
kwtext += (
"""
};
const int NumScanKeywords = lengthof(ScanKeywords);
} // namespace """
+ namespace
+ """
"""
)
with open_utf8(kwlist_header, 'w+') as f:
f.write(kwtext)
# generate the final main.y.tmp file
# first read the template file
with open_utf8(template_file, 'r') as f:
text = f.read()
# now perform a series of replacements in the file to construct the final yacc file
def get_file_contents(fpath, add_line_numbers=False):
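# When requested, prefix the fragment with a '#line 1 "<path>"' directive so that compiler and bison
# diagnostics in the generated parser point back at the original grammar fragment.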
with open_utf8(fpath, 'r') as f:
result = f.read()
if add_line_numbers:
return '#line 1 "%s"\n' % (fpath,) + result
else:
return result
# grammar.hpp
text = text.replace("{{{ GRAMMAR_HEADER }}}", get_file_contents(header_file, True))
# grammar.cpp
text = text.replace("{{{ GRAMMAR_SOURCE }}}", get_file_contents(source_file, True))
# keyword list
kw_token_list = "%token <keyword> " + " ".join([x[0] for x in kwlist])
text = text.replace("{{{ KEYWORDS }}}", kw_token_list)
# statements
stmt_list = "stmt: " + "\n\t| ".join(statements) + "\n\t| /*EMPTY*/\n\t{ $$ = NULL; }\n"
text = text.replace("{{{ STATEMENTS }}}", stmt_list)
# keywords
# keywords can EITHER be reserved, unreserved, or some combination of (col_name, type_name, func_name)
# that means duplicates are ONLY allowed between (col_name, type_name and func_name)
# having a keyword be both reserved and unreserved is an error
# as is having a keyword both reserved and col_name, for example
# verify that this is the case
reserved_dict = {}
unreserved_dict = {}
other_dict = {}
for r in reserved_keywords:
if r in reserved_dict:
print("Duplicate keyword " + r + " in reserved keywords")
exit(1)
reserved_dict[r] = True
for ur in unreserved_keywords:
if ur in unreserved_dict:
print("Duplicate keyword " + ur + " in unreserved keywords")
exit(1)
if ur in reserved_dict:
print("Keyword " + ur + " is marked as both unreserved and reserved")
exit(1)
unreserved_dict[ur] = True
def add_to_other_keywords(kw, list_name):
global unreserved_dict
global reserved_dict
global other_dict
if kw in unreserved_dict:
print("Keyword " + kw + " is marked as both unreserved and " + list_name)
exit(1)
if kw in reserved_dict:
print("Keyword " + kw + " is marked as both reserved and " + list_name)
exit(1)
other_dict[kw] = True
for cr in colname_keywords:
add_to_other_keywords(cr, "colname")
type_func_name_dict = {}
for tr in type_name_keywords:
add_to_other_keywords(tr, "typename")
type_func_name_dict[tr] = True
for fr in func_name_keywords:
add_to_other_keywords(fr, "funcname")
type_func_name_dict[fr] = True
type_func_name_keywords = list(type_func_name_dict.keys())
type_func_name_keywords.sort()
all_keywords = list(reserved_dict.keys()) + list(unreserved_dict.keys()) + list(other_dict.keys())
all_keywords.sort()
other_keyword = list(other_dict.keys())
other_keyword.sort()
kw_definitions = "unreserved_keyword: " + " | ".join(unreserved_keywords) + "\n"
kw_definitions += "col_name_keyword: " + " | ".join(colname_keywords) + "\n"
kw_definitions += "func_name_keyword: " + " | ".join(func_name_keywords) + "\n"
kw_definitions += "type_name_keyword: " + " | ".join(type_name_keywords) + "\n"
kw_definitions += "other_keyword: " + " | ".join(other_keyword) + "\n"
kw_definitions += "type_func_name_keyword: " + " | ".join(type_func_name_keywords) + "\n"
kw_definitions += "reserved_keyword: " + " | ".join(reserved_keywords) + "\n"
text = text.replace("{{{ KEYWORD_DEFINITIONS }}}", kw_definitions)
# types
def concat_dir(dname, extension, add_line_numbers=False):
result = ""
for fname in os.listdir(dname):
fpath = os.path.join(dname, fname)
if os.path.isdir(fpath):
result += concat_dir(fpath, extension)
else:
if not fname.endswith(extension):
continue
result += get_file_contents(fpath, add_line_numbers)
return result
type_definitions = concat_dir(type_dir, ".yh")
# add statement types as well
for stmt in statements:
type_definitions += "%type <node> " + stmt + "\n"
text = text.replace("{{{ TYPES }}}", type_definitions)
# grammar rules
grammar_rules = concat_dir(rule_dir, ".y", True)
text = text.replace("{{{ GRAMMAR RULES }}}", grammar_rules)
# finally write the yacc file into the target file
with open_utf8(target_file, 'w+') as f:
f.write(text)
# generate the bison
cmd = [bison_location]
if counterexamples:
print("Attempting to print counterexamples (-Wcounterexamples)")
cmd += ["-Wcounterexamples"]
if run_update:
cmd += ["--update"]
if verbose:
cmd += ["--verbose"]
cmd += ["-o", result_source, "-d", target_file]
print(' '.join(cmd))
proc = subprocess.Popen(cmd, stderr=subprocess.PIPE)
res = proc.wait(timeout=10) # ensure CI does not hang as was seen when running with Bison 3.x release.
if res != 0:
text = proc.stderr.read().decode('utf8')
print(text)
if 'shift/reduce' in text and not counterexamples:
print("---------------------------------------------------------------------")
print("In case of shift/reduce conflicts, try re-running with --counterexamples")
print("Note: this requires a more recent version of Bison (e.g. version 3.8)")
print("On a Macbook you can obtain this using \"brew install bison\"")
if counterexamples and 'time limit exceeded' in text:
print("---------------------------------------------------------------------")
print(
"The counterexamples time limit was exceeded. This likely means that no useful counterexample was generated."
)
print("")
print("The counterexamples time limit can be increased by setting the TIME_LIMIT environment variable, e.g.:")
print("export TIME_LIMIT=100")
exit(1)
os.rename(result_source, target_source_loc)
os.rename(result_header, target_header_loc)
with open_utf8(target_source_loc, 'r') as f:
text = f.read()
text = text.replace('#include "grammar_out.hpp"', '#include "include/parser/gram.hpp"')
text = text.replace('yynerrs = 0;', 'yynerrs = 0; (void)yynerrs;')
with open_utf8(target_source_loc, 'w+') as f:
f.write(text)

View File

@@ -0,0 +1,399 @@
# Script that takes src/include/duckdb/common/enums/optimizer_type.hpp, extracts the optimizer types
# and adds them to the metrics types.
# Then it creates a new file src/include/duckdb/common/enums/metric_type.hpp with the new metrics types as enums.
# and generates both test/sql/pragma/profiling/test_default_profiling_settings.test
# and test/sql/pragma/profiling/test_custom_profiling_optimizer.test
import re
import os
os.chdir(os.path.dirname(__file__))
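# Run from the scripts/ directory so the relative '..' paths below resolve regardless of the caller's working directory.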
metrics_header_file = os.path.join("..", "src", "include", "duckdb", "common", "enums", "metric_type.hpp")
metrics_cpp_file = os.path.join("..", "src", "common", "enums", "metric_type.cpp")
optimizer_file = os.path.join("..", "src", "include", "duckdb", "common", "enums", "optimizer_type.hpp")
metrics = [
"ATTACH_LOAD_STORAGE_LATENCY",
"ATTACH_REPLAY_WAL_LATENCY",
"BLOCKED_THREAD_TIME",
"CHECKPOINT_LATENCY",
"CPU_TIME",
"CUMULATIVE_CARDINALITY",
"CUMULATIVE_ROWS_SCANNED",
"EXTRA_INFO",
"LATENCY",
"OPERATOR_CARDINALITY",
"OPERATOR_NAME",
"OPERATOR_ROWS_SCANNED",
"OPERATOR_TIMING",
"OPERATOR_TYPE",
"QUERY_NAME",
"RESULT_SET_SIZE",
"ROWS_RETURNED",
"SYSTEM_PEAK_BUFFER_MEMORY",
"SYSTEM_PEAK_TEMP_DIR_SIZE",
"TOTAL_BYTES_READ",
"TOTAL_BYTES_WRITTEN",
"WAITING_TO_ATTACH_LATENCY",
]
phase_timing_metrics = [
"ALL_OPTIMIZERS",
"CUMULATIVE_OPTIMIZER_TIMING",
"PHYSICAL_PLANNER",
"PHYSICAL_PLANNER_COLUMN_BINDING",
"PHYSICAL_PLANNER_CREATE_PLAN",
"PHYSICAL_PLANNER_RESOLVE_TYPES",
"PLANNER",
"PLANNER_BINDING",
]
query_global_metrics = [
"ATTACH_LOAD_STORAGE_LATENCY",
"ATTACH_REPLAY_WAL_LATENCY",
"BLOCKED_THREAD_TIME",
"CHECKPOINT_LATENCY",
"SYSTEM_PEAK_BUFFER_MEMORY",
"SYSTEM_PEAK_TEMP_DIR_SIZE",
"WAITING_TO_ATTACH_LATENCY",
]
optimizer_types = []
# Regular expression to match the enum values
enum_pattern = r'\s*([A-Z_]+)\s*=\s*\d+,?|\s*([A-Z_]+),?'
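# The pattern accepts enumerator lines both with and without an explicit value,
# e.g. "JOIN_ORDER," as well as "JOIN_ORDER = 3," (the assigned value itself is ignored).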
inside_enum = False
# open the optimizer file and extract the optimizer types
with open(optimizer_file, "r") as f:
for line in f:
line = line.strip()
if line.startswith("enum class OptimizerType"):
inside_enum = True
continue
if inside_enum and line.startswith("};"):
break
if inside_enum:
match = re.match(enum_pattern, line)
if match:
optimizer_type = match[1] if match[1] else match[2]
if optimizer_type == "INVALID":
continue
optimizer_types.append(optimizer_type)
header = """//-------------------------------------------------------------------------
// DuckDB
//
//
// duckdb/common/enums/metric_type.hpp
//
// This file is automatically generated by scripts/generate_metric_enums.py
// Do not edit this file manually, your changes will be overwritten
//-------------------------------------------------------------------------\n
"""
typedefs = """struct MetricsTypeHashFunction {
uint64_t operator()(const MetricsType &index) const {
return std::hash<uint8_t>()(static_cast<uint8_t>(index));
}
};
typedef unordered_set<MetricsType, MetricsTypeHashFunction> profiler_settings_t;
typedef unordered_map<MetricsType, Value, MetricsTypeHashFunction> profiler_metrics_t;
"""
get_optimizer_metric_fun = 'GetOptimizerMetrics()'
get_phase_timing_metric_fun = 'GetPhaseTimingMetrics()'
get_optimizer_metric_by_type_fun = 'GetOptimizerMetricByType(OptimizerType type)'
get_optimizer_type_by_metric_fun = 'GetOptimizerTypeByMetric(MetricsType type)'
is_optimizer_metric_fun = 'IsOptimizerMetric(MetricsType type)'
is_phase_timing_metric_fun = 'IsPhaseTimingMetric(MetricsType type)'
is_query_global_metric_fun = 'IsQueryGlobalMetric(MetricsType type)'
metrics_class = 'MetricsUtils'
# Write the metric type header file
with open(metrics_header_file, "w") as f:
f.write(header)
f.write('#pragma once\n\n')
f.write('#include "duckdb/common/types/value.hpp"\n')
f.write('#include "duckdb/common/unordered_set.hpp"\n')
f.write('#include "duckdb/common/unordered_map.hpp"\n')
f.write('#include "duckdb/common/constants.hpp"\n')
f.write('#include "duckdb/common/enum_util.hpp"\n')
f.write('#include "duckdb/common/enums/optimizer_type.hpp"\n\n')
f.write("namespace duckdb {\n\n")
f.write("enum class MetricsType : uint8_t {\n")
for metric in metrics:
f.write(f" {metric},\n")
for metric in phase_timing_metrics:
f.write(f" {metric},\n")
for metric in optimizer_types:
f.write(f" OPTIMIZER_{metric},\n")
f.write("};\n\n")
f.write(typedefs)
f.write('class MetricsUtils {\n')
f.write('public:\n')
f.write(f' static profiler_settings_t {get_optimizer_metric_fun};\n')
f.write(f' static profiler_settings_t {get_phase_timing_metric_fun};\n\n')
f.write(f' static MetricsType {get_optimizer_metric_by_type_fun};\n')
f.write(f' static OptimizerType {get_optimizer_type_by_metric_fun};\n\n')
f.write(f' static bool {is_optimizer_metric_fun};\n')
f.write(f' static bool {is_phase_timing_metric_fun};\n')
f.write(f' static bool {is_query_global_metric_fun};\n')
f.write('};\n\n')
f.write("} // namespace duckdb\n")
# Write the metric_type.cpp file
with open(metrics_cpp_file, "w") as f:
f.write(header)
f.write('#include "duckdb/common/enums/metric_type.hpp"\n')
f.write("namespace duckdb {\n\n")
f.write(f'profiler_settings_t {metrics_class}::{get_optimizer_metric_fun} {{\n')
f.write(f" return {{\n")
for metric in optimizer_types:
f.write(f" MetricsType::OPTIMIZER_{metric},\n")
f.write(" };\n")
f.write("}\n\n")
f.write(f'profiler_settings_t {metrics_class}::{get_phase_timing_metric_fun} {{\n')
f.write(f" return {{\n")
for metric in phase_timing_metrics:
f.write(f" MetricsType::{metric},\n")
f.write(" };\n")
f.write("}\n\n")
f.write(f'MetricsType {metrics_class}::{get_optimizer_metric_by_type_fun} {{\n')
f.write(' switch(type) {\n')
for metric in optimizer_types:
f.write(f" case OptimizerType::{metric}:\n")
f.write(f" return MetricsType::OPTIMIZER_{metric};\n")
f.write(' default:\n')
f.write(
' throw InternalException("OptimizerType %s cannot be converted to a MetricsType", '
'EnumUtil::ToString(type));\n'
)
f.write(' };\n')
f.write('}\n\n')
f.write(f'OptimizerType {metrics_class}::{get_optimizer_type_by_metric_fun} {{\n')
f.write(' switch(type) {\n')
for metric in optimizer_types:
f.write(f" case MetricsType::OPTIMIZER_{metric}:\n")
f.write(f" return OptimizerType::{metric};\n")
f.write(' default:\n')
f.write(' return OptimizerType::INVALID;\n')
f.write(' };\n')
f.write('}\n\n')
f.write(f'bool {metrics_class}::{is_optimizer_metric_fun} {{\n')
f.write(' switch(type) {\n')
for metric in optimizer_types:
f.write(f" case MetricsType::OPTIMIZER_{metric}:\n")
f.write(' return true;\n')
f.write(' default:\n')
f.write(' return false;\n')
f.write(' };\n')
f.write('}\n\n')
f.write(f'bool {metrics_class}::{is_phase_timing_metric_fun} {{\n')
f.write(' switch(type) {\n')
for metric in phase_timing_metrics:
f.write(f" case MetricsType::{metric}:\n")
f.write(' return true;\n')
f.write(' default:\n')
f.write(' return false;\n')
f.write(' };\n')
f.write('}\n\n')
f.write(f'bool {metrics_class}::{is_query_global_metric_fun} {{\n')
f.write(' switch(type) {\n')
for metric in query_global_metrics:
f.write(f" case MetricsType::{metric}:\n")
f.write(' return true;\n')
f.write(' default:\n')
f.write(' return false;\n')
f.write(' };\n')
f.write('}\n\n')
f.write("} // namespace duckdb\n")
# Generate the test files
test_names = ["test_default_profiling_settings", "test_custom_profiling_optimizer"]
test_descriptions = ["default", "custom optimizer"]
test_files = [os.path.join("..", "test", "sql", "pragma", "profiling", f"{name}.test") for name in test_names]
def write_statement(f, statement_type, statement):
f.write(f"statement {statement_type}\n")
f.write(statement + "\n\n")
def write_query(f, options, query):
f.write(f"query {options}\n")
f.write(query + "\n")
f.write("----\n")
def write_default_query(f):
query = "SELECT unnest(['Maia', 'Thijs', 'Mark', 'Hannes', 'Tom', 'Max', 'Carlo', 'Sam', 'Tania']) AS names ORDER BY random();"
write_statement(f, "ok", query)
write_statement(f, "ok", "PRAGMA disable_profiling;")
def write_get_custom_profiling_settings(f):
query = """
SELECT unnest(res) FROM (
SELECT current_setting('custom_profiling_settings') AS raw_setting,
raw_setting.trim('{}') AS setting,
string_split(setting, ', ') AS res
) ORDER BY ALL;
""".strip()
write_query(f, "I", query)
def write_custom_profiling_optimizer(f):
write_statement(f, "ok", "PRAGMA custom_profiling_settings='{\"ALL_OPTIMIZERS\": \"true\"}';")
write_default_query(f)
query = """
SELECT * FROM (
SELECT unnest(res) str FROM (
SELECT current_setting('custom_profiling_settings') as raw_setting,
raw_setting.trim('{}') AS setting,
string_split(setting, ', ') AS res
)
) WHERE '"true"' NOT in str
ORDER BY ALL \
""".strip()
write_query(f, "I", query)
f.write("\n")
write_statement(f, "ok", "PRAGMA custom_profiling_settings='{}'")
write_default_query(f)
write_get_custom_profiling_settings(f)
f.write("(empty)\n\n")
write_statement(f, "ok", "PRAGMA custom_profiling_settings='{\"OPTIMIZER_JOIN_ORDER\": \"true\"}'")
write_default_query(f)
write_get_custom_profiling_settings(f)
f.write("\"OPTIMIZER_JOIN_ORDER\": \"true\"\n\n")
write_statement(
f, "ok", "CREATE OR REPLACE TABLE metrics_output AS SELECT * FROM '__TEST_DIR__/profiling_output.json';"
)
query = """
SELECT
CASE WHEN optimizer_join_order > 0 THEN 'true'
ELSE 'false' END
FROM metrics_output;
""".strip()
write_query(f, "I", query)
f.write("true\n\n")
write_statement(f, "ok", "SET disabled_optimizers = 'JOIN_ORDER';")
write_statement(f, "ok", "PRAGMA custom_profiling_settings='{\"OPTIMIZER_JOIN_ORDER\": \"true\"}'")
write_default_query(f)
write_get_custom_profiling_settings(f)
f.write("(empty)\n\n")
write_statement(f, "ok", "PRAGMA custom_profiling_settings='{\"CUMULATIVE_OPTIMIZER_TIMING\": \"true\"}';")
write_default_query(f)
write_statement(
f, "ok", "CREATE OR REPLACE TABLE metrics_output AS SELECT * FROM '__TEST_DIR__/profiling_output.json';"
)
query = """
SELECT
CASE WHEN cumulative_optimizer_timing > 0 THEN 'true'
ELSE 'false' END
FROM metrics_output;
""".strip()
write_query(f, "I", query)
f.write("true\n\n")
f.write("# All phase timings must be collected when using detailed profiling mode.\n\n")
write_statement(f, "ok", "RESET custom_profiling_settings;")
write_statement(f, "ok", "SET profiling_mode = 'detailed';")
write_default_query(f)
query = """
SELECT * FROM (
SELECT unnest(res) str FROM (
SELECT current_setting('custom_profiling_settings') AS raw_setting,
raw_setting.trim('{}') AS setting,
string_split(setting, ', ') AS res
)
)
WHERE '"true"' NOT IN str
ORDER BY ALL
""".strip()
write_query(f, "I", query)
f.write("\n")
write_statement(f, "ok", "RESET custom_profiling_settings;")
write_statement(f, "ok", "SET profiling_mode = 'standard';")
# Create the test files
for test_file, name, description in zip(test_files, test_names, test_descriptions):
with open(test_file, "w") as f:
display_name = test_file.replace("../", "")
f.write(f"# name: {display_name}\n")
f.write(f"# description: Test {description} profiling settings.\n")
f.write("# group: [profiling]\n\n")
f.write("# This file is automatically generated by scripts/generate_metric_enums.py\n")
f.write("# Do not edit this file manually, your changes will be overwritten\n\n")
f.write("require json\n\n")
write_statement(f, "ok", "PRAGMA enable_verification;")
write_statement(f, "ok", "PRAGMA enable_profiling = 'json';")
write_statement(f, "ok", "PRAGMA profiling_output = '__TEST_DIR__/profiling_output.json';")
if name == "test_custom_profiling_optimizer":
write_custom_profiling_optimizer(f)
write_default_query(f)
write_get_custom_profiling_settings(f)
metrics.sort()
for metric in metrics:
f.write(f'"{metric}": "true"\n')
f.write("\n")
write_statement(
f, "ok", "CREATE OR REPLACE TABLE metrics_output AS SELECT * FROM '__TEST_DIR__/profiling_output.json';"
)
write_statement(f, "ok", "SELECT cpu_time, extra_info, rows_returned, latency FROM metrics_output;")

View File

@@ -0,0 +1,39 @@
# this script re-generates the binary file used for Test deserialized plans from file
# before running this script, increment the version number in src/planner/logical_operator.cpp and
# recompile (make debug)
# Note that the test is not linked unless you BUILD_TPCH=1
import os
import subprocess
from python_helpers import open_utf8
shell_proc = os.path.join('build', 'debug', 'test', 'unittest')
gen_binary_file = os.path.join('test', 'api', 'serialized_plans', 'serialized_plans.binary')
def try_remove_file(fname):
try:
os.remove(fname)
except:
pass
try_remove_file(gen_binary_file)
def run_test(test):
print(test)
env = os.environ.copy()
env["GEN_PLAN_STORAGE"] = "1"
res = subprocess.run([shell_proc, test], capture_output=True, env=env)
stdout = res.stdout.decode('utf8').strip()
stderr = res.stderr.decode('utf8').strip()
if res.returncode != 0:
print("Failed to create binary file!")
print("----STDOUT----")
print(stdout)
print("----STDERR----")
print(stderr)
run_test("Generate serialized plans file")

View File

@@ -0,0 +1,30 @@
#!/usr/bin/env bash
#Note: DONT run as root
set -e
DUCKDB_PATH=duckdb
if test -f build/release/duckdb; then
DUCKDB_PATH=build/release/duckdb
elif test -f build/reldebug/duckdb; then
DUCKDB_PATH=build/reldebug/duckdb
elif test -f build/debug/duckdb; then
DUCKDB_PATH=build/debug/duckdb
fi
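# Prefer a locally built shell (release, then reldebug, then debug); otherwise fall back to a 'duckdb' on the PATH.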
mkdir -p data/parquet-testing/presigned
generate_large_parquet_query=$(cat <<EOF
CALL DBGEN(sf=1);
COPY lineitem TO 'data/parquet-testing/presigned/presigned-url-lineitem.parquet' (FORMAT 'parquet');
EOF
)
$DUCKDB_PATH -c "$generate_large_parquet_query"
mkdir -p data/attach_test/
# Generate Storage Version
$DUCKDB_PATH data/attach_test/attach.db < test/sql/storage_version/generate_storage_version.sql
$DUCKDB_PATH data/attach_test/lineitem_sf1.db -c "CALL dbgen(sf=1)"

View File

@@ -0,0 +1,858 @@
import os
import json
import re
import argparse
from enum import Enum
from typing import Dict, Optional, Tuple, List
parser = argparse.ArgumentParser(description='Generate serialization code')
parser.add_argument('--source', type=str, help='Source directory')
parser.add_argument('--target', type=str, help='Target directory')
args = parser.parse_args()
class MemberVariableStatus(Enum):
# Both serialized and deserialized
EXISTING = 1
# Not serialized, but is deserialized
READ_ONLY = 2
# Not serialized, not deserialized
DELETED = 3
def get_file_list():
if args.source is None:
targets = [
{'source': 'src/include/duckdb/storage/serialization', 'target': 'src/storage/serialization'},
{'source': 'extension/parquet/include/', 'target': 'extension/parquet'},
{'source': 'extension/json/include/', 'target': 'extension/json'},
]
else:
targets = [
{'source': args.source, 'target': args.target},
]
file_list = []
for target in targets:
source_base = os.path.sep.join(target['source'].split('/'))
target_base = os.path.sep.join(target['target'].split('/'))
for fname in os.listdir(source_base):
if '.json' not in fname:
continue
if '_enums.json' in fname:
continue
file_list.append(
{
'source': os.path.join(source_base, fname),
'target': os.path.join(target_base, 'serialize_' + fname.replace('.json', '.cpp')),
}
)
return file_list
scripts_dir = os.path.dirname(os.path.abspath(__file__))
version_map_path = os.path.join(scripts_dir, '..', 'src', 'storage', 'version_map.json')
version_map_file = open(version_map_path)
version_map = json.load(version_map_file)
def verify_serialization_versions(version_map):
serialization = version_map['serialization']['values']
if list(serialization.keys())[-1] != 'latest':
print(f"The version map ({version_map_path}) for serialization versions must end in 'latest'!")
exit(1)
verify_serialization_versions(version_map)
def lookup_serialization_version(version: str):
if version.lower() == "latest":
print(
f"'latest' is not an allowed 'version' to use in serialization JSON files, please provide a duckdb version"
)
exit(1)
versions = version_map['serialization']['values']
if version not in versions:
from packaging.version import Version
current_version = Version(version)
# This version does not exist in the version map
# Which is allowed for unreleased versions, they will get mapped to 'latest' instead
last_registered_version = Version(list(versions.keys())[-2])
if current_version < last_registered_version:
# The version was lower than the last defined version, which is not allowed
print(
f"Specified version ({current_version}) could not be found in the version_map.json, and it is lower than the last defined version ({last_registered_version})!"
)
exit(1)
if hasattr(lookup_serialization_version, 'latest'):
# We have already mapped a version to 'latest', check that the versions match
latest_version = getattr(lookup_serialization_version, 'latest')
if current_version != latest_version:
print(
f"Found more than one version that is not present in the version_map.json!: Current: {current_version}, Latest: {latest_version}"
)
exit(1)
else:
setattr(lookup_serialization_version, 'latest', current_version)
return versions['latest']
return versions[version]
INCLUDE_FORMAT = '#include "{filename}"\n'
HEADER = '''//===----------------------------------------------------------------------===//
// This file is automatically generated by scripts/generate_serialization.py
// Do not edit this file manually, your changes will be overwritten
//===----------------------------------------------------------------------===//
{include_list}
namespace duckdb {{
'''
FOOTER = '''
} // namespace duckdb
'''
TEMPLATED_BASE_FORMAT = '''
template <typename {template_name}>'''
SERIALIZE_BASE_FORMAT = '''
void {class_name}::Serialize(Serializer &serializer) const {{
{members}}}
'''
SERIALIZE_ELEMENT_FORMAT = (
'\tserializer.WriteProperty<{property_type}>({property_id}, "{property_key}", {property_name}{property_default});\n'
)
BASE_SERIALIZE_FORMAT = '\t{base_class_name}::Serialize(serializer);\n'
POINTER_RETURN_FORMAT = '{pointer}<{class_name}>'
DESERIALIZE_BASE_FORMAT = '''
{deserialize_return} {class_name}::Deserialize(Deserializer &deserializer) {{
{members}
}}
'''
SWITCH_CODE_FORMAT = '''\tswitch ({switch_variable}) {{
{case_statements}\tdefault:
\t\tthrow SerializationException("Unsupported type for deserialization of {base_class}!");
\t}}
'''
SET_DESERIALIZE_PARAMETER_FORMAT = '\tdeserializer.Set<{property_type}>({property_name});\n'
UNSET_DESERIALIZE_PARAMETER_FORMAT = '\tdeserializer.Unset<{property_type}>();\n'
GET_DESERIALIZE_PARAMETER_FORMAT = 'deserializer.Get<{property_type}>()'
TRY_GET_DESERIALIZE_PARAMETER_FORMAT = 'deserializer.TryGet<{property_type}>()'
SWITCH_HEADER_FORMAT = '\tcase {enum_type}::{enum_value}:\n'
SWITCH_STATEMENT_FORMAT = (
SWITCH_HEADER_FORMAT
+ '''\t\tresult = {class_deserialize}::Deserialize(deserializer);
\t\tbreak;
'''
)
DESERIALIZE_ELEMENT_FORMAT = '\tauto {property_name} = deserializer.ReadProperty<{property_type}>({property_id}, "{property_key}"{property_default});\n'
DESERIALIZE_ELEMENT_BASE_FORMAT = '\tauto {property_name} = deserializer.ReadProperty<unique_ptr<{base_property}>>({property_id}, "{property_key}"{property_default});\n'
DESERIALIZE_ELEMENT_CLASS_FORMAT = '\tdeserializer.ReadProperty<{property_type}>({property_id}, "{property_key}", result{assignment}{property_name}{property_default});\n'
DESERIALIZE_ELEMENT_CLASS_BASE_FORMAT = '\tauto {property_name} = deserializer.ReadProperty<unique_ptr<{base_property}>>({property_id}, "{property_key}"{property_default});\n\tresult{assignment}{property_name} = unique_ptr_cast<{base_property}, {derived_property}>(std::move({property_name}));\n'
MOVE_LIST = [
'string',
'ParsedExpression*',
'CommonTableExpressionMap',
'LogicalType',
'ColumnDefinition',
'BaseStatistics',
'BoundLimitNode',
]
REFERENCE_LIST = ['ClientContext', 'bound_parameter_map_t', 'Catalog']
def is_container(type):
return '<' in type and 'CSVOption' not in type
def is_pointer(type):
return type.endswith('*') or type.startswith('shared_ptr<')
def is_zeroable(type):
return type in [
'bool',
'int8_t',
'int16_t',
'int32_t',
'int64_t',
'uint8_t',
'uint16_t',
'uint32_t',
'uint64_t',
'idx_t',
'size_t',
'int',
]
def requires_move(type):
return is_container(type) or is_pointer(type) or type in MOVE_LIST
def replace_pointer(type):
return re.sub('([a-zA-Z0-9]+)[*]', 'unique_ptr<\\1>', type)
def get_default_argument(default_value):
return f'{default_value}'.lower() if type(default_value) == bool else f'{default_value}'
def get_deserialize_element_template(
template,
property_name,
property_key,
property_id,
property_type,
has_default,
default_value,
status: MemberVariableStatus,
pointer_type,
):
if status == MemberVariableStatus.READ_ONLY and not has_default:
print("'read_only' status is not allowed without a default value")
exit(1)
# read_method = 'ReadProperty'
assignment = '.' if pointer_type == 'none' else '->'
default_argument = '' if default_value is None else f', {get_default_argument(default_value)}'
if status == MemberVariableStatus.DELETED:
template = template.replace(', result{assignment}{property_name}', '').replace(
'ReadProperty', 'ReadDeletedProperty'
)
elif has_default and default_value is None:
template = template.replace('ReadProperty', 'ReadPropertyWithDefault')
elif has_default and default_value is not None:
template = template.replace('ReadProperty', 'ReadPropertyWithExplicitDefault')
template = template.format(
property_name=property_name,
property_key=property_key,
property_id=str(property_id),
property_default=default_argument,
property_type=property_type,
assignment=assignment,
)
if status == MemberVariableStatus.DELETED:
template = template.replace(f'auto {property_name} = ', '')
return template
def get_deserialize_assignment(property_name, property_type, pointer_type):
assignment = '.' if pointer_type == 'none' else '->'
property = property_name.replace('.', '_')
if requires_move(property_type):
property = f'std::move({property})'
return f'\tresult{assignment}{property_name} = {property};\n'
def get_return_value(pointer_type, class_name):
if pointer_type == 'none':
return class_name
return POINTER_RETURN_FORMAT.format(pointer=pointer_type, class_name=class_name)
def generate_return(class_entry):
if class_entry.base is None or class_entry.constructor_method is not None:
return '\treturn result;'
else:
return '\treturn std::move(result);'
def parse_status(status: str):
if status == 'deleted':
return MemberVariableStatus.DELETED
if status == 'read_only':
return MemberVariableStatus.READ_ONLY
if status == 'existing':
return MemberVariableStatus.EXISTING
valid_options = ['deleted', 'read_only', 'existing']
valid_options_string = ", ".join(valid_options)
print(f"Invalid 'status' ('{status}') encountered, valid options are: {valid_options_string}")
exit(1)
# FIXME: python has __slots__ for this, so it's enforced by Python itself
# see: https://wiki.python.org/moin/UsingSlots
supported_member_entries = [
'id',
'name',
'type',
'property',
'serialize_property',
'deserialize_property',
'base',
'default',
'status',
'version',
]
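# A __slots__-based sketch of the idea in the FIXME above (not used here), where assigning any
# unknown attribute would raise AttributeError automatically:
#   class MemberVariable:
#       __slots__ = ('id', 'name', 'type', 'property', 'serialize_property', ...)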
def has_default_by_default(type):
if is_pointer(type):
return True
if is_container(type):
if 'IndexVector' in type:
return False
if 'CSVOption' in type:
return False
return True
if type == 'string':
return True
if is_zeroable(type):
return True
return False
class MemberVariable:
def __init__(self, entry):
self.id = entry['id']
self.name = entry['name']
self.type = entry['type']
self.base = None
self.has_default = False
self.default = None
self.status: MemberVariableStatus = MemberVariableStatus.EXISTING
self.version: str = 'v0.10.2'
if 'property' in entry:
self.serialize_property = entry['property']
self.deserialize_property = entry['property']
else:
self.serialize_property = self.name
self.deserialize_property = self.name
if 'version' in entry:
self.version = entry['version']
if 'serialize_property' in entry:
self.serialize_property = entry['serialize_property']
if 'deserialize_property' in entry:
self.deserialize_property = entry['deserialize_property']
if 'default' in entry:
self.has_default = True
self.default = entry['default']
if 'status' in entry:
self.status = parse_status(entry['status'])
if self.default is None:
# default default
self.has_default = has_default_by_default(self.type)
if 'base' in entry:
self.base = entry['base']
for key in entry.keys():
if key not in supported_member_entries:
print(
f"Unsupported key \"{key}\" in member variable, key should be in set {str(supported_member_entries)}"
)
supported_serialize_entries = [
'class',
'class_type',
'pointer_type',
'base',
'enum',
'constructor',
'constructor_method',
'custom_implementation',
'custom_switch_code',
'members',
'return_type',
'set_parameters',
'includes',
'finalize_deserialization',
]
class SerializableClass:
def __init__(self, entry):
self.name = entry['class']
self.is_base_class = 'class_type' in entry
self.base = None
self.base_object = None
self.enum_value = None
self.enum_entries = []
self.set_parameter_names = []
self.set_parameters = []
self.pointer_type = 'unique_ptr'
self.constructor: Optional[List[str]] = None
self.constructor_method = None
self.members: Optional[List[MemberVariable]] = None
self.custom_implementation = False
self.custom_switch_code = None
self.children: Dict[str, SerializableClass] = {}
self.return_type = self.name
self.return_class = self.name
self.finalize_deserialization = None
if 'finalize_deserialization' in entry:
self.finalize_deserialization = entry['finalize_deserialization']
if self.is_base_class:
self.enum_value = entry['class_type']
if 'pointer_type' in entry:
self.pointer_type = entry['pointer_type']
if 'base' in entry:
self.base = entry['base']
self.enum_entries = entry['enum']
if type(self.enum_entries) is str:
self.enum_entries = [self.enum_entries]
self.return_type = self.base
if 'constructor' in entry:
self.constructor = entry['constructor']
if not isinstance(self.constructor, list):
print(f"constructor for {self.name} must be a list, but is of type {str(type(self.constructor))}")
exit(1)
if 'constructor_method' in entry:
self.constructor_method = entry['constructor_method']
if self.constructor is not None:
print(
"Not allowed to mix 'constructor_method' and 'constructor', 'constructor_method' will implicitly receive all parameters"
)
exit(1)
if 'custom_implementation' in entry and entry['custom_implementation']:
self.custom_implementation = True
if 'custom_switch_code' in entry:
self.custom_switch_code = entry['custom_switch_code']
if 'members' in entry:
self.members = [MemberVariable(x) for x in entry['members']]
if 'return_type' in entry:
self.return_type = entry['return_type']
self.return_class = self.return_type
if 'set_parameters' in entry:
self.set_parameter_names = entry['set_parameters']
for set_parameter_name in self.set_parameter_names:
found = False
assert self.members is not None
for member in self.members:
if member.name == set_parameter_name:
self.set_parameters.append(member)
found = True
break
if not found:
raise Exception(f'Set parameter {set_parameter_name} not found in member list')
for key in entry.keys():
if key not in supported_serialize_entries:
print(
f"Unsupported key \"{key}\" in serializable class entry, key should be in set {str(supported_serialize_entries)}"
)
def inherit(self, base_class):
self.base_object = base_class
self.pointer_type = base_class.pointer_type
def get_deserialize_element(
self, entry: MemberVariable, *, base: Optional[str] = None, pointer_type: Optional[str] = None
):
property_name = entry.deserialize_property
property_id = entry.id
property_key = entry.name
property_type = replace_pointer(entry.type)
if not pointer_type:
pointer_type = self.pointer_type
property_name = property_name.replace('.', '_')
template = DESERIALIZE_ELEMENT_FORMAT
if base:
template = DESERIALIZE_ELEMENT_BASE_FORMAT.replace('{base_property}', base.replace('*', ''))
return get_deserialize_element_template(
template,
property_name,
property_key,
property_id,
property_type,
entry.has_default,
entry.default,
entry.status,
pointer_type,
)
def get_serialize_element(self, entry: MemberVariable):
property_name = entry.serialize_property
property_id = entry.id
property_key = entry.name
property_type = replace_pointer(entry.type)
default_value = entry.default
assignment = '.' if self.pointer_type == 'none' else '->'
default_argument = '' if default_value is None else f', {get_default_argument(default_value)}'
storage_version = lookup_serialization_version(entry.version)
conditional_serialization = storage_version != 1
template = SERIALIZE_ELEMENT_FORMAT
if entry.status != MemberVariableStatus.EXISTING and not conditional_serialization:
template = "\t/* [Deleted] ({property_type}) \"{property_name}\" */\n"
elif entry.has_default:
template = template.replace('WriteProperty', 'WritePropertyWithDefault')
serialization_code = template.format(
property_name=property_name,
property_type=property_type,
property_id=str(property_id),
property_key=property_key,
property_default=default_argument,
assignment=assignment,
)
if conditional_serialization:
code = []
if entry.status != MemberVariableStatus.EXISTING:
# conditional delete
code.append(f'\tif (!serializer.ShouldSerialize({storage_version})) {{')
else:
# conditional serialization
code.append(f'\tif (serializer.ShouldSerialize({storage_version})) {{')
code.append('\t' + serialization_code)
result = '\n'.join(code) + '\t}\n'
return result
return serialization_code
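# For a hypothetical member {"id": 200, "name": "children", "type": "ParsedExpression*"} whose
# version maps to the baseline (so no ShouldSerialize guard is added), this emits roughly:
#   serializer.WritePropertyWithDefault<unique_ptr<ParsedExpression>>(200, "children", children);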
def generate_constructor(self, constructor_parameters: List[str]):
parameters = ", ".join(constructor_parameters)
if self.constructor_method is not None:
return f'\tauto result = {self.constructor_method}({parameters});\n'
if self.pointer_type == 'none':
if parameters != '':
parameters = f'({parameters})'
return f'\t{self.return_class} result{parameters};\n'
return f'\tauto result = duckdb::{self.pointer_type}<{self.return_class}>(new {self.return_class}({parameters}));\n'
def generate_base_class_code(base_class: SerializableClass):
base_class_serialize = ''
base_class_deserialize = ''
# properties
enum_type = ''
for entry in base_class.members:
if entry.serialize_property == base_class.enum_value:
enum_type = entry.type
base_class_serialize += base_class.get_serialize_element(entry)
type_name = replace_pointer(entry.type)
base_class_deserialize += base_class.get_deserialize_element(entry)
expressions = [x for x in base_class.children.items()]
expressions = sorted(expressions, key=lambda x: x[0])
# set parameters
for entry in base_class.set_parameters:
base_class_deserialize += SET_DESERIALIZE_PARAMETER_FORMAT.format(
property_type=entry.type, property_name=entry.name
)
base_class_deserialize += f'\t{base_class.pointer_type}<{base_class.name}> result;\n'
switch_cases = ''
for expr in expressions:
enum_value = expr[0]
child_data = expr[1]
if child_data.custom_switch_code is not None:
switch_cases += SWITCH_HEADER_FORMAT.format(
enum_type=enum_type, enum_value=enum_value, class_deserialize=child_data.name
)
switch_cases += '\n'.join(
['\t\t' + x for x in child_data.custom_switch_code.replace('\\n', '\n').split('\n')]
)
switch_cases += '\n'
continue
switch_cases += SWITCH_STATEMENT_FORMAT.format(
enum_type=enum_type, enum_value=enum_value, class_deserialize=child_data.name
)
assign_entries = []
for entry in base_class.members:
skip = False
for check_entry in [entry.name, entry.serialize_property]:
if check_entry in base_class.set_parameter_names:
skip = True
if check_entry == base_class.enum_value:
skip = True
if skip:
continue
assign_entries.append(entry)
# class switch statement
base_class_deserialize += SWITCH_CODE_FORMAT.format(
switch_variable=base_class.enum_value, case_statements=switch_cases, base_class=base_class.name
)
deserialize_return = get_return_value(base_class.pointer_type, base_class.return_type)
for entry in base_class.set_parameters:
base_class_deserialize += UNSET_DESERIALIZE_PARAMETER_FORMAT.format(property_type=entry.type)
for entry in assign_entries:
if entry.status != MemberVariableStatus.EXISTING:
continue
move = False
if entry.type in MOVE_LIST or is_container(entry.type) or is_pointer(entry.type):
move = True
if move:
base_class_deserialize += (
f'\tresult->{entry.deserialize_property} = std::move({entry.deserialize_property});\n'
)
else:
base_class_deserialize += f'\tresult->{entry.deserialize_property} = {entry.deserialize_property};\n'
if base_class.finalize_deserialization is not None:
for line in base_class.finalize_deserialization:
base_class_deserialize += "\t" + line + "\n"
base_class_deserialize += generate_return(base_class)
base_class_generation = ''
serialization = ''
if base_class.base is not None:
serialization += BASE_SERIALIZE_FORMAT.format(base_class_name=base_class.base)
base_class_generation += SERIALIZE_BASE_FORMAT.format(
class_name=base_class.name, members=serialization + base_class_serialize
)
base_class_generation += DESERIALIZE_BASE_FORMAT.format(
deserialize_return=deserialize_return, class_name=base_class.name, members=base_class_deserialize
)
return base_class_generation
def generate_class_code(class_entry: SerializableClass):
if class_entry.custom_implementation:
return None
class_serialize = ''
class_deserialize = ''
constructor_parameters: List[str] = []
constructor_entries = set()
last_constructor_index = -1
if class_entry.constructor is not None:
for constructor_entry_ in class_entry.constructor:
if constructor_entry_.endswith('&'):
constructor_entry = constructor_entry_[:-1]
is_reference = True
else:
constructor_entry = constructor_entry_
is_reference = False
constructor_entries.add(constructor_entry)
found = False
for entry_idx, entry in enumerate(class_entry.members):
if entry.name == constructor_entry:
if entry_idx > last_constructor_index:
last_constructor_index = entry_idx
type_name = replace_pointer(entry.type)
entry.deserialize_property = entry.deserialize_property.replace('.', '_')
if requires_move(type_name) and not is_reference:
constructor_parameters.append(f'std::move({entry.deserialize_property})')
else:
constructor_parameters.append(entry.deserialize_property)
found = True
break
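# Constructor entries prefixed with '$' are pulled from deserializer state via Get<T>(), and
# '?'-prefixed ones via TryGet<T>(); types in REFERENCE_LIST are passed as references.
# A hypothetical constructor list using this: ["$ClientContext", "name"]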
if constructor_entry.startswith('$') or constructor_entry.startswith('?'):
is_optional = constructor_entry.startswith('?')
if is_optional:
param_type = constructor_entry.replace('?', '')
get_format = TRY_GET_DESERIALIZE_PARAMETER_FORMAT
else:
param_type = constructor_entry.replace('$', '')
get_format = GET_DESERIALIZE_PARAMETER_FORMAT
if param_type in REFERENCE_LIST:
param_type += ' &'
constructor_parameters.append(get_format.format(property_type=param_type))
found = True
if class_entry.base_object is not None:
for entry in class_entry.base_object.set_parameters:
if entry.name == constructor_entry:
constructor_parameters.append(GET_DESERIALIZE_PARAMETER_FORMAT.format(property_type=entry.type))
found = True
break
if not found:
print(f"Constructor member \"{constructor_entry}\" was not found in members list")
exit(1)
elif class_entry.constructor_method is not None:
for entry_idx, entry in enumerate(class_entry.members):
if entry_idx > last_constructor_index:
last_constructor_index = entry_idx
constructor_entries.add(entry.name)
type_name = replace_pointer(entry.type)
entry.deserialize_property = entry.deserialize_property.replace('.', '_')
if requires_move(type_name):
constructor_parameters.append(f'std::move({entry.deserialize_property})')
else:
constructor_parameters.append(entry.deserialize_property)
if class_entry.base is not None:
class_serialize += BASE_SERIALIZE_FORMAT.format(base_class_name=class_entry.base)
for entry_idx in range(last_constructor_index + 1):
entry = class_entry.members[entry_idx]
class_deserialize += class_entry.get_deserialize_element(entry, base=entry.base, pointer_type='unique_ptr')
class_deserialize += class_entry.generate_constructor(constructor_parameters)
if class_entry.members is None:
return None
for entry_idx, entry in enumerate(class_entry.members):
write_property_name = entry.serialize_property
deserialize_template_str = DESERIALIZE_ELEMENT_CLASS_FORMAT
if entry.base:
deserialize_template_str = DESERIALIZE_ELEMENT_CLASS_BASE_FORMAT.replace(
'{base_property}', entry.base.replace('*', '')
).replace('{derived_property}', entry.type.replace('*', ''))
class_serialize += class_entry.get_serialize_element(entry)
type_name = replace_pointer(entry.type)
if entry_idx > last_constructor_index:
class_deserialize += get_deserialize_element_template(
deserialize_template_str,
entry.deserialize_property,
entry.name,
entry.id,
type_name,
entry.has_default,
entry.default,
entry.status,
class_entry.pointer_type,
)
elif entry.name not in constructor_entries and entry.status == MemberVariableStatus.EXISTING:
class_deserialize += get_deserialize_assignment(
entry.deserialize_property, entry.type, class_entry.pointer_type
)
if entry.name in class_entry.set_parameter_names and entry.status == MemberVariableStatus.EXISTING:
class_deserialize += SET_DESERIALIZE_PARAMETER_FORMAT.format(
property_type=entry.type, property_name=entry.name
)
for entry in class_entry.set_parameters:
class_deserialize += UNSET_DESERIALIZE_PARAMETER_FORMAT.format(
property_type=entry.type, property_name=entry.name
)
if class_entry.finalize_deserialization is not None:
for line in class_entry.finalize_deserialization:
class_deserialize += "\t" + line + "\n"
class_deserialize += generate_return(class_entry)
deserialize_return = get_return_value(class_entry.pointer_type, class_entry.return_type)
class_generation = ''
pattern = re.compile(r'<\w+>')
templated_type = ''
# Check if is a templated class
is_templated = pattern.search(class_entry.name)
if is_templated:
templated_type = TEMPLATED_BASE_FORMAT.format(template_name=is_templated.group()[1:-1])
class_generation += templated_type + SERIALIZE_BASE_FORMAT.format(
class_name=class_entry.name, members=class_serialize
)
class_generation += templated_type + DESERIALIZE_BASE_FORMAT.format(
deserialize_return=deserialize_return,
class_name=class_entry.name,
members=class_deserialize,
)
return class_generation
def check_children_for_duplicate_members(node: SerializableClass, parents: list, seen_names: set, seen_ids: set):
# Check for duplicate names
if node.members is not None:
for member in node.members:
if member.name in seen_names:
# Print the inheritance tree
exit(
f"Error: Duplicate member name \"{member.name}\" in class \"{node.name}\" ({' -> '.join(map(lambda x: x.name, parents))} -> {node.name})"
)
seen_names.add(member.name)
if member.id in seen_ids:
exit(
f"Error: Duplicate member id \"{member.id}\" in class \"{node.name}\" ({' -> '.join(map(lambda x: x.name, parents))} -> {node.name})"
)
seen_ids.add(member.id)
# Recurse
for child in node.children.values():
check_children_for_duplicate_members(child, parents + [node], seen_names.copy(), seen_ids.copy())
file_list = get_file_list()
for entry in file_list:
source_path = entry['source']
target_path = entry['target']
with open(source_path, 'r') as f:
try:
json_data = json.load(f)
except Exception as e:
print(f"Failed to parse {source_path}: {str(e)}")
exit(1)
include_list = [
'duckdb/common/serializer/serializer.hpp',
'duckdb/common/serializer/deserializer.hpp',
]
base_classes: List[SerializableClass] = []
classes: List[SerializableClass] = []
base_class_data: Dict[str, SerializableClass] = {}
for entry in json_data:
if 'includes' in entry:
if type(entry['includes']) != type([]):
print(f"Include list must be a list, found {type(entry['includes'])} (in {str(entry)})")
exit(1)
for include_entry in entry['includes']:
if include_entry not in include_list:
include_list.append(include_entry)
new_class = SerializableClass(entry)
if new_class.is_base_class:
# this class is a base class itself - construct the base class list
if new_class.name in base_class_data:
raise Exception(f"Duplicate base class \"{new_class.name}\"")
base_class_data[new_class.name] = new_class
base_classes.append(new_class)
else:
classes.append(new_class)
if new_class.base is not None:
# this class inherits from a base class - add the enum value
if new_class.base not in base_class_data:
raise Exception(f"Unknown base class \"{new_class.base}\" for entry \"{new_class.name}\"")
base_class_object = base_class_data[new_class.base]
new_class.inherit(base_class_object)
for enum_entry in new_class.enum_entries:
if enum_entry in base_class_object.children:
raise Exception(f"Duplicate enum entry \"{enum_entry}\"")
base_class_object.children[enum_entry] = new_class
# Ensure that there are no duplicate names in the inheritance tree
for base_class in base_classes:
if base_class.base is None:
# Root base class, now traverse the children
check_children_for_duplicate_members(base_class, [], set(), set())
with open(target_path, 'w+') as f:
include_list = ''.join([INCLUDE_FORMAT.format(filename=x) for x in include_list])
header = HEADER.format(include_list=include_list)
f.write(header)
# generate the base class serialization
for base_class in base_classes:
base_class_generation = generate_base_class_code(base_class)
f.write(base_class_generation)
# generate the class serialization
classes = sorted(classes, key=lambda x: x.name)
for class_entry in classes:
class_generation = generate_class_code(class_entry)
if class_generation is None:
continue
f.write(class_generation)
f.write(FOOTER)

View File

@@ -0,0 +1,10 @@
from settings_scripts import parse_and_sort_json_file, update_header_file, update_scopes, update_src_code
from settings_scripts.config import SettingsList, make_format
if __name__ == '__main__':
parse_and_sort_json_file()
update_header_file()
update_scopes()
update_src_code()
make_format()
print(f"- Successfully parsed and included {len(SettingsList)} setting(s)!")

View File

@@ -0,0 +1,77 @@
import json
import os
scripts_dir = os.path.dirname(os.path.abspath(__file__))
VERSION_MAP_PATH = scripts_dir + "/../src/storage/version_map.json"
STORAGE_INFO_PATH = scripts_dir + "/../src/storage/storage_info.cpp"
START_MARKER = "// START OF {type} VERSION INFO"
END_MARKER = "// END OF {type} VERSION INFO"
def generate_version_info_array(storage_versions, type, name, default):
result = []
name_upper = name.upper()
if 'latest' in storage_versions:
latest_value = storage_versions['latest']
result.append(f"const uint64_t LATEST_{name_upper} = {latest_value};")
result.append(f"const uint64_t DEFAULT_{name_upper} = {default};")
result.append(f"static const {type} {name}[] = {{")
for version_name, storage_version in storage_versions.items():
result.append(f'\t{{"{version_name}", {storage_version}}},')
result.append("\t{nullptr, 0}")
result.append("};\n")
return "\n".join(result)
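# Example of the emitted block for type='StorageVersionInfo', name='storage_version_info'
# (version names and numbers are placeholders):
#   const uint64_t LATEST_STORAGE_VERSION_INFO = 65;
#   const uint64_t DEFAULT_STORAGE_VERSION_INFO = 64;
#   static const StorageVersionInfo storage_version_info[] = {
#       {"v1.0.0", 64},
#       {nullptr, 0}
#   };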
def main():
with open(VERSION_MAP_PATH, 'r') as json_file:
version_map = json.load(json_file)
with open(STORAGE_INFO_PATH, "r") as cpp_file:
content = cpp_file.read()
for key in version_map['serialization']['values'].keys():
if key in ['latest']:
continue
if key not in version_map['storage']['values'].keys():
print(f'Key {key} found in serialization version but not in storage version')
exit(1)
types = ['storage', 'serialization']
for type in version_map:
if type not in types:
print(f"Unexpected key {type}")
exit(1)
capitalized_type = type.capitalize()
upper_type = type.upper()
array_code = generate_version_info_array(
version_map[type]['values'],
f'{capitalized_type}VersionInfo',
f'{type}_version_info',
version_map[type]['default'],
)
start_marker = START_MARKER.format(type=upper_type)
start_index = content.find(start_marker)
if start_index == -1:
print(f"storage_info.cpp is corrupted, could not find the START_MARKER for {type}")
exit(1)
end_marker = END_MARKER.format(type=upper_type)
end_index = content.find(end_marker)
if end_index == -1:
print(f"storage_info.cpp is corrupted, could not find the END_MARKER for {type}")
exit(1)
content = content[: start_index + len(start_marker)] + "\n" + array_code + content[end_index:]
with open(STORAGE_INFO_PATH, "w") as cpp_file:
cpp_file.write(content)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,49 @@
# this script re-generates the storage used for storage_version.test_slow
# before running this script, increment the version number in src/storage/storage_info.cpp and recompile (`make`)
import os
import subprocess
from python_helpers import open_utf8
shell_proc = os.path.join('build', 'release', 'duckdb')
gen_storage_script = os.path.join('test', 'sql', 'storage_version', 'generate_storage_version.sql')
gen_storage_target = os.path.join('test', 'sql', 'storage_version', 'storage_version.db')
def try_remove_file(fname):
try:
os.remove(fname)
except:
pass
try_remove_file(gen_storage_target)
try_remove_file(gen_storage_target + '.wal')
def run_command_in_shell(cmd):
print(cmd)
res = subprocess.run(
[shell_proc, '--batch', '-init', '/dev/null', gen_storage_target],
capture_output=True,
input=bytearray(cmd, 'utf8'),
)
stdout = res.stdout.decode('utf8').strip()
stderr = res.stderr.decode('utf8').strip()
if res.returncode != 0:
print("Failed to create database file!")
print("----STDOUT----")
print(stdout)
print("----STDERR----")
print(stderr)
with open_utf8(gen_storage_script, 'r') as f:
cmd = f.read()
run_command_in_shell(cmd)
run_command_in_shell('select * from integral_values')
run_command_in_shell('select * from integral_values')
try_remove_file(gen_storage_target + '.wal')

View File

@@ -0,0 +1,137 @@
import psycopg2
import argparse
import os
import platform
import shutil
import sys
import subprocess
import multiprocessing.pool
parser = argparse.ArgumentParser(description='Generate TPC-DS reference results from Postgres.')
parser.add_argument(
'--sf', dest='sf', action='store', help='The TPC-DS scale factor reference results to generate', default=1
)
parser.add_argument(
'--query-dir',
dest='query_dir',
action='store',
help='The directory with queries to run',
default='extension/tpcds/dsdgen/queries',
)
parser.add_argument(
'--answer-dir',
dest='answer_dir',
action='store',
help='The directory where to store the answers',
default='extension/tpcds/dsdgen/answers/sf${SF}',
)
parser.add_argument(
'--duckdb-path',
dest='duckdb_path',
action='store',
help='The path to the DuckDB executable',
default='build/reldebug/duckdb',
)
parser.add_argument(
'--skip-load',
dest='skip_load',
action='store_const',
const=True,
help='Whether or not to skip loading',
default=False,
)
parser.add_argument(
'--query-list', dest='query_list', action='store', help='The list of queries to run (default = all)', default=''
)
parser.add_argument('--nthreads', dest='nthreads', action='store', type=int, help='The number of threads', default=0)
args = parser.parse_args()
con = psycopg2.connect(database='postgres')
c = con.cursor()
if not args.skip_load:
tpcds_dir = f'tpcds_sf{args.sf}'
q = f"""
CALL dsdgen(sf={args.sf});
EXPORT DATABASE '{tpcds_dir}' (DELIMITER '|');
"""
proc = subprocess.Popen([args.duckdb_path, "-c", q])
proc.wait()
if proc.returncode != 0:
exit(1)
# drop the previous tables
tables = [
'name',
'web_site',
'web_sales',
'web_returns',
'web_page',
'warehouse',
'time_dim',
'store_sales',
'store_returns',
'store',
'ship_mode',
'reason',
'promotion',
'item',
'inventory',
'income_band',
'household_demographics',
'date_dim',
'customer_demographics',
'customer_address',
'customer',
'catalog_sales',
'catalog_returns',
'catalog_page',
'call_center',
]
for table in tables:
c.execute(f'DROP TABLE IF EXISTS {table};')
with open(os.path.join(tpcds_dir, 'schema.sql'), 'r') as f:
schema = f.read()
c.execute(schema)
with open(os.path.join(tpcds_dir, 'load.sql'), 'r') as f:
load = f.read()
load = load.replace(f'{tpcds_dir}/', f'{os.getcwd()}/{tpcds_dir}/')
c.execute(load)
con.commit()
# get a list of all queries
queries = os.listdir(args.query_dir)
queries.sort()
answer_dir = args.answer_dir.replace('${SF}', str(args.sf))
if len(args.query_list) > 0:
passing_queries = [x + '.sql' for x in args.query_list.split(',')]
queries = [x for x in queries if x in passing_queries]
queries.sort()
def run_query(q):
print(q)
with open(os.path.join(args.query_dir, q), 'r') as f:
sql_query = f.read()
answer_path = os.path.join(os.getcwd(), answer_dir, q.replace('.sql', '.csv'))
c.execute(f'DROP TABLE IF EXISTS "query_result{q}"')
c.execute(f'CREATE TABLE "query_result{q}" AS ' + sql_query)
c.execute(f"COPY \"query_result{q}\" TO '{answer_path}' (FORMAT CSV, DELIMITER '|', HEADER, NULL 'NULL')")
if args.nthreads == 0:
for q in queries:
run_query(q)
else:
pool = multiprocessing.pool.ThreadPool(processes=args.nthreads)
pool.map(run_query, queries)

View File

@@ -0,0 +1,116 @@
import os
import subprocess
duckdb_program = '/Users/myth/Programs/duckdb-bugfix/build/release/duckdb'
struct_def = '''struct $STRUCT_NAME {
static constexpr char *Name = "$NAME";
static const char *Columns[];
static constexpr idx_t ColumnCount = $COLUMN_COUNT;
static const LogicalType Types[];
static constexpr idx_t PrimaryKeyCount = $PK_COLUMN_COUNT;
static const char *PrimaryKeyColumns[];
};
'''
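# For a hypothetical table named 'call_center' this template expands to roughly (counts are placeholders):
#   struct CallCenterInfo {
#       static constexpr char *Name = "call_center";
#       static const char *Columns[];
#       static constexpr idx_t ColumnCount = 31;
#       static const LogicalType Types[];
#       static constexpr idx_t PrimaryKeyCount = 1;
#       static const char *PrimaryKeyColumns[];
#   };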
initcode = '''
call dsdgen(sf=0);
.mode csv
.header 0
'''
column_count_query = '''
select count(*) from pragma_table_info('$NAME');
'''
pk_column_count_query = '''
select count(*) from pragma_table_info('$NAME') where pk=true;
'''
gen_names = '''
select concat('const char *', '$STRUCT_NAME', '::Columns[] = {', STRING_AGG('"' || name || '"', ', ') || '};') from pragma_table_info('$NAME');
'''
gen_types = '''
select concat('const LogicalType ', '$STRUCT_NAME', '::Types[] = {', STRING_AGG('LogicalType::' || type, ', ') || '};') from pragma_table_info('$NAME');
'''
pk_columns = '''
select concat('const char *', '$STRUCT_NAME', '::PrimaryKeyColumns[] = {', STRING_AGG('"' || name || '"', ', ') || '};') from pragma_table_info('$NAME') where pk=true;
'''
def run_query(sql):
input_sql = initcode + '\n' + sql
res = subprocess.run(duckdb_program, input=input_sql.encode('utf8'), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout = res.stdout.decode('utf8').strip()
stderr = res.stderr.decode('utf8').strip()
if res.returncode != 0:
print("FAILED TO RUN QUERY")
print(stderr)
exit(1)
return stdout
def prepare_query(sql, table_name, struct_name):
return sql.replace('$NAME', table_name).replace('$STRUCT_NAME', struct_name)
header = '''
#pragma once
#include "duckdb.hpp"
#ifndef DUCKDB_AMALGAMATION
#include "duckdb/common/exception.hpp"
#include "duckdb/common/types/date.hpp"
#include "duckdb/parser/column_definition.hpp"
#include "duckdb/storage/data_table.hpp"
#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
#include "duckdb/planner/parsed_data/bound_create_table_info.hpp"
#include "duckdb/parser/parsed_data/create_table_info.hpp"
#include "duckdb/parser/constraints/not_null_constraint.hpp"
#include "duckdb/catalog/catalog.hpp"
#include "duckdb/planner/binder.hpp"
#endif
namespace tpcds {
using duckdb::LogicalType;
using duckdb::idx_t;
'''
footer = '''
}
'''
print(header)
table_list = run_query('show tables')
for table_name in table_list.split('\n'):
table_name = table_name.strip()
print(
'''
//===--------------------------------------------------------------------===//
// $NAME
//===--------------------------------------------------------------------===//'''.replace(
'$NAME', table_name
)
)
struct_name = str(table_name.title().replace('_', '')) + 'Info'
column_count = int(run_query(prepare_query(column_count_query, table_name, struct_name)).strip())
pk_column_count = int(run_query(prepare_query(pk_column_count_query, table_name, struct_name)).strip())
print(
prepare_query(struct_def, table_name, struct_name)
.replace('$COLUMN_COUNT', str(column_count))
.replace('$PK_COLUMN_COUNT', str(pk_column_count))
)
print(run_query(prepare_query(gen_names, table_name, struct_name)).replace('""', '"').strip('"'))
print("")
print(run_query(prepare_query(gen_types, table_name, struct_name)).strip('"'))
print("")
print(run_query(prepare_query(pk_columns, table_name, struct_name)).replace('""', '"').strip('"'))
print(footer)

View File

@@ -0,0 +1,21 @@
supported_vector_sizes = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
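# The printed output is a chain of preprocessor branches, one per supported size, e.g. for size 2:
#   #if STANDARD_VECTOR_SIZE == 2
#   const sel_t FlatVector::incremental_vector[] = {0, 1};
#   #elif STANDARD_VECTOR_SIZE == 4
#   ...
#   #else
#   #error Unsupported VECTOR_SIZE!
#   #endif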
result = ""
for i in range(len(supported_vector_sizes)):
vsize = supported_vector_sizes[i]
if i == 0:
result += "#if"
else:
result += "#elif"
result += " STANDARD_VECTOR_SIZE == " + str(vsize) + "\n"
result += "const sel_t FlatVector::incremental_vector[] = {"
for idx in range(vsize):
if idx != 0:
result += ", "
result += str(idx)
result += "};\n"
result += """#else
#error Unsupported VECTOR_SIZE!
#endif"""
print(result)

327
external/duckdb/scripts/gentpcecode.py vendored Normal file
View File

@@ -0,0 +1,327 @@
import os
from python_helpers import open_utf8
GENERATED_HEADER = 'include/tpce_generated.hpp'
GENERATED_SOURCE = 'tpce_generated.cpp'
TPCE_DIR = os.path.join('third_party', 'tpce-tool')
GENERATED_HEADER = os.path.join(TPCE_DIR, GENERATED_HEADER)
GENERATED_SOURCE = os.path.join(TPCE_DIR, GENERATED_SOURCE)
current_table = None
tables = {}
print(GENERATED_HEADER)
print(GENERATED_SOURCE)
header = open_utf8(GENERATED_HEADER, 'w+')
source = open_utf8(GENERATED_SOURCE, 'w+')
for fp in [header, source]:
fp.write(
"""
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
// THIS FILE IS GENERATED BY gentpcecode.py, DO NOT EDIT MANUALLY //
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
"""
)
header.write(
"""
#include "duckdb/catalog/catalog.hpp"
#include "duckdb/main/appender.hpp"
#include "duckdb/main/connection.hpp"
#include "duckdb/main/database.hpp"
#include "main/BaseLoader.h"
#include "main/BaseLoaderFactory.h"
#include "main/NullLoader.h"
#include "main/TableRows.h"
namespace TPCE {
class DuckDBLoaderFactory : public CBaseLoaderFactory {
duckdb::Connection &con;
std::string schema;
std::string suffix;
public:
DuckDBLoaderFactory(duckdb::Connection &con, std::string schema,
std::string suffix)
: con(con), schema(schema), suffix(suffix) {
}
// Functions to create loader classes for individual tables.
virtual CBaseLoader<ACCOUNT_PERMISSION_ROW> *
CreateAccountPermissionLoader();
virtual CBaseLoader<ADDRESS_ROW> *CreateAddressLoader();
virtual CBaseLoader<BROKER_ROW> *CreateBrokerLoader();
virtual CBaseLoader<CASH_TRANSACTION_ROW> *
CreateCashTransactionLoader();
virtual CBaseLoader<CHARGE_ROW> *CreateChargeLoader();
virtual CBaseLoader<COMMISSION_RATE_ROW> *CreateCommissionRateLoader();
virtual CBaseLoader<COMPANY_COMPETITOR_ROW> *
CreateCompanyCompetitorLoader();
virtual CBaseLoader<COMPANY_ROW> *CreateCompanyLoader();
virtual CBaseLoader<CUSTOMER_ACCOUNT_ROW> *
CreateCustomerAccountLoader();
virtual CBaseLoader<CUSTOMER_ROW> *CreateCustomerLoader();
virtual CBaseLoader<CUSTOMER_TAXRATE_ROW> *
CreateCustomerTaxrateLoader();
virtual CBaseLoader<DAILY_MARKET_ROW> *CreateDailyMarketLoader();
virtual CBaseLoader<EXCHANGE_ROW> *CreateExchangeLoader();
virtual CBaseLoader<FINANCIAL_ROW> *CreateFinancialLoader();
virtual CBaseLoader<HOLDING_ROW> *CreateHoldingLoader();
virtual CBaseLoader<HOLDING_HISTORY_ROW> *CreateHoldingHistoryLoader();
virtual CBaseLoader<HOLDING_SUMMARY_ROW> *CreateHoldingSummaryLoader();
virtual CBaseLoader<INDUSTRY_ROW> *CreateIndustryLoader();
virtual CBaseLoader<LAST_TRADE_ROW> *CreateLastTradeLoader();
virtual CBaseLoader<NEWS_ITEM_ROW> *CreateNewsItemLoader();
virtual CBaseLoader<NEWS_XREF_ROW> *CreateNewsXRefLoader();
virtual CBaseLoader<SECTOR_ROW> *CreateSectorLoader();
virtual CBaseLoader<SECURITY_ROW> *CreateSecurityLoader();
virtual CBaseLoader<SETTLEMENT_ROW> *CreateSettlementLoader();
virtual CBaseLoader<STATUS_TYPE_ROW> *CreateStatusTypeLoader();
virtual CBaseLoader<TAX_RATE_ROW> *CreateTaxRateLoader();
virtual CBaseLoader<TRADE_HISTORY_ROW> *CreateTradeHistoryLoader();
virtual CBaseLoader<TRADE_ROW> *CreateTradeLoader();
virtual CBaseLoader<TRADE_REQUEST_ROW> *CreateTradeRequestLoader();
virtual CBaseLoader<TRADE_TYPE_ROW> *CreateTradeTypeLoader();
virtual CBaseLoader<WATCH_ITEM_ROW> *CreateWatchItemLoader();
virtual CBaseLoader<WATCH_LIST_ROW> *CreateWatchListLoader();
virtual CBaseLoader<ZIP_CODE_ROW> *CreateZipCodeLoader();
};
"""
)
source.write(
"""
#include "tpce_generated.hpp"
using namespace duckdb;
using namespace std;
namespace TPCE {
struct tpce_append_information {
tpce_append_information(Connection &con, string schema, string table) :
appender(con, schema, table) {}
Appender appender;
};
static void append_value(tpce_append_information &info, int32_t value) {
info.appender.Append<int32_t>(value);
}
static void append_bigint(tpce_append_information &info, int64_t value) {
info.appender.Append<int64_t>(value);
}
static void append_string(tpce_append_information &info, const char *value) {
info.appender.Append<Value>(Value(value));
}
static void append_double(tpce_append_information &info, double value) {
info.appender.Append<double>(value);
}
static void append_bool(tpce_append_information &info, bool value) {
info.appender.Append<bool>(value);
}
static void append_timestamp(tpce_append_information &info, CDateTime time) {
int32_t year = 0, month = 0, day = 0, hour = 0, minute = 0, second = 0, msec = 0;
time.GetYMDHMS(&year, &month, &day, &hour, &minute, &second, &msec);
info.appender.Append<Value>(Value::TIMESTAMP(year, month, day, hour, minute, second, msec * 1000));
}
void append_char(tpce_append_information &info, char value) {
char val[2];
val[0] = value;
val[1] = '\\0';
append_string(info, val);
}
template <typename T> class DuckDBBaseLoader : public CBaseLoader<T> {
protected:
tpce_append_information info;
public:
DuckDBBaseLoader(Connection &con, string schema, string table) :
info(con, schema, table) {
}
void FinishLoad() {
}
};
"""
)
with open(os.path.join(TPCE_DIR, 'include/main/TableRows.h'), 'r') as f:
for line in f:
line = line.strip()
if line.startswith('typedef struct '):
line = line.replace('typedef struct ', '')
current_table = line.split(' ')[0].replace('_ROW', ' ').replace('_', ' ').lower().strip()
tables[current_table] = []
elif line.startswith('}'):
current_table = None
elif current_table is not None:
# row
# get type
splits = line.strip().split(' ')
if len(splits) < 2:
continue
line = splits[0]
name = splits[1].split(';')[0].split('[')[0].lower()
is_single_char = False
if 'TIdent' in line or 'INT64' in line or 'TTrade' in line:
tpe = "TypeId::BIGINT"
sqltpe = "BIGINT"
elif 'double' in line or 'float' in line:
tpe = "TypeId::DECIMAL"
sqltpe = "DECIMAL"
elif 'int' in line:
tpe = "TypeId::INTEGER"
sqltpe = "INTEGER"
elif 'CDateTime' in line:
tpe = "TypeId::TIMESTAMP"
sqltpe = "TIMESTAMP"
elif 'bool' in line:
tpe = 'TypeId::BOOLEAN'
sqltpe = "BOOLEAN"
elif 'char' in line:
if '[' not in splits[1]:
is_single_char = True
tpe = "TypeId::VARCHAR"
sqltpe = "VARCHAR"
else:
continue
tables[current_table].append([name, tpe, is_single_char, sqltpe])
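# Example: a TableRows.h field such as "TIdent CA_AD_ID;" (hypothetical) would be recorded as
# ['ca_ad_id', 'TypeId::BIGINT', False, 'BIGINT'] under the enclosing *_ROW table.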
def get_tablename(name):
name = name.title().replace(' ', '')
if name == 'NewsXref':
return 'NewsXRef'
return name
for table in tables.keys():
source.write(
"""
class DuckDB${TABLENAME}Load : public DuckDBBaseLoader<${ROW_TYPE}> {
public:
DuckDB${TABLENAME}Load(Connection &con, string schema, string table) :
DuckDBBaseLoader(con, schema, table) {
}
void WriteNextRecord(const ${ROW_TYPE} &next_record) {
info.appender.BeginRow();""".replace(
"${TABLENAME}", get_tablename(table)
).replace(
"${ROW_TYPE}", table.upper().replace(' ', '_') + '_ROW'
)
)
source.write("\n")
collist = tables[table]
for i in range(len(collist)):
entry = collist[i]
name = entry[0].upper()
tpe = entry[1]
if tpe == "TypeId::BIGINT":
funcname = "bigint"
elif tpe == "TypeId::DECIMAL":
funcname = "double"
elif tpe == "TypeId::INTEGER":
funcname = "value"
elif tpe == "TypeId::TIMESTAMP":
funcname = "timestamp"
elif tpe == 'TypeId::BOOLEAN':
funcname = "bool"
elif tpe == "TypeId::VARCHAR":
if entry[2]:
funcname = "char"
else:
funcname = "string"
else:
print("Unknown type " + tpe)
exit(1)
source.write("\t\tappend_%s(info, next_record.%s);" % (funcname, name))
if i != len(collist) - 1:
source.write("\n")
source.write(
"""
info.appender.EndRow();
}
};"""
)
for table in tables.keys():
source.write(
"""
CBaseLoader<${ROW_TYPE}> *
DuckDBLoaderFactory::Create${TABLENAME}Loader() {
return new DuckDB${TABLENAME}Load(con, schema, "${TABLEINDB}" + suffix);
}
""".replace(
"${TABLENAME}", get_tablename(table)
)
.replace("${ROW_TYPE}", table.upper().replace(' ', '_') + '_ROW')
.replace("${TABLEINDB}", table.replace(' ', '_'))
)
source.write("\n")
# static string RegionSchema(string schema, string suffix) {
# return "CREATE TABLE " + schema + ".region" + suffix + " ("
# "r_regionkey INT NOT NULL,"
# "r_name VARCHAR(25) NOT NULL,"
# "r_comment VARCHAR(152) NOT NULL);";
# }
for table in tables.keys():
tname = table.replace(' ', '_')
str = 'static string ' + table.title().replace(' ', '') + 'Schema(string schema, string suffix) {\n'
str += '\treturn "CREATE TABLE " + schema + ".%s" + suffix + " ("\n' % (tname,)
columns = tables[table]
for i in range(len(columns)):
column = columns[i]
str += '\t "' + column[0] + " " + column[3]
if i == len(columns) - 1:
str += ')";'
else:
str += ',"'
str += "\n"
str += "}\n\n"
source.write(str)
func = 'void CreateTPCESchema(duckdb::DuckDB &db, duckdb::Connection &con, std::string &schema, std::string &suffix)'
header.write(func + ';\n\n')
source.write(func + ' {\n')
# con.Query(RegionSchema(schema, suffix));
for table in tables.keys():
tname = table.replace(' ', '_')
source.write('\tcon.Query(%sSchema(schema, suffix));\n' % (table.title().replace(' ', '')))
source.write('}\n\n')
for fp in [header, source]:
fp.write("} /* namespace TPCE */\n")
fp.close()

View File

@@ -0,0 +1,61 @@
import argparse
import sys
import subprocess
import re
import os
DEFAULT_UNITTEST_PATH = 'build/release/test/unittest'
parser = argparse.ArgumentParser(description='Print a list of tests to run.')
parser.add_argument(
'--file-contains',
dest='file_contains',
action='store',
help='Filter based on a string contained in the text',
default=None,
)
parser.add_argument(
'--unittest',
dest='unittest',
action='store',
help='The path to the unittest program',
default=DEFAULT_UNITTEST_PATH,
)
parser.add_argument('--list', dest='filter', action='store', help='The unittest filter to apply', default='')
args = parser.parse_args()
file_contains = args.file_contains
extra_args = [args.filter]
unittest_program = args.unittest
# Override default for windows
if os.name == 'nt' and unittest_program == DEFAULT_UNITTEST_PATH:
unittest_program = 'build/release/test/Release/unittest.exe'
proc = subprocess.Popen([unittest_program, '-l'] + extra_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout = proc.stdout.read().decode('utf8')
stderr = proc.stderr.read().decode('utf8')
if proc.returncode is not None and proc.returncode != 0:
print("Failed to run program " + unittest_program)
print(proc.returncode)
print(stdout)
print(stderr)
exit(1)
test_cases = []
for line in stdout.splitlines()[1:]:
if not line.strip():
continue
splits = line.rsplit('\t', 1)
if file_contains is not None:
if not os.path.isfile(splits[0]):
continue
try:
with open(splits[0], 'r') as f:
text = f.read()
except UnicodeDecodeError:
continue
if file_contains not in text:
continue
print(splits[0])

View File

@@ -0,0 +1,78 @@
import amalgamation
import os
import re
import sys
import shutil
from python_helpers import open_utf8
include_counts = {}
include_chains = {}
cached_includes = {}
def analyze_include_file(fpath, already_included_files, prev_include=""):
if fpath in already_included_files:
return
if fpath in amalgamation.always_excluded:
return
if fpath not in cached_includes:
# print(fpath)
with open_utf8(fpath, 'r') as f:
text = f.read()
(statements, includes) = amalgamation.get_includes(fpath, text)
cached_includes[fpath] = includes
else:
includes = cached_includes[fpath]
if fpath in include_counts:
include_counts[fpath] += 1
else:
include_counts[fpath] = 1
if fpath not in include_chains:
include_chains[fpath] = {}
if prev_include not in include_chains[fpath]:
include_chains[fpath][prev_include] = 0
include_chains[fpath][prev_include] += 1
already_included_files.append(fpath)
if fpath.endswith('.h') or fpath.endswith('.hpp'):
prev_include = fpath
for include in includes:
analyze_include_file(include, already_included_files, prev_include)
def analyze_includes(dir):
files = os.listdir(dir)
files.sort()
for fname in files:
if fname in amalgamation.excluded_files:
continue
fpath = os.path.join(dir, fname)
if os.path.isdir(fpath):
analyze_includes(fpath)
elif fname.endswith('.cpp') or fname.endswith('.c') or fname.endswith('.cc'):
analyze_include_file(fpath, [])
for compile_dir in amalgamation.compile_directories:
analyze_includes(compile_dir)
kws = []
for entry in include_counts.keys():
kws.append([entry, include_counts[entry]])
kws.sort(key=lambda tup: -tup[1])
for k in range(0, len(kws)):
include_file = kws[k][0]
include_count = kws[k][1]
print("------------------------------------------------------------")
print(include_file + " (" + str(include_count) + ")")
print("------------------------------------------------------------")
print("FILE INCLUDED FROM:")
chainkws = []
for chain in include_chains[include_file]:
chainkws.append([chain, include_chains[include_file][chain]])
chainkws.sort(key=lambda tup: -tup[1])
for l in range(0, min(5, len(chainkws))):
print(chainkws[l])

21
external/duckdb/scripts/install_node.sh vendored Executable file
View File

@@ -0,0 +1,21 @@
#!/usr/bin/env bash
if [[ ${1:-false} == 'false' ]]; then
echo "Error: pass node version as first argument"
exit 1
fi
NODE_VERSION=$1
# if an existing nvm is already installed we need to unload it
nvm unload || true
# here we set up the node version on the fly based on the matrix value.
# This is done manually so that the build works the same on OS X
rm -rf ./__nvm/ && git clone --depth 1 https://github.com/creationix/nvm.git ./__nvm
source ./__nvm/nvm.sh
nvm install ${NODE_VERSION}
nvm use --delete-prefix ${NODE_VERSION}
node --version
npm --version
which node

View File

@@ -0,0 +1,33 @@
import argparse
import requests
parser = argparse.ArgumentParser(description='Generate the list of packages provided by the registry at <baseline>.')
parser.add_argument(
'--baseline',
action='store',
help='The baseline (git commit) of the vcpkg-duckdb-ports',
required=True,
)
args = parser.parse_args()
GITHUB_API = "https://api.github.com/repos/duckdb/vcpkg-duckdb-ports/git/trees"
def main():
# Get the tree recursively for the commit
response = requests.get(f"{GITHUB_API}/{args.baseline}?recursive=1")
response.raise_for_status()
# Extract package names from ports directory
packages = set()
for item in response.json()['tree']:
path = item['path']
if path.startswith('ports/'):
parts = path.split('/')
if len(parts) > 2:
packages.add(parts[1])
print(sorted(list(packages)))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,97 @@
import json
import os
import sys
# Pass vcpkg.json files to this script to merge their dependencies into a single vcpkg.json with a
# combined, deduplicated dependency list. Note that this script is deliberately simple; some manual
# merging may still be required to combine extensions from multiple builds when dependencies collide.
# Also note: because the httpfs extension currently cannot use the latest openssl version (3.1), we
# need to pin the openssl version, which in turn requires pinning the vcpkg version here. When updating
# the vcpkg git hash, change it both here and in '.github/actions/build_extensions/action.yml'.
dependencies_str = []
dependencies_dict = []
merged_overlay_ports = []
merged_overlay_triplets = []
def prefix_overlay_ports_or_triples(overlay_dir, path_to_vcpkg_json):
def prefix_overlay_port_or_triplet(overlay_port_or_triplet):
vcpkg_prefix_path = path_to_vcpkg_json[0 : path_to_vcpkg_json.find("/vcpkg.json")]
if len(vcpkg_prefix_path) == 0:
return overlay_port_or_triplet
return vcpkg_prefix_path + '/' + overlay_port_or_triplet
return map(prefix_overlay_port_or_triplet, overlay_dir)
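# e.g. an overlay entry 'vcpkg_ports' declared in 'extension/httpfs/vcpkg.json' (hypothetical path)
# is rewritten to 'extension/httpfs/vcpkg_ports'.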
for file in sys.argv[1:]:
f = open(file)
data = json.load(f)
if 'dependencies' in data:
for dep in data['dependencies']:
if type(dep) is str:
dependencies_str.append(dep)
elif type(dep) is dict:
dependencies_dict.append(dep)
else:
raise Exception(f"Unknown entry type found in dependencies: '{dep}'")
if 'vcpkg-configuration' in data:
if 'overlay-ports' in data['vcpkg-configuration']:
merged_overlay_ports += prefix_overlay_ports_or_triples(data['vcpkg-configuration']['overlay-ports'], file)
if 'overlay-triplets' in data['vcpkg-configuration']:
merged_overlay_triplets += prefix_overlay_ports_or_triples(
data['vcpkg-configuration']['overlay-triplets'], file
)
final_deduplicated_deps = list()
dedup_set = set()
for dep in dependencies_dict:
if dep['name'] not in dedup_set:
final_deduplicated_deps.append(dep)
# TODO: deduplication is disabled for now, just let vcpkg handle duplicates in deps
# dedup_set.add(dep['name'])
for dep in dependencies_str:
if dep not in dedup_set:
final_deduplicated_deps.append(dep)
# TODO: deduplication is disabled for now, just let vcpkg handle duplicates in deps
# dedup_set.add(dep)
opensslVersion = os.getenv("OPENSSL_VERSION_OVERRIDE", "3.0.8")
data = {
"description": f"Auto-generated vcpkg.json for combined DuckDB extension build, generated by 'scripts/merge_vcpkg_deps.py'",
"builtin-baseline": "ce613c41372b23b1f51333815feb3edd87ef8a8b",
"dependencies": final_deduplicated_deps,
"overrides": [{"name": "openssl", "version": opensslVersion}],
}
data['vcpkg-configuration'] = {}
if merged_overlay_ports:
data['vcpkg-configuration']['overlay-ports'] = merged_overlay_ports
if merged_overlay_triplets:
data['vcpkg-configuration']['overlay-triplets'] = merged_overlay_triplets
REGISTRY_BASELINE = '869bddccca976e0abe25894356e7f49e77765169'
# NOTE: use 'scripts/list_vcpkg_registry_packages.py --baseline <baseline>' to generate the list of packages
data['vcpkg-configuration']['registries'] = [
{
"kind": "git",
"repository": "https://github.com/duckdb/vcpkg-duckdb-ports",
"baseline": REGISTRY_BASELINE,
"packages": ['avro-c', 'vcpkg-cmake'],
}
]
# Print output
print("Writing to 'build/extension_configuration/vcpkg.json': ")
print(data["dependencies"])
with open('build/extension_configuration/vcpkg.json', 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)

View File

@@ -0,0 +1,75 @@
# This script is used by CI to modify the deployment matrix for the extension distribution
import argparse
import json
import sys
import logging
# Define command-line arguments
parser = argparse.ArgumentParser(description="Filter a JSON file based on excluded duckdb_arch values and select an OS")
parser.add_argument("--input", required=True, help="Input JSON file path")
parser.add_argument("--exclude", required=True, help="Semicolon-separated list of excluded duckdb_arch values")
parser.add_argument("--output", help="Output JSON file path")
parser.add_argument("--pretty", action="store_true", help="Pretty print the output JSON")
parser.add_argument("--select_os", help="Select an OS to include in the output JSON")
parser.add_argument("--deploy_matrix", action="store_true", help="Create a merged list used in deploy step")
args = parser.parse_args()
# Parse the input file path, excluded arch values, and output file path
input_json_file_path = args.input
excluded_arch_values = args.exclude.split(";")
output_json_file_path = args.output
select_os = args.select_os
# Read the input JSON file
with open(input_json_file_path, "r") as json_file:
data = json.load(json_file)
# Function to filter entries based on duckdb_arch values
def filter_entries(data, arch_values):
for os, config in data.items():
if "include" in config:
config["include"] = [entry for entry in config["include"] if entry["duckdb_arch"] not in arch_values]
if not config["include"]:
del config["include"]
return data
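# Example (illustrative values): with --exclude "wasm_mvp;wasm_eh", any entry whose duckdb_arch is
# 'wasm_mvp' or 'wasm_eh' is dropped from every OS's "include" list, and an "include" key that
# becomes empty is removed entirely.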
# Filter the JSON data
filtered_data = filter_entries(data, excluded_arch_values)
# Select an OS if specified
if select_os:
found = False
for os in filtered_data.keys():
if os == select_os:
filtered_data = filtered_data[os]
found = True
break
if not found:
logging.warning('A selection OS was provided but not found')
filtered_data = []
# When deploy_matrix is specified, we only output a single merged include list with all the duckdb_archs
elif args.deploy_matrix:
deploy_archs = []
for os, config in filtered_data.items():
if "include" in config:
for item in config["include"]:
deploy_archs.append({"duckdb_arch": item["duckdb_arch"]})
filtered_data = {"include": deploy_archs}
# Determine the JSON formatting
indent = 2 if args.pretty else None
# If no output file is provided, print to stdout
if output_json_file_path:
with open(output_json_file_path, "w") as output_json_file:
if filtered_data:
json.dump(filtered_data, output_json_file, indent=indent)
else:
json.dump(filtered_data, sys.stdout, indent=indent)

BIN
external/duckdb/scripts/null.txt vendored Normal file

Binary file not shown.

View File

@@ -0,0 +1,15 @@
# create variables
export CERTIFICATE_PATH=$RUNNER_TEMP/build_certificate.p12
export KEYCHAIN_PATH=$RUNNER_TEMP/app-signing.keychain-db
# import certificate and provisioning profile from secrets
echo -n "$BUILD_CERTIFICATE_BASE64" | base64 --decode -o $CERTIFICATE_PATH
# create temporary keychain
security create-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH
security set-keychain-settings -lut 21600 $KEYCHAIN_PATH
security unlock-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH
# import certificate to keychain
security import $CERTIFICATE_PATH -P "$P12_PASSWORD" -A -t cert -f pkcs12 -k $KEYCHAIN_PATH
security list-keychain -d user -s $KEYCHAIN_PATH

417
external/duckdb/scripts/package_build.py vendored Normal file
View File

@@ -0,0 +1,417 @@
import os
import sys
import shutil
import subprocess
from python_helpers import open_utf8
import re
excluded_objects = ['utf8proc_data.cpp']
def third_party_includes():
includes = []
includes += [os.path.join('third_party', 'concurrentqueue')]
includes += [os.path.join('third_party', 'fast_float')]
includes += [os.path.join('third_party', 'fastpforlib')]
includes += [os.path.join('third_party', 'fmt', 'include')]
includes += [os.path.join('third_party', 'fsst')]
includes += [os.path.join('third_party', 'httplib')]
includes += [os.path.join('third_party', 'hyperloglog')]
includes += [os.path.join('third_party', 'jaro_winkler')]
includes += [os.path.join('third_party', 'jaro_winkler', 'details')]
includes += [os.path.join('third_party', 'libpg_query')]
includes += [os.path.join('third_party', 'libpg_query', 'include')]
includes += [os.path.join('third_party', 'lz4')]
includes += [os.path.join('third_party', 'brotli', 'include')]
includes += [os.path.join('third_party', 'brotli', 'common')]
includes += [os.path.join('third_party', 'brotli', 'dec')]
includes += [os.path.join('third_party', 'brotli', 'enc')]
includes += [os.path.join('third_party', 'mbedtls', 'include')]
includes += [os.path.join('third_party', 'mbedtls', 'library')]
includes += [os.path.join('third_party', 'miniz')]
includes += [os.path.join('third_party', 'pcg')]
includes += [os.path.join('third_party', 'pdqsort')]
includes += [os.path.join('third_party', 're2')]
includes += [os.path.join('third_party', 'ska_sort')]
includes += [os.path.join('third_party', 'skiplist')]
includes += [os.path.join('third_party', 'tdigest')]
includes += [os.path.join('third_party', 'utf8proc')]
includes += [os.path.join('third_party', 'utf8proc', 'include')]
includes += [os.path.join('third_party', 'vergesort')]
includes += [os.path.join('third_party', 'yyjson', 'include')]
includes += [os.path.join('third_party', 'zstd', 'include')]
return includes
def third_party_sources():
sources = []
sources += [os.path.join('third_party', 'fmt')]
sources += [os.path.join('third_party', 'fsst')]
sources += [os.path.join('third_party', 'miniz')]
sources += [os.path.join('third_party', 're2')]
sources += [os.path.join('third_party', 'hyperloglog')]
sources += [os.path.join('third_party', 'skiplist')]
sources += [os.path.join('third_party', 'fastpforlib')]
sources += [os.path.join('third_party', 'utf8proc')]
sources += [os.path.join('third_party', 'libpg_query')]
sources += [os.path.join('third_party', 'mbedtls')]
sources += [os.path.join('third_party', 'yyjson')]
sources += [os.path.join('third_party', 'zstd')]
return sources
def file_is_lib(fname, libname):
libextensions = ['.a', '.lib']
libprefixes = ['', 'lib']
for ext in libextensions:
for prefix in libprefixes:
potential_libname = prefix + libname + ext
if fname == potential_libname:
return True
return False
def get_libraries(binary_dir, libraries, extensions):
result_libs = []
def find_library_recursive(search_dir, libname):
flist = os.listdir(search_dir)
for fname in flist:
fpath = os.path.join(search_dir, fname)
if os.path.isdir(fpath):
entry = find_library_recursive(fpath, libname)
if entry != None:
return entry
elif os.path.isfile(fpath) and file_is_lib(fname, libname):
return search_dir
return None
def find_library(search_dir, libname, result_libs, required=False):
if libname == 'Threads::Threads':
result_libs += [(None, 'pthread')]
return
libdir = find_library_recursive(binary_dir, libname)
if libdir is None and required:
raise Exception(f"Failed to locate required library {libname} in {binary_dir}")
result_libs += [(libdir, libname)]
duckdb_lib_name = 'duckdb_static'
if os.name == 'nt':
duckdb_lib_name = 'duckdb'
find_library(os.path.join(binary_dir, 'src'), duckdb_lib_name, result_libs, True)
for ext in extensions:
find_library(os.path.join(binary_dir, 'extension', ext), ext + '_extension', result_libs, True)
for libname in libraries:
find_library(binary_dir, libname, result_libs)
return result_libs
def includes(extensions):
scripts_dir = os.path.dirname(os.path.abspath(__file__))
# add includes for duckdb and extensions
includes = []
includes.append(os.path.join(scripts_dir, '..', 'src', 'include'))
includes.append(os.path.join(scripts_dir, '..'))
includes.append(os.path.join(scripts_dir, '..', 'third_party', 'utf8proc', 'include'))
for ext in extensions:
includes.append(os.path.join(scripts_dir, '..', 'extension', ext, 'include'))
return includes
def include_flags(extensions):
return ' ' + ' '.join(['-I' + x for x in includes(extensions)])
def convert_backslashes(x):
return '/'.join(x.split(os.path.sep))
def get_relative_path(source_dir, target_file):
source_dir = convert_backslashes(source_dir)
target_file = convert_backslashes(target_file)
# absolute path: try to convert
if source_dir in target_file:
target_file = target_file.replace(source_dir, "").lstrip('/')
return target_file
######
# MAIN_BRANCH_VERSIONING default should be 'True' for main branch and feature branches
# MAIN_BRANCH_VERSIONING default should be 'False' for release branches
# MAIN_BRANCH_VERSIONING's default value needs to be kept in sync between:
# - CMakeLists.txt
# - scripts/amalgamation.py
# - scripts/package_build.py
######
MAIN_BRANCH_VERSIONING = True
if os.getenv('MAIN_BRANCH_VERSIONING') == "0":
MAIN_BRANCH_VERSIONING = False
if os.getenv('MAIN_BRANCH_VERSIONING') == "1":
MAIN_BRANCH_VERSIONING = True
def get_git_describe():
override_git_describe = os.getenv('OVERRIDE_GIT_DESCRIBE') or ''
versioning_tag_match = 'v*.*.*'
if MAIN_BRANCH_VERSIONING:
versioning_tag_match = 'v*.*.0'
    # override_git_describe is empty, either because the env variable was an empty string or not set at all
    # -> ask git instead (this can fail, hence the try/except)
if len(override_git_describe) == 0:
try:
return (
subprocess.check_output(
['git', 'describe', '--tags', '--long', '--debug', '--match', versioning_tag_match]
)
.strip()
.decode('utf8')
)
except subprocess.CalledProcessError:
return "v0.0.0-0-gdeadbeeff"
if len(override_git_describe.split('-')) == 3:
return override_git_describe
if len(override_git_describe.split('-')) == 1:
override_git_describe += "-0"
assert len(override_git_describe.split('-')) == 2
try:
return (
override_git_describe
+ "-g"
+ subprocess.check_output(['git', 'log', '-1', '--format=%h']).strip().decode('utf8')
)
except subprocess.CalledProcessError:
return override_git_describe + "-g" + "deadbeeff"
def git_commit_hash():
if 'SETUPTOOLS_SCM_PRETEND_HASH' in os.environ:
return os.environ['SETUPTOOLS_SCM_PRETEND_HASH']
try:
git_describe = get_git_describe()
hash = git_describe.split('-')[2].lstrip('g')
return hash
except:
return "deadbeeff"
def prefix_version(version):
"""Make sure the version is prefixed with 'v' to be of the form vX.Y.Z"""
if version.startswith('v'):
return version
return 'v' + version
def git_dev_version():
if 'SETUPTOOLS_SCM_PRETEND_VERSION' in os.environ:
return prefix_version(os.environ['SETUPTOOLS_SCM_PRETEND_VERSION'])
try:
long_version = get_git_describe()
version_splits = long_version.split('-')[0].lstrip('v').split('.')
dev_version = long_version.split('-')[1]
if int(dev_version) == 0:
# directly on a tag: emit the regular version
return "v" + '.'.join(version_splits)
else:
# not on a tag: increment the version by one and add a -devX suffix
            # this needs to be kept in sync with changes to CMakeLists.txt
            if MAIN_BRANCH_VERSIONING:
# increment minor version
version_splits[1] = str(int(version_splits[1]) + 1)
else:
# increment patch version
version_splits[2] = str(int(version_splits[2]) + 1)
return "v" + '.'.join(version_splits) + "-dev" + dev_version
except:
return "v0.0.0"
def include_package(pkg_name, pkg_dir, include_files, include_list, source_list):
import amalgamation
original_path = sys.path
# append the directory
sys.path.append(pkg_dir)
ext_pkg = __import__(pkg_name + '_config')
ext_include_dirs = ext_pkg.include_directories
ext_source_files = ext_pkg.source_files
include_files += amalgamation.list_includes_files(ext_include_dirs)
include_list += ext_include_dirs
source_list += ext_source_files
sys.path = original_path
def build_package(target_dir, extensions, linenumbers=False, unity_count=32, folder_name='duckdb', short_paths=False):
if not os.path.isdir(target_dir):
os.mkdir(target_dir)
scripts_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(scripts_dir)
import amalgamation
prev_wd = os.getcwd()
os.chdir(os.path.join(scripts_dir, '..'))
# obtain the list of source files from the amalgamation
source_list = amalgamation.list_sources()
include_list = amalgamation.list_include_dirs()
include_files = amalgamation.list_includes()
def copy_file(src, target_dir):
# get the path
full_path = src.split(os.path.sep)
current_path = target_dir
for i in range(len(full_path) - 1):
current_path = os.path.join(current_path, full_path[i])
if not os.path.isdir(current_path):
os.mkdir(current_path)
target_name = full_path[-1]
target_file = os.path.join(current_path, target_name)
amalgamation.copy_if_different(src, target_file)
# include the main extension helper
include_files += [os.path.join('src', 'include', 'duckdb', 'main', 'extension_helper.hpp')]
# include the separate extensions
for ext in extensions:
ext_path = os.path.join(scripts_dir, '..', 'extension', ext)
include_package(ext, ext_path, include_files, include_list, source_list)
for src in source_list:
copy_file(src, target_dir)
for inc in include_files:
copy_file(inc, target_dir)
# handle pragma_version.cpp: paste #define DUCKDB_SOURCE_ID and DUCKDB_VERSION there
curdir = os.getcwd()
os.chdir(os.path.join(scripts_dir, '..'))
githash = git_commit_hash()
dev_version = git_dev_version()
dev_v_parts = dev_version.lstrip('v').split('.')
os.chdir(curdir)
# open the file and read the current contents
fpath = os.path.join(target_dir, 'src', 'function', 'table', 'version', 'pragma_version.cpp')
with open_utf8(fpath, 'r') as f:
text = f.read()
# now add the DUCKDB_SOURCE_ID define, if it is not there already
found_hash = False
found_dev = False
found_major = False
found_minor = False
found_patch = False
lines = text.split('\n')
for i in range(len(lines)):
if '#define DUCKDB_SOURCE_ID ' in lines[i]:
lines[i] = '#define DUCKDB_SOURCE_ID "{}"'.format(githash)
found_hash = True
if '#define DUCKDB_VERSION ' in lines[i]:
lines[i] = '#define DUCKDB_VERSION "{}"'.format(dev_version)
found_dev = True
if '#define DUCKDB_MAJOR_VERSION ' in lines[i]:
lines[i] = '#define DUCKDB_MAJOR_VERSION {}'.format(int(dev_v_parts[0]))
found_major = True
if '#define DUCKDB_MINOR_VERSION ' in lines[i]:
lines[i] = '#define DUCKDB_MINOR_VERSION {}'.format(int(dev_v_parts[1]))
found_minor = True
if '#define DUCKDB_PATCH_VERSION ' in lines[i]:
lines[i] = '#define DUCKDB_PATCH_VERSION "{}"'.format(dev_v_parts[2])
found_patch = True
if not found_hash:
lines = ['#ifndef DUCKDB_SOURCE_ID', '#define DUCKDB_SOURCE_ID "{}"'.format(githash), '#endif'] + lines
if not found_dev:
lines = ['#ifndef DUCKDB_VERSION', '#define DUCKDB_VERSION "{}"'.format(dev_version), '#endif'] + lines
if not found_major:
lines = [
'#ifndef DUCKDB_MAJOR_VERSION',
'#define DUCKDB_MAJOR_VERSION {}'.format(int(dev_v_parts[0])),
'#endif',
] + lines
if not found_minor:
lines = [
'#ifndef DUCKDB_MINOR_VERSION',
'#define DUCKDB_MINOR_VERSION {}'.format(int(dev_v_parts[1])),
'#endif',
] + lines
if not found_patch:
lines = [
'#ifndef DUCKDB_PATCH_VERSION',
'#define DUCKDB_PATCH_VERSION "{}"'.format(dev_v_parts[2]),
'#endif',
] + lines
text = '\n'.join(lines)
with open_utf8(fpath, 'w+') as f:
f.write(text)
def file_is_excluded(fname):
for entry in excluded_objects:
if entry in fname:
return True
return False
def generate_unity_build(entries, unity_name, linenumbers):
ub_file = os.path.join(target_dir, unity_name)
with open_utf8(ub_file, 'w+') as f:
for entry in entries:
if linenumbers:
f.write('#line 0 "{}"\n'.format(convert_backslashes(entry)))
f.write('#include "{}"\n\n'.format(convert_backslashes(entry)))
return ub_file
def generate_unity_builds(source_list, nsplits, linenumbers):
files_per_directory = {}
for source in source_list:
dirname = os.path.dirname(source)
if dirname not in files_per_directory:
files_per_directory[dirname] = []
files_per_directory[dirname].append(source)
new_source_files = []
for dirname in files_per_directory.keys():
current_files = files_per_directory[dirname]
cmake_file = os.path.join(dirname, 'CMakeLists.txt')
unity_build = False
if os.path.isfile(cmake_file):
with open(cmake_file, 'r') as f:
text = f.read()
if 'add_library_unity' in text:
unity_build = True
# re-order the files in the unity build so that they follow the same order as the CMake
scores = {}
filenames = [x[0] for x in re.findall('([a-zA-Z0-9_]+[.](cpp|cc|c|cxx))', text)]
score = 0
for filename in filenames:
scores[filename] = score
score += 1
current_files.sort(
key=lambda x: scores[os.path.basename(x)] if os.path.basename(x) in scores else 99999
)
if not unity_build:
if short_paths:
# replace source files with "__"
for file in current_files:
unity_filename = os.path.basename(file)
new_source_files.append(generate_unity_build([file], unity_filename, linenumbers))
else:
# directly use the source files
new_source_files += [os.path.join(folder_name, file) for file in current_files]
else:
unity_base = dirname.replace(os.path.sep, '_')
unity_name = f'ub_{unity_base}.cpp'
new_source_files.append(generate_unity_build(current_files, unity_name, linenumbers))
return new_source_files
original_sources = source_list
source_list = generate_unity_builds(source_list, unity_count, linenumbers)
os.chdir(prev_wd)
return (
[convert_backslashes(x) for x in source_list if not file_is_excluded(x)],
[convert_backslashes(x) for x in include_list],
[convert_backslashes(x) for x in original_sources],
)
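
To make the versioning logic above concrete, a minimal sketch of how git_dev_version() maps a 'git describe' string to a version, assuming MAIN_BRANCH_VERSIONING is True; the describe string is hypothetical:

describe = "v1.3.0-42-gabcdef123"  # hypothetical: tag, commits since tag, short hash
tag, ncommits, _ = describe.split('-')
parts = tag.lstrip('v').split('.')
if int(ncommits) == 0:
    version = 'v' + '.'.join(parts)  # exactly on a tag: plain vX.Y.Z
else:
    parts[1] = str(int(parts[1]) + 1)  # main-branch versioning bumps the minor version
    version = 'v' + '.'.join(parts) + '-dev' + ncommits
print(version)  # -> v1.4.0-dev42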

21
external/duckdb/scripts/parser_test.py vendored Normal file
View File

@@ -0,0 +1,21 @@
from sqllogictest import SQLParserException, SQLLogicParser, SQLLogicTest
from typing import Optional
import argparse
def main():
parser = argparse.ArgumentParser(description="SQL Logic Parser")
parser.add_argument("filename", type=str, help="Path to the SQL logic file")
args = parser.parse_args()
filename = args.filename
parser = SQLLogicParser()
out: Optional[SQLLogicTest] = parser.parse(filename)
if not out:
raise SQLParserException(f"Test {filename} could not be parsed")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,207 @@
import argparse
import glob
import json
import os
import subprocess
import sys
from tqdm import tqdm
OLD_DB_NAME = "old.duckdb"
NEW_DB_NAME = "new.duckdb"
PROFILE_FILENAME = "duckdb_profile.json"
ENABLE_PROFILING = "PRAGMA enable_profiling=json"
PROFILE_OUTPUT = f"PRAGMA profile_output='{PROFILE_FILENAME}'"
BANNER_SIZE = 52
def init_db(cli, dbname, benchmark_dir):
print(f"INITIALIZING {dbname} ...")
subprocess.run(
f"{cli} {dbname} < {benchmark_dir}/init/schema.sql", shell=True, check=True, stdout=subprocess.DEVNULL
)
subprocess.run(f"{cli} {dbname} < {benchmark_dir}/init/load.sql", shell=True, check=True, stdout=subprocess.DEVNULL)
print("INITIALIZATION DONE")
class PlanCost:
def __init__(self):
self.total = 0
self.build_side = 0
self.probe_side = 0
self.time = 0
def __add__(self, other):
self.total += other.total
self.build_side += other.build_side
self.probe_side += other.probe_side
return self
def __gt__(self, other):
if self == other or self.total < other.total:
return False
        # if the total intermediate cardinality is greater, also inspect the time.
        # it's possible a plan reordering increased cardinalities, but the overall execution time
        # was not greatly affected
total_card_increased = self.total > other.total
build_card_increased = self.build_side > other.build_side
if total_card_increased and build_card_increased:
return True
        # we know the total cardinality is either the same or higher, and the build side has not increased.
        # in this case, fall back to the timing: even if the probe side is higher, the plan may still
        # execute faster because those tuples are already in flight
return self.time > other.time * 1.03
def __lt__(self, other):
if self == other:
return False
return not (self > other)
def __eq__(self, other):
return self.total == other.total and self.build_side == other.build_side and self.probe_side == other.probe_side
def is_measured_join(op) -> bool:
if 'name' not in op:
return False
if op['name'] != 'HASH_JOIN':
return False
if 'Join Type' not in op['extra_info']:
return False
if op['extra_info']['Join Type'].startswith('MARK'):
return False
return True
def op_inspect(op) -> PlanCost:
cost = PlanCost()
if 'Query' in op:
cost.time = op['operator_timing']
if is_measured_join(op):
cost.total = op['operator_cardinality']
if 'operator_cardinality' in op['children'][0]:
cost.probe_side += op['children'][0]['operator_cardinality']
if 'operator_cardinality' in op['children'][1]:
cost.build_side += op['children'][1]['operator_cardinality']
left_cost = op_inspect(op['children'][0])
right_cost = op_inspect(op['children'][1])
cost.probe_side += left_cost.probe_side + right_cost.probe_side
cost.build_side += left_cost.build_side + right_cost.build_side
cost.total += left_cost.total + right_cost.total
return cost
for child_op in op['children']:
cost += op_inspect(child_op)
return cost
def query_plan_cost(cli, dbname, query):
try:
subprocess.run(
f"{cli} --readonly {dbname} -c \"{ENABLE_PROFILING};{PROFILE_OUTPUT};{query}\"",
shell=True,
check=True,
capture_output=True,
)
except subprocess.CalledProcessError as e:
print("-------------------------")
print("--------Failure----------")
print("-------------------------")
print(e.stderr.decode('utf8'))
print("-------------------------")
print("--------Output----------")
print("-------------------------")
print(e.output.decode('utf8'))
print("-------------------------")
raise e
with open(PROFILE_FILENAME, 'r') as file:
return op_inspect(json.load(file))
def print_banner(text):
text_len = len(text)
rest = BANNER_SIZE - text_len - 10
l_width = int(rest / 2)
r_width = l_width
if rest % 2 != 0:
l_width += 1
print("")
print("=" * BANNER_SIZE)
print("=" * l_width + " " * 5 + text + " " * 5 + "=" * r_width)
print("=" * BANNER_SIZE)
def print_diffs(diffs):
for query_name, old_cost, new_cost in diffs:
print("")
print("Query:", query_name)
print("Old total cost:", old_cost.total)
print("Old build cost:", old_cost.build_side)
print("Old probe cost:", old_cost.probe_side)
print("New total cost:", new_cost.total)
print("New build cost:", new_cost.build_side)
print("New probe cost:", new_cost.probe_side)
def main():
parser = argparse.ArgumentParser(description="Plan cost regression test script with old and new versions.")
parser.add_argument("--old", type=str, help="Path to the old runner.", required=True)
parser.add_argument("--new", type=str, help="Path to the new runner.", required=True)
parser.add_argument("--dir", type=str, help="Path to the benchmark directory.", required=True)
args = parser.parse_args()
old = args.old
new = args.new
benchmark_dir = args.dir
init_db(old, OLD_DB_NAME, benchmark_dir)
init_db(new, NEW_DB_NAME, benchmark_dir)
improvements = []
regressions = []
files = glob.glob(f"{benchmark_dir}/queries/*.sql")
files.sort()
print("")
print("RUNNING BENCHMARK QUERIES")
for f in tqdm(files):
query_name = f.split("/")[-1].replace(".sql", "")
with open(f, "r") as file:
query = file.read()
old_cost = query_plan_cost(old, OLD_DB_NAME, query)
new_cost = query_plan_cost(new, NEW_DB_NAME, query)
if old_cost > new_cost:
improvements.append((query_name, old_cost, new_cost))
elif new_cost > old_cost:
regressions.append((query_name, old_cost, new_cost))
exit_code = 0
if improvements:
print_banner("IMPROVEMENTS DETECTED")
print_diffs(improvements)
if regressions:
exit_code = 1
print_banner("REGRESSIONS DETECTED")
print_diffs(regressions)
if not improvements and not regressions:
print_banner("NO DIFFERENCES DETECTED")
os.remove(OLD_DB_NAME)
os.remove(NEW_DB_NAME)
os.remove(PROFILE_FILENAME)
exit(exit_code)
if __name__ == "__main__":
main()
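
A worked example of the comparison rule implemented by PlanCost.__gt__ above, with hypothetical cardinalities and timings:

old_total, old_build, old_time = 1000, 200, 0.50  # hypothetical old plan
new_total, new_build, new_time = 1500, 450, 0.52  # hypothetical new plan
if new_total < old_total:
    regression = False
elif new_total > old_total and new_build > old_build:
    regression = True  # both the total and build-side cardinality grew
else:
    regression = new_time > old_time * 1.03  # otherwise fall back to a 3% timing margin
print(regression)  # -> True for these numbers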

View File

@@ -0,0 +1,23 @@
def open_utf8(fpath, flags):
import sys
if sys.version_info[0] < 3:
return open(fpath, flags)
else:
return open(fpath, flags, encoding="utf8")
def normalize_path(path):
import os
def normalize(p):
return os.path.sep.join(p.split('/'))
if isinstance(path, list):
normed = map(lambda p: normalize(p), path)
return list(normed)
if isinstance(path, str):
return normalize(path)
raise Exception("Can only be called with a str or list argument")

View File

@@ -0,0 +1,17 @@
SET(CMAKE_SYSTEM_NAME Linux)
# Define our host system
SET(CMAKE_SYSTEM_NAME Linux)
SET(CMAKE_SYSTEM_VERSION 1)
# Define the cross compiler locations
SET(CMAKE_C_COMPILER ${DUCKDB_RPI_TOOLCHAIN_PREFIX}/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/arm-linux-gnueabihf-gcc)
SET(CMAKE_CXX_COMPILER ${DUCKDB_RPI_TOOLCHAIN_PREFIX}/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/arm-linux-gnueabihf-gcc)
# Define the sysroot path for the RaspberryPi distribution in our tools folder
SET(CMAKE_FIND_ROOT_PATH ${DUCKDB_RPI_TOOLCHAIN_PREFIX}/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/arm-linux-gnueabihf/sysroot/)
# Use our definitions for compiler tools
SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
# Search for libraries and headers in the target directories only
SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
SET(DUCKDB_EXTRA_LINK_FLAGS -lstdc++ -lgcc -lm)

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,193 @@
import subprocess
import statistics
from io import StringIO
import csv
from dataclasses import dataclass
import argparse
from typing import Optional, Union, Tuple, List
import functools
print = functools.partial(print, flush=True)
STDERR_HEADER = '''====================================================
============== STDERR =============
====================================================
'''
STDOUT_HEADER = '''====================================================
============== STDOUT =============
====================================================
'''
# timeouts in seconds
MAX_TIMEOUT = 3600
DEFAULT_TIMEOUT = 600
@dataclass
class BenchmarkRunnerConfig:
"Configuration for a BenchmarkRunner"
benchmark_runner: str
benchmark_file: str
verbose: bool = False
threads: Optional[int] = None
memory_limit: Optional[str] = None
disable_timeout: bool = False
max_timeout: int = MAX_TIMEOUT
root_dir: str = ""
no_summary: bool = False
@classmethod
def from_params(cls, benchmark_runner, benchmark_file, **kwargs) -> "BenchmarkRunnerConfig":
verbose = kwargs.get("verbose", False)
threads = kwargs.get("threads", None)
memory_limit = kwargs.get("memory_limit", None)
disable_timeout = kwargs.get("disable_timeout", False)
max_timeout = kwargs.get("max_timeout", MAX_TIMEOUT)
root_dir = kwargs.get("root_dir", "")
no_summary = kwargs.get("no_summary", False)
config = cls(
benchmark_runner=benchmark_runner,
benchmark_file=benchmark_file,
verbose=verbose,
threads=threads,
memory_limit=memory_limit,
disable_timeout=disable_timeout,
max_timeout=max_timeout,
root_dir=root_dir,
no_summary=no_summary,
)
return config
@classmethod
def from_args(cls) -> "BenchmarkRunnerConfig":
parser = argparse.ArgumentParser(description="Benchmark script with old and new runners.")
# Define the arguments
parser.add_argument("--path", type=str, help="Path to the benchmark_runner executable", required=True)
parser.add_argument("--benchmarks", type=str, help="Path to the benchmark file.", required=True)
parser.add_argument("--verbose", action="store_true", help="Enable verbose output.")
parser.add_argument("--threads", type=int, help="Number of threads to use.")
parser.add_argument("--memory_limit", type=str, help="Memory limit to use.")
parser.add_argument("--disable-timeout", action="store_true", help="Disable timeout.")
parser.add_argument(
"--max-timeout", type=int, default=3600, help="Set maximum timeout in seconds (default: 3600)."
)
parser.add_argument("--root-dir", type=str, default="", help="Root directory.")
parser.add_argument(
"--no-summary", type=str, default=False, help="No failures summary is outputed when passing this flag."
)
# Parse arguments
parsed_args = parser.parse_args()
# Create an instance of BenchmarkRunnerConfig using parsed arguments
config = cls(
benchmark_runner=parsed_args.path,
benchmark_file=parsed_args.benchmarks,
verbose=parsed_args.verbose,
threads=parsed_args.threads,
memory_limit=parsed_args.memory_limit,
disable_timeout=parsed_args.disable_timeout,
max_timeout=parsed_args.max_timeout,
root_dir=parsed_args.root_dir,
no_summary=parsed_args.no_summary,
)
return config
class BenchmarkRunner:
def __init__(self, config: BenchmarkRunnerConfig):
self.config = config
self.complete_timings = []
self.benchmark_list: List[str] = []
with open(self.config.benchmark_file, 'r') as f:
self.benchmark_list = [x.strip() for x in f.read().split('\n') if len(x) > 0]
def construct_args(self, benchmark_path):
benchmark_args = []
benchmark_args.extend([self.config.benchmark_runner, benchmark_path])
if self.config.root_dir:
benchmark_args.extend(['--root-dir', self.config.root_dir])
if self.config.threads:
benchmark_args.extend([f"--threads={self.config.threads}"])
if self.config.memory_limit:
benchmark_args.extend([f"--memory_limit={self.config.memory_limit}"])
if self.config.disable_timeout:
benchmark_args.extend(["--disable-timeout"])
if self.config.no_summary:
benchmark_args.extend(["--no-summary"])
return benchmark_args
def run_benchmark(self, benchmark) -> Tuple[Union[float, str], Optional[str]]:
benchmark_args = self.construct_args(benchmark)
timeout_seconds = DEFAULT_TIMEOUT
if self.config.disable_timeout:
timeout_seconds = self.config.max_timeout
try:
proc = subprocess.run(
benchmark_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout_seconds
)
out = proc.stdout.decode('utf8')
err = proc.stderr.decode('utf8')
returncode = proc.returncode
except subprocess.TimeoutExpired:
print("Failed to run benchmark " + benchmark)
print(f"Aborted due to exceeding the limit of {timeout_seconds} seconds")
return (
'Failed to run benchmark ' + benchmark,
f"Aborted due to exceeding the limit of {timeout_seconds} seconds",
)
if returncode != 0:
print("Failed to run benchmark " + benchmark)
print(STDERR_HEADER)
print(err)
print(STDOUT_HEADER)
print(out)
if 'HTTP' in err:
print("Ignoring HTTP error and terminating the running of the regression tests")
exit(0)
return 'Failed to run benchmark ' + benchmark, err
if self.config.verbose:
print(err)
# read the input CSV
f = StringIO(err)
csv_reader = csv.reader(f, delimiter='\t')
header = True
timings = []
try:
for row in csv_reader:
if len(row) == 0:
continue
if header:
header = False
else:
                    timings.append(float(row[2]))
                    self.complete_timings.append(float(row[2]))
return float(statistics.median(timings)), None
except:
print("Failed to run benchmark " + benchmark)
print(err)
return 'Failed to run benchmark ' + benchmark, err
def run_benchmarks(self, benchmark_list: List[str]):
results = {}
failures = {}
for benchmark in benchmark_list:
result, failure_message = self.run_benchmark(benchmark)
results[benchmark] = result
failures[benchmark] = failure_message if failure_message else None
return results, failures
def main():
config = BenchmarkRunnerConfig.from_args()
runner = BenchmarkRunner(config)
    runner.run_benchmarks(runner.benchmark_list)
if __name__ == "__main__":
main()
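
A minimal usage sketch, assuming this module is importable as 'benchmark' (as the regression runner later in this commit does); the runner path and benchmark list file are hypothetical:

from benchmark import BenchmarkRunner, BenchmarkRunnerConfig

config = BenchmarkRunnerConfig.from_params(
    "build/release/benchmark/benchmark_runner",  # hypothetical path to the benchmark_runner binary
    "regression_micro.csv",  # hypothetical benchmark list, one benchmark path per line
    threads=4,
)
runner = BenchmarkRunner(config)
results, failures = runner.run_benchmarks(runner.benchmark_list)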

View File

@@ -0,0 +1,227 @@
import os
import math
import functools
import shutil
from benchmark import BenchmarkRunner, BenchmarkRunnerConfig
from dataclasses import dataclass
from typing import Optional, List, Union
import subprocess
print = functools.partial(print, flush=True)
def is_number(s):
try:
float(s)
return True
except ValueError:
return False
# Geometric mean of an array of numbers
def geomean(xs):
if len(xs) == 0:
return 'EMPTY'
for entry in xs:
if not is_number(entry):
return entry
return math.exp(math.fsum(math.log(float(x)) for x in xs) / len(xs))
import argparse
# Set up the argument parser
parser = argparse.ArgumentParser(description="Benchmark script with old and new runners.")
# Define the arguments
parser.add_argument("--old", type=str, help="Path to the old runner.", required=True)
parser.add_argument("--new", type=str, help="Path to the new runner.", required=True)
parser.add_argument("--benchmarks", type=str, help="Path to the benchmark file.", required=True)
parser.add_argument("--verbose", action="store_true", help="Enable verbose output.")
parser.add_argument("--threads", type=int, help="Number of threads to use.")
parser.add_argument("--memory_limit", type=str, help="Memory limit to use.")
parser.add_argument("--nofail", action="store_true", help="Do not fail on regression.")
parser.add_argument("--disable-timeout", action="store_true", help="Disable timeout.")
parser.add_argument("--max-timeout", type=int, default=3600, help="Set maximum timeout in seconds (default: 3600).")
parser.add_argument("--root-dir", type=str, default="", help="Root directory.")
parser.add_argument("--no-summary", type=str, default=False, help="No summary in the end.")
parser.add_argument(
"--regression-threshold-seconds",
type=float,
default=0.05,
help="REGRESSION_THRESHOLD_SECONDS value for large benchmarks.",
)
# Parse the arguments
args = parser.parse_args()
# Assign parsed arguments to variables
old_runner_path = args.old
new_runner_path = args.new
benchmark_file = args.benchmarks
verbose = args.verbose
threads = args.threads
memory_limit = args.memory_limit
no_regression_fail = args.nofail
disable_timeout = args.disable_timeout
max_timeout = args.max_timeout
root_dir = args.root_dir
no_summary = args.no_summary
regression_threshold_seconds = args.regression_threshold_seconds
# how many times we will run the experiment, to be sure of the regression
NUMBER_REPETITIONS = 5
# the threshold at which we consider something a regression (percentage)
REGRESSION_THRESHOLD_PERCENTAGE = 0.1
# minimal seconds diff for something to be a regression (for very fast benchmarks)
REGRESSION_THRESHOLD_SECONDS = regression_threshold_seconds
if not os.path.isfile(old_runner_path):
print(f"Failed to find old runner {old_runner_path}")
exit(1)
if not os.path.isfile(new_runner_path):
print(f"Failed to find new runner {new_runner_path}")
exit(1)
config_dict = vars(args)
old_runner = BenchmarkRunner(BenchmarkRunnerConfig.from_params(old_runner_path, benchmark_file, **config_dict))
new_runner = BenchmarkRunner(BenchmarkRunnerConfig.from_params(new_runner_path, benchmark_file, **config_dict))
benchmark_list = old_runner.benchmark_list
summary = []
@dataclass
class BenchmarkResult:
benchmark: str
old_result: Union[float, str]
new_result: Union[float, str]
old_failure: Optional[str] = None
new_failure: Optional[str] = None
multiply_percentage = 1.0 + REGRESSION_THRESHOLD_PERCENTAGE
other_results: List[BenchmarkResult] = []
error_list: List[BenchmarkResult] = []
for i in range(NUMBER_REPETITIONS):
regression_list: List[BenchmarkResult] = []
if len(benchmark_list) == 0:
break
print(
f'''====================================================
============== ITERATION {i} =============
============== REMAINING {len(benchmark_list)} =============
====================================================
'''
)
old_results, old_failures = old_runner.run_benchmarks(benchmark_list)
new_results, new_failures = new_runner.run_benchmarks(benchmark_list)
for benchmark in benchmark_list:
old_res = old_results[benchmark]
new_res = new_results[benchmark]
old_fail = old_failures[benchmark]
new_fail = new_failures[benchmark]
if isinstance(old_res, str) or isinstance(new_res, str):
# benchmark failed to run - always a regression
error_list.append(BenchmarkResult(benchmark, old_res, new_res, old_fail, new_fail))
elif (no_regression_fail == False) and (
(old_res + REGRESSION_THRESHOLD_SECONDS) * multiply_percentage < new_res
):
regression_list.append(BenchmarkResult(benchmark, old_res, new_res))
else:
other_results.append(BenchmarkResult(benchmark, old_res, new_res))
benchmark_list = [res.benchmark for res in regression_list]
exit_code = 0
regression_list.extend(error_list)
summary = []
if len(regression_list) > 0:
exit_code = 1
print(
'''====================================================
============== REGRESSIONS DETECTED =============
====================================================
'''
)
for regression in regression_list:
print(f"{regression.benchmark}")
print(f"Old timing: {regression.old_result}")
print(f"New timing: {regression.new_result}")
if regression.old_failure or regression.new_failure:
new_data = {
"benchmark": regression.benchmark,
"old_failure": regression.old_failure,
"new_failure": regression.new_failure,
}
summary.append(new_data)
print("")
print(
'''====================================================
============== OTHER TIMINGS =============
====================================================
'''
)
else:
print(
'''====================================================
============== NO REGRESSIONS DETECTED =============
====================================================
'''
)
other_results.sort(key=lambda x: x.benchmark)
for res in other_results:
print(f"{res.benchmark}")
print(f"Old timing: {res.old_result}")
print(f"New timing: {res.new_result}")
print("")
time_a = geomean(old_runner.complete_timings)
time_b = geomean(new_runner.complete_timings)
print("")
if isinstance(time_a, str) or isinstance(time_b, str):
print(f"Old: {time_a}")
print(f"New: {time_b}")
elif time_a > time_b * 1.01:
print(f"Old timing geometric mean: {time_a}")
print(f"New timing geometric mean: {time_b}, roughly {int((time_a - time_b) * 100.0 / time_a)}% faster")
elif time_b > time_a * 1.01:
print(f"Old timing geometric mean: {time_a}, roughly {int((time_b - time_a) * 100.0 / time_b)}% faster")
print(f"New timing geometric mean: {time_b}")
else:
print(f"Old timing geometric mean: {time_a}")
print(f"New timing geometric mean: {time_b}")
# nuke cached benchmark data between runs
if os.path.isdir("duckdb_benchmark_data"):
shutil.rmtree('duckdb_benchmark_data')
if summary and not no_summary:
print(
'''\n\n====================================================
================ FAILURES SUMMARY ================
====================================================
'''
)
    # only add the GitHub Actions '::error::' prefix when CI is set to 'true'; otherwise it would clutter local run output
prefix = "::error::" if ('CI' in os.environ and os.getenv('CI') == 'true') else ""
for i, failure_message in enumerate(summary, start=1):
prefix_str = f"{prefix}{i}" if len(prefix) > 0 else f"{i}"
print(f"{prefix_str}: ", failure_message["benchmark"])
if failure_message["old_failure"] != failure_message["new_failure"]:
print("Old:\n", failure_message["old_failure"])
print("New:\n", failure_message["new_failure"])
else:
print(failure_message["old_failure"])
print("-", 52)
exit(exit_code)
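
A worked example of the regression test applied above, using hypothetical timings and the default thresholds:

REGRESSION_THRESHOLD_SECONDS = 0.05  # default, see --regression-threshold-seconds
REGRESSION_THRESHOLD_PERCENTAGE = 0.1
old_res, new_res = 1.00, 1.20  # hypothetical median timings in seconds
threshold = (old_res + REGRESSION_THRESHOLD_SECONDS) * (1.0 + REGRESSION_THRESHOLD_PERCENTAGE)
print(threshold)  # 1.155
print(new_res > threshold)  # True -> flagged, then re-measured on the next of the 5 iterations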

View File

@@ -0,0 +1,115 @@
import os
import sys
import duckdb
import numpy
import subprocess
from io import StringIO
import csv
import statistics
old_file = None
new_file = None
# the threshold at which we consider something a regression (percentage)
regression_threshold_percentage = 0.1
# minimal seconds diff for something to be a regression (for very fast benchmarks)
regression_threshold_seconds = 0.01
for arg in sys.argv:
if arg.startswith("--old="):
old_file = arg.replace("--old=", "")
elif arg.startswith("--new="):
new_file = arg.replace("--new=", "")
if old_file is None or new_file is None:
print("Usage: python scripts/regression_check.py --old=<old_file> --new-<new_file>")
exit(1)
con = duckdb.connect()
old_timings_l = con.execute(
f"SELECT name, median(time) FROM read_csv_auto('{old_file}') t(name, nrun, time) GROUP BY ALL ORDER BY ALL"
).fetchall()
new_timings_l = con.execute(
f"SELECT name, median(time) FROM read_csv_auto('{new_file}') t(name, nrun, time) GROUP BY ALL ORDER BY ALL"
).fetchall()
old_timings = {}
new_timings = {}
for entry in old_timings_l:
name = entry[0]
timing = entry[1]
old_timings[name] = timing
for entry in new_timings_l:
name = entry[0]
timing = entry[1]
new_timings[name] = timing
slow_keys = []
multiply_percentage = 1.0 + regression_threshold_percentage
test_keys = list(new_timings.keys())
test_keys.sort()
for key in test_keys:
new_timing = new_timings[key]
old_timing = old_timings[key]
if (old_timing + regression_threshold_seconds) * multiply_percentage < new_timing:
slow_keys.append(key)
return_code = 0
if len(slow_keys) > 0:
print(
'''====================================================
============== REGRESSIONS DETECTED =============
====================================================
'''
)
return_code = 1
for key in slow_keys:
new_timing = new_timings[key]
old_timing = old_timings[key]
print(key)
print(f"Old timing: {old_timing}")
print(f"New timing: {new_timing}")
print("")
print(
'''====================================================
================== New Timings ==================
====================================================
'''
)
with open(new_file, 'r') as f:
print(f.read())
print(
'''====================================================
================== Old Timings ==================
====================================================
'''
)
with open(old_file, 'r') as f:
print(f.read())
else:
print(
'''====================================================
============== NO REGRESSIONS DETECTED =============
====================================================
'''
)
print(
'''====================================================
=================== ALL TIMINGS ===================
====================================================
'''
)
for key in test_keys:
new_timing = new_timings[key]
old_timing = old_timings[key]
print(key)
print(f"Old timing: {old_timing}")
print(f"New timing: {new_timing}")
print("")
exit(return_code)
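
A minimal sketch of the timing files this script consumes: three columns per line (benchmark name, run number, time in seconds) with no header. The delimiter is sniffed by read_csv_auto; the tab-separated layout below matches what the benchmark scripts elsewhere in this commit write, and the file name and numbers are hypothetical:

with open('old.csv', 'w') as f:
    f.write("benchmark/micro/q01.benchmark\t0\t0.42\n")
    f.write("benchmark/micro/q01.benchmark\t1\t0.40\n")
    f.write("benchmark/micro/q02.benchmark\t0\t1.10\n")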

View File

@@ -0,0 +1,80 @@
import os
import argparse
import subprocess
import tempfile
from pathlib import Path
# the threshold at which we consider something a regression (percentage)
regression_threshold_percentage = 0.20
parser = argparse.ArgumentParser(description='Check extension binary sizes for regressions between two builds.')
parser.add_argument(
'--old', dest='old_extension_dir', action='store', help='Path to the old extension dir', required=True
)
parser.add_argument(
'--new', dest='new_extension_dir', action='store', help='Path to the new extension dir', required=True
)
parser.add_argument(
'--expect',
dest='expected_extensions_raw',
action='store',
help='Comma separated list of expected extensions',
required=True,
)
args = parser.parse_args()
expected_extensions = args.expected_extensions_raw.split(',')
exit_code = 0
def parse_extensions(dir):
result = {}
for root, dirs, files in os.walk(dir):
for filename in files:
if filename.endswith(".duckdb_extension"):
result[Path(filename).stem] = os.path.join(root, filename)
# Check all expected extensions are there
for expected_extension in expected_extensions:
if expected_extension not in result.keys():
print(f"Did not find expected extension {expected_extension} in {dir}")
exit(1)
return result
old_extensions = parse_extensions(args.old_extension_dir)
new_extensions = parse_extensions(args.new_extension_dir)
matching_extensions = []
for extension in old_extensions.keys():
if extension in new_extensions:
matching_extensions.append(extension)
check_passed = True
error_message = ""
for extension in matching_extensions:
old_size = os.path.getsize(old_extensions[extension])
new_size = os.path.getsize(new_extensions[extension])
print(f" - checking '{extension}': old size={old_size}, new_size={new_size}")
if new_size / (old_size + 0.1) > (1.0 + regression_threshold_percentage):
check_passed = False
error_message += f" - Extension '{extension}' was bigger than expected {new_size}\n"
error_message += f" - old size: {old_size}\n"
error_message += f" - new size: {new_size}\n"
print()
if not check_passed:
print("Extension size regression check failed:\n")
print(error_message)
exit(1)
else:
print(f"All extensions passed the check!")

View File

@@ -0,0 +1,402 @@
import os
import sys
import duckdb
import pandas as pd
import pyarrow as pa
import time
import argparse
from typing import Dict, List, Any
import numpy as np
TPCH_QUERIES = []
res = duckdb.execute(
"""
select query from tpch_queries()
"""
).fetchall()
for x in res:
TPCH_QUERIES.append(x[0])
parser = argparse.ArgumentParser()
parser.add_argument("--verbose", action="store_true", help="Enable verbose mode", default=False)
parser.add_argument("--threads", type=int, help="Number of threads", default=None)
parser.add_argument("--nruns", type=int, help="Number of runs", default=10)
parser.add_argument("--out-file", type=str, help="Output file path", default=None)
parser.add_argument("--scale-factor", type=float, help="Set the scale factor TPCH is generated at", default=1.0)
args, unknown_args = parser.parse_known_args()
verbose = args.verbose
threads = args.threads
nruns = args.nruns
out_file = args.out_file
scale_factor = args.scale_factor
if unknown_args:
parser.error(f"Unrecognized parameter(s): {', '.join(unknown_args)}")
def print_msg(message: str):
if not verbose:
return
print(message)
def write_result(benchmark_name, nrun, t):
bench_result = f"{benchmark_name}\t{nrun}\t{t}"
if out_file is not None:
if not hasattr(write_result, 'file'):
write_result.file = open(out_file, 'w+')
write_result.file.write(bench_result)
write_result.file.write('\n')
else:
print_msg(bench_result)
def close_result():
if not hasattr(write_result, 'file'):
return
write_result.file.close()
class BenchmarkResult:
def __init__(self, name):
self.name = name
self.runs: List[float] = []
def add(self, duration: float):
self.runs.append(duration)
def write(self):
for i, run in enumerate(self.runs):
write_result(self.name, i, run)
class TPCHData:
TABLES = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"]
def __init__(self, scale_factor):
self.conn = duckdb.connect()
self.conn.execute(f'CALL dbgen(sf={scale_factor})')
def get_tables(self, convertor) -> Dict[str, Any]:
res = {}
for table in self.TABLES:
res[table] = convertor(self.conn, table)
return res
def load_lineitem(self, collector, benchmark_name) -> BenchmarkResult:
query = 'SELECT * FROM lineitem'
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
start = time.time()
rel = self.conn.sql(query)
res = collector(rel)
end = time.time()
duration = float(end - start)
del res
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
return result
class TPCHBenchmarker:
def __init__(self, name: str):
self.initialize_connection()
self.name = name
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def register_tables(self, tables: Dict[str, Any]):
for name, table in tables.items():
self.con.register(name, table)
def run_tpch(self, collector, benchmark_name) -> BenchmarkResult:
print_msg("")
print_msg(TPCH_QUERIES)
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
# Execute all queries
for i, query in enumerate(TPCH_QUERIES):
start = time.time()
rel = self.con.sql(query)
if rel:
res = collector(rel)
del res
else:
print_msg(f"Query '{query}' did not produce output")
end = time.time()
query_time = float(end - start)
print_msg(f"Q{str(i).ljust(len(str(nruns)), ' ')}: {query_time}")
duration += float(end - start)
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
return result
def test_tpch():
print_msg(f"Generating TPCH (sf={scale_factor})")
tpch = TPCHData(scale_factor)
## -------- Benchmark converting LineItem to different formats ---------
def fetch_native(rel: duckdb.DuckDBPyRelation):
return rel.fetchall()
def fetch_pandas(rel: duckdb.DuckDBPyRelation):
return rel.df()
def fetch_arrow(rel: duckdb.DuckDBPyRelation):
return rel.arrow()
COLLECTORS = {'native': fetch_native, 'pandas': fetch_pandas, 'arrow': fetch_arrow}
# For every collector, load lineitem 'nrun' times
for collector in COLLECTORS:
result: BenchmarkResult = tpch.load_lineitem(COLLECTORS[collector], collector + "_load_lineitem")
print_msg(result.name)
print_msg(collector)
result.write()
## ------- Benchmark running TPCH queries on top of different formats --------
def convert_pandas(conn: duckdb.DuckDBPyConnection, table_name: str):
return conn.execute(f"SELECT * FROM {table_name}").df()
def convert_arrow(conn: duckdb.DuckDBPyConnection, table_name: str):
df = convert_pandas(conn, table_name)
return pa.Table.from_pandas(df)
CONVERTORS = {'pandas': convert_pandas, 'arrow': convert_arrow}
# Convert TPCH data to the right format, then run TPCH queries on that data
for convertor in CONVERTORS:
tables = tpch.get_tables(CONVERTORS[convertor])
tester = TPCHBenchmarker(convertor)
tester.register_tables(tables)
collector = COLLECTORS[convertor]
result: BenchmarkResult = tester.run_tpch(collector, f"{convertor}tpch")
result.write()
def generate_string(seed: int):
output = ''
for _ in range(10):
output += chr(ord('A') + int(seed % 26))
seed /= 26
return output
class ArrowDictionary:
def __init__(self, unique_values):
self.size = unique_values
self.dict = [generate_string(x) for x in range(unique_values)]
class ArrowDictionaryBenchmark:
def __init__(self, unique_values, values, arrow_dict: ArrowDictionary):
assert unique_values <= arrow_dict.size
self.initialize_connection()
self.generate(unique_values, values, arrow_dict)
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def generate(self, unique_values, values, arrow_dict: ArrowDictionary):
self.input = []
self.expected = []
for x in range(values):
value = arrow_dict.dict[x % unique_values]
self.input.append(value)
self.expected.append((value,))
array = pa.array(
self.input,
type=pa.dictionary(pa.int64(), pa.string()),
)
self.table = pa.table([array], names=["x"])
def benchmark(self, benchmark_name) -> BenchmarkResult:
self.con.register('arrow_table', self.table)
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
start = time.time()
res = self.con.execute(
"""
select * from arrow_table
"""
).fetchall()
end = time.time()
duration = float(end - start)
assert self.expected == res
del res
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
return result
class SelectAndCallBenchmark:
def __init__(self):
"""
SELECT statements become QueryRelations, any other statement type becomes a MaterializedRelation.
We use SELECT and CALL here because their execution plans are identical
"""
self.initialize_connection()
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def benchmark(self, name, query) -> List[BenchmarkResult]:
results: List[BenchmarkResult] = []
methods = {'select': 'select * from ', 'call': 'call '}
for key, value in methods.items():
for rowcount in [2048, 50000, 2500000]:
result = BenchmarkResult(f'{key}_{name}_{rowcount}')
query_string = query.format(rows=rowcount)
query_string = value + query_string
rel = self.con.sql(query_string)
print_msg(rel.type)
for _ in range(nruns):
duration = 0.0
start = time.time()
rel.fetchall()
end = time.time()
duration = float(end - start)
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
results.append(result)
return results
class PandasDFLoadBenchmark:
def __init__(self):
self.initialize_connection()
self.generate()
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def generate(self):
self.con.execute("call dbgen(sf=0.1)")
new_table = "*, " + ", ".join(["l_shipdate"] * 300)
self.con.execute(f"create table wide as select {new_table} from lineitem limit 500")
self.con.execute(f"copy wide to 'wide_table.csv' (FORMAT CSV)")
def benchmark(self, benchmark_name) -> BenchmarkResult:
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
pandas_df = pd.read_csv('wide_table.csv')
start = time.time()
for _ in range(30):
res = self.con.execute("""select * from pandas_df""").df()
end = time.time()
duration = float(end - start)
del res
result.add(duration)
return result
class PandasAnalyzerBenchmark:
def __init__(self):
self.initialize_connection()
self.generate()
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def generate(self):
return
def benchmark(self, benchmark_name) -> BenchmarkResult:
result = BenchmarkResult(benchmark_name)
data = [None] * 9999999 + [1] # Last element is 1, others are None
# Create the DataFrame with the specified data and column type as object
pandas_df = pd.DataFrame(data, columns=['Column'], dtype=object)
for _ in range(nruns):
duration = 0.0
start = time.time()
for _ in range(30):
res = self.con.execute("""select * from pandas_df""").df()
end = time.time()
duration = float(end - start)
del res
result.add(duration)
return result
def test_arrow_dictionaries_scan():
DICT_SIZE = 26 * 1000
print_msg(f"Generating a unique dictionary of size {DICT_SIZE}")
arrow_dict = ArrowDictionary(DICT_SIZE)
DATASET_SIZE = 10000000
for unique_values in [2, 1000, DICT_SIZE]:
test = ArrowDictionaryBenchmark(unique_values, DATASET_SIZE, arrow_dict)
benchmark_name = f"arrow_dict_unique_{unique_values}_total_{DATASET_SIZE}"
result = test.benchmark(benchmark_name)
result.write()
def test_loading_pandas_df_many_times():
test = PandasDFLoadBenchmark()
benchmark_name = f"load_pandas_df_many_times"
result = test.benchmark(benchmark_name)
result.write()
def test_pandas_analyze():
test = PandasAnalyzerBenchmark()
benchmark_name = f"pandas_analyze"
result = test.benchmark(benchmark_name)
result.write()
def test_call_and_select_statements():
test = SelectAndCallBenchmark()
queries = {
'repeat_row': "repeat_row(42, 'test', True, 'this is a long string', num_rows={rows})",
}
for key, value in queries.items():
results = test.benchmark(key, value)
for res in results:
res.write()
def main():
test_tpch()
test_arrow_dictionaries_scan()
test_loading_pandas_df_many_times()
test_pandas_analyze()
test_call_and_select_statements()
close_result()
if __name__ == '__main__':
main()
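
A minimal sketch of consuming the --out-file written by write_result above; the file name is hypothetical, and each line holds the benchmark name, run index, and duration, tab-separated:

import csv

with open('timings.tsv') as f:  # hypothetical --out-file path
    for name, nrun, seconds in csv.reader(f, delimiter='\t'):
        print(name, int(nrun), float(seconds))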

View File

@@ -0,0 +1,87 @@
import os
import argparse
import subprocess
import tempfile
# the threshold at which we consider something a regression (percentage)
regression_threshold_percentage = 0.05
parser = argparse.ArgumentParser(description='Check database file size regressions between two DuckDB shell builds.')
parser.add_argument('--old', dest='old_runner', action='store', help='Path to the old shell executable')
parser.add_argument('--new', dest='new_runner', action='store', help='Path to the new shell executable')
args = parser.parse_args()
old_runner = args.old_runner
new_runner = args.new_runner
exit_code = 0
if not os.path.isfile(old_runner):
print(f"Failed to find old runner {old_runner}")
exit(1)
if not os.path.isfile(new_runner):
print(f"Failed to find new runner {new_runner}")
exit(1)
def load_data(shell_path, load_script):
with tempfile.NamedTemporaryFile() as f:
filename = f.name
proc = subprocess.Popen(
[
shell_path,
'-storage_version',
'latest',
'-c',
"set storage_compatibility_version='latest'",
'-c',
load_script,
filename,
]
)
proc.wait()
if proc.returncode != 0:
print('----------------------------')
print('FAILED TO RUN')
print('----------------------------')
return None
return os.path.getsize(filename)
def run_benchmark(load_script, benchmark_name):
print('----------------------------')
print(f'Running benchmark {benchmark_name}')
print('----------------------------')
old_size = load_data(old_runner, load_script)
if old_size is None:
return False
new_size = load_data(new_runner, load_script)
if new_size is None:
return False
print(f'Database size with old runner: {old_size}')
print(f'Database size with new runner: {new_size}')
if new_size - new_size * regression_threshold_percentage > old_size:
print('----------------------------')
print('FAILURE: SIZE INCREASE')
print('----------------------------')
return False
else:
print('----------------------------')
print('SUCCESS!')
print('----------------------------')
return True
tpch_load = 'CALL dbgen(sf=1);'
tpcds_load = 'CALL dsdgen(sf=1);'
benchmarks = [[tpch_load, 'TPC-H SF1'], [tpcds_load, 'TPC-DS SF1']]
for benchmark in benchmarks:
if not run_benchmark(benchmark[0], benchmark[1]):
print(f'Database size increased in {benchmark[1]}')
exit_code = 1
exit(exit_code)
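
A worked example of the size rule above, with hypothetical database sizes and the 5% threshold:

regression_threshold_percentage = 0.05
old_size, new_size = 1_000_000_000, 1_080_000_000  # hypothetical database file sizes in bytes
print(new_size - new_size * regression_threshold_percentage > old_size)  # True: ~8% growth is flagged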

View File

@@ -0,0 +1,48 @@
library(tidyverse)
here <- rprojroot::is_git_root$find_file
# build/debug/test/unittest -d yes 2>&1 > timings.txt
timings <- readLines(here("timings.txt"))
timings
timings_df <- rematch2::re_match(timings, "^.*(?<time>[0-9][.][0-9][0-9][0-9]) s: (?<desc>.*)$")
cum_timings_df <-
timings_df %>%
filter(!is.na(time)) %>%
mutate(time = as.numeric(time)) %>%
count(desc, wt = time, name = "time") %>%
arrange(time) %>%
mutate(cum_time = cumsum(time), id = row_number())
cum_timings_df %>%
ggplot(aes(x = time, y = cum_time, color = id)) +
geom_line() +
scale_x_log10()
cum_timings_df %>%
ggplot(aes(x = id, y = cum_time, color = time)) +
geom_line() +
scale_colour_continuous(trans = "log10")
cum_timings_cut <-
cum_timings_df %>%
filter(cum_time >= 200, str_detect(desc, "[.]test$"))
slow <- cum_timings_cut$desc
slow_renamed <- paste0(slow, "_coverage")
slow_renamed[fs::file_exists(here(slow_renamed))]
stopifnot(!any(fs::file_exists(here(slow_renamed))))
withr::with_dir(
here(),
fs::file_move(slow, slow_renamed)
)
walk2(slow_renamed, slow, ~ {
text <- brio::read_lines(here(.x))
text <- str_replace_all(text, fixed(.y), .x)
brio::write_lines(text, here(.x))
})

View File

@@ -0,0 +1,20 @@
import os
import sys
import time
if len(sys.argv) <= 1:
print("Expected usage: python3 repeat_until_success.py [command]")
exit(1)
ntries = 10
sleep_duration = 3
cmd = sys.argv[1]
for i in range(ntries):
ret = os.system(cmd)
if ret is None or ret == 0:
exit(0)
print("Command {{ " + cmd + " }} failed, retrying (" + str(i + 1) + "/" + str(ntries) + ")")
time.sleep(sleep_duration)
exit(1)
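
A usage sketch, assuming a hypothetical flaky command; because the script executes only sys.argv[1], the whole command must be passed as a single quoted argument:

import os

# hypothetical example: retry a flaky download up to 10 times with 3-second pauses
os.system('python3 repeat_until_success.py "curl -sSfO https://example.com/artifact.zip"')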

View File

@@ -0,0 +1,62 @@
import subprocess
import duckdb
import os
import pandas as pd
import argparse
from io import StringIO
parser = argparse.ArgumentParser(description='Rerun failed workflows from a PR.')
parser.add_argument(
'--title',
dest='title',
action='store',
help='The title of the PR for which we want to rerun workflows (or part of the title)',
required=True,
)
parser.add_argument(
'--repo', dest='repo', action='store', help='The repository to run this workflow on', default='duckdb/duckdb'
)
parser.add_argument(
'--max_workflows',
dest='max_workflows',
action='store',
help='The maximum number of workflows to look at (starting from the latest)',
default=200,
)
args = parser.parse_args()
nlimit = args.max_workflows
query = args.title
proc = subprocess.Popen(
[
'gh',
'run',
'-R',
args.repo,
'list',
'--json',
'displayTitle,databaseId,status,conclusion,headSha',
f'--limit={nlimit}',
],
stdout=subprocess.PIPE,
)
text = proc.stdout.read().decode('utf8')
df = pd.read_json(StringIO(text))
result = duckdb.query(f"select headSha from df where displayTitle LIKE '%{query}%' limit 1").fetchall()
if len(result) == 0:
print(
f"No workflows found in the latest {nlimit} workflows that contain the text {query}.\nPerhaps try running with a higher --max_workflows parameter?"
)
exit(1)
headSha = result[0][0]
result = duckdb.query(
f"select databaseId from df where conclusion IN ('failure', 'cancelled') AND displayTitle LIKE '%{query}%' and headSha='{headSha}'"
).fetchall()
if len(result) == 0:
print(f"Found runs that match the text {query} but no failing or cancelled runs were found")
for databaseId in [x[0] for x in result]:
os.system(f'gh run -R {args.repo} rerun {databaseId}')
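
A usage sketch, assuming an authenticated GitHub CLI; the script file name is an assumption (the file header is not shown above), and the PR title is hypothetical:

import subprocess

subprocess.run(
    ['python', 'scripts/rerun_failed_workflows.py',  # assumed script name, adjust to the actual file
     '--title', 'Fix join order regression', '--repo', 'duckdb/duckdb'],
    check=True,
)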

View File

@@ -0,0 +1,347 @@
#!/usr/bin/env python
#
# ===- run-clang-tidy.py - Parallel clang-tidy runner ---------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===------------------------------------------------------------------------===#
# FIXME: Integrate with clang-tidy-diff.py
"""
Parallel clang-tidy runner
==========================
Runs clang-tidy over all files in a compilation database. Requires clang-tidy
and clang-apply-replacements in $PATH.
Example invocations.
- Run clang-tidy on all files in the current working directory with a default
set of checks and show warnings in the cpp files and all project headers.
run-clang-tidy.py $PWD
- Fix all header guards.
run-clang-tidy.py -fix -checks=-*,llvm-header-guard
- Fix all header guards included from clang-tidy and header guards
for clang-tidy headers.
run-clang-tidy.py -fix -checks=-*,llvm-header-guard extra/clang-tidy \
-header-filter=extra/clang-tidy
Compilation database setup:
http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html
"""
from __future__ import print_function
import argparse
import glob
import json
import multiprocessing
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
import traceback
try:
import yaml
except ImportError:
yaml = None
is_py2 = sys.version[0] == '2'
if is_py2:
import Queue as queue
else:
import queue as queue
def find_compilation_database(path):
"""Adjusts the directory until a compilation database is found."""
result = './'
while not os.path.isfile(os.path.join(result, path)):
if os.path.realpath(result) == '/':
print('Error: could not find compilation database.')
sys.exit(1)
result += '../'
return os.path.realpath(result)
def make_absolute(f, directory):
if os.path.isabs(f):
return f
return os.path.normpath(os.path.join(directory, f))
def get_tidy_invocation(
f, clang_tidy_binary, checks, tmpdir, build_path, header_filter, extra_arg, extra_arg_before, quiet, config
):
"""Gets a command line for clang-tidy."""
start = [clang_tidy_binary]
if header_filter is not None:
start.append('-header-filter=' + header_filter)
if checks:
start.append('-checks=' + checks)
if tmpdir is not None:
start.append('-export-fixes')
# Get a temporary file. We immediately close the handle so clang-tidy can
# overwrite it.
(handle, name) = tempfile.mkstemp(suffix='.yaml', dir=tmpdir)
os.close(handle)
start.append(name)
for arg in extra_arg:
start.append('-extra-arg=%s' % arg)
for arg in extra_arg_before:
start.append('-extra-arg-before=%s' % arg)
start.append('-p=' + build_path)
if quiet:
start.append('--quiet')
if config:
start.append('-config=' + config)
start.append(f)
return start
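# Illustrative result (assumed inputs, not from the original source): for f='src/foo.cpp',
# checks='-*,llvm-header-guard', build_path='build' and all other options left unset,
# the returned invocation is:
#   ['clang-tidy', '-checks=-*,llvm-header-guard', '-p=build', 'src/foo.cpp']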
def merge_replacement_files(tmpdir, mergefile):
"""Merge all replacement files in a directory into a single file"""
# The fixes suggested by clang-tidy >= 4.0.0 are given under
# the top level key 'Diagnostics' in the output yaml files
mergekey = "Diagnostics"
merged = []
for replacefile in glob.iglob(os.path.join(tmpdir, '*.yaml')):
content = yaml.safe_load(open(replacefile, 'r'))
if not content:
continue # Skip empty files.
merged.extend(content.get(mergekey, []))
if merged:
# MainSourceFile: The key is required by the definition inside
# include/clang/Tooling/ReplacementsYaml.h, but the value
# is actually never used inside clang-apply-replacements,
# so we set it to '' here.
output = {'MainSourceFile': '', mergekey: merged}
with open(mergefile, 'w') as out:
yaml.safe_dump(output, out)
else:
# Empty the file:
open(mergefile, 'w').close()
def check_clang_apply_replacements_binary(args):
"""Checks if invoking supplied clang-apply-replacements binary works."""
try:
subprocess.check_call([args.clang_apply_replacements_binary, '--version'])
except:
print(
'Unable to run clang-apply-replacements. Is clang-apply-replacements ' 'binary correctly specified?',
file=sys.stderr,
)
traceback.print_exc()
sys.exit(1)
def apply_fixes(args, tmpdir):
"""Calls clang-apply-fixes on a given directory."""
invocation = [args.clang_apply_replacements_binary]
if args.format:
invocation.append('-format')
if args.style:
invocation.append('-style=' + args.style)
invocation.append(tmpdir)
subprocess.call(invocation)
def run_tidy(args, tmpdir, build_path, queue, lock, failed_files):
"""Takes filenames out of queue and runs clang-tidy on them."""
while True:
name = queue.get()
invocation = get_tidy_invocation(
name,
args.clang_tidy_binary,
args.checks,
tmpdir,
build_path,
args.header_filter,
args.extra_arg,
args.extra_arg_before,
args.quiet,
args.config,
)
proc = subprocess.Popen(invocation, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, err = proc.communicate()
if proc.returncode != 0:
failed_files.append(name)
with lock:
sys.stdout.write(' '.join(invocation) + '\n' + output.decode('utf-8'))
if len(err) > 0:
sys.stdout.flush()
sys.stderr.write(err.decode('utf-8'))
queue.task_done()
def main():
parser = argparse.ArgumentParser(
description='Runs clang-tidy over all files '
'in a compilation database. Requires '
'clang-tidy and clang-apply-replacements in '
'$PATH.'
)
parser.add_argument('-clang-tidy-binary', metavar='PATH', default='clang-tidy', help='path to clang-tidy binary')
parser.add_argument(
'-clang-apply-replacements-binary',
metavar='PATH',
default='clang-apply-replacements',
help='path to clang-apply-replacements binary',
)
parser.add_argument('-checks', default=None, help='checks filter, when not specified, use clang-tidy ' 'default')
parser.add_argument(
'-config',
default=None,
help='Specifies a configuration in YAML/JSON format: '
' -config="{Checks: \'*\', '
' CheckOptions: [{key: x, '
' value: y}]}" '
'When the value is empty, clang-tidy will '
'attempt to find a file named .clang-tidy for '
'each source file in its parent directories.',
)
parser.add_argument(
'-header-filter',
default=None,
help='regular expression matching the names of the '
'headers to output diagnostics from. Diagnostics from '
'the main file of each translation unit are always '
'displayed.',
)
if yaml:
parser.add_argument(
'-export-fixes',
metavar='filename',
dest='export_fixes',
help='Create a yaml file to store suggested fixes in, '
'which can be applied with clang-apply-replacements.',
)
parser.add_argument('-j', type=int, default=0, help='number of tidy instances to be run in parallel.')
parser.add_argument('files', nargs='*', default=['.*'], help='files to be processed (regex on path)')
parser.add_argument('-fix', action='store_true', help='apply fix-its')
parser.add_argument('-format', action='store_true', help='Reformat code ' 'after applying fixes')
parser.add_argument('-style', default='file', help='The style of reformat ' 'code after applying fixes')
parser.add_argument('-p', dest='build_path', help='Path used to read a compile command database.')
parser.add_argument(
'-extra-arg',
dest='extra_arg',
action='append',
default=[],
help='Additional argument to append to the compiler ' 'command line.',
)
parser.add_argument(
'-extra-arg-before',
dest='extra_arg_before',
action='append',
default=[],
help='Additional argument to prepend to the compiler ' 'command line.',
)
parser.add_argument('-quiet', action='store_true', help='Run clang-tidy in quiet mode')
args = parser.parse_args()
db_path = 'compile_commands.json'
if args.build_path is not None:
build_path = args.build_path
else:
# Find our database
build_path = find_compilation_database(db_path)
try:
invocation = [args.clang_tidy_binary, '-list-checks']
invocation.append('-p=' + build_path)
if args.checks:
invocation.append('-checks=' + args.checks)
invocation.append('-')
if args.quiet:
# Even with -quiet we still want to check if we can call clang-tidy.
with open(os.devnull, 'w') as dev_null:
subprocess.check_call(invocation, stdout=dev_null)
else:
subprocess.check_call(invocation)
except:
print("Unable to run clang-tidy, consider running `pip install clang-tidy`", file=sys.stderr)
sys.exit(1)
# Load the database and extract all files.
database = json.load(open(os.path.join(build_path, db_path)))
files = [make_absolute(entry['file'], entry['directory']) for entry in database]
max_task = args.j
if max_task == 0:
max_task = multiprocessing.cpu_count()
tmpdir = None
if args.fix or (yaml and args.export_fixes):
check_clang_apply_replacements_binary(args)
tmpdir = tempfile.mkdtemp()
# Build up a big regexy filter from all command line arguments.
file_name_re = re.compile('|'.join(args.files))
return_code = 0
try:
# Spin up a bunch of tidy-launching threads.
task_queue = queue.Queue(max_task)
# List of files with a non-zero return code.
failed_files = []
lock = threading.Lock()
for _ in range(max_task):
t = threading.Thread(target=run_tidy, args=(args, tmpdir, build_path, task_queue, lock, failed_files))
t.daemon = True
t.start()
# Fill the queue with files.
for name in files:
if file_name_re.search(name):
task_queue.put(name)
# Wait for all threads to be done.
task_queue.join()
if len(failed_files):
return_code = 1
except KeyboardInterrupt:
# This is a sad hack. Unfortunately subprocess goes
# bonkers with ctrl-c and we start forking merrily.
print('\nCtrl-C detected, goodbye.')
if tmpdir:
shutil.rmtree(tmpdir)
os.kill(0, 9)
if yaml and args.export_fixes:
print('Writing fixes to ' + args.export_fixes + ' ...')
try:
merge_replacement_files(tmpdir, args.export_fixes)
except:
print('Error exporting fixes.\n', file=sys.stderr)
traceback.print_exc()
return_code = 1
if args.fix:
print('Applying fixes ...')
try:
apply_fixes(args, tmpdir)
except:
print('Error applying fixes.\n', file=sys.stderr)
traceback.print_exc()
return_code = 1
if tmpdir:
shutil.rmtree(tmpdir)
sys.exit(return_code)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,52 @@
import argparse
import os
import subprocess
import re
parser = argparse.ArgumentParser(description='Run a full benchmark using the CLI and report the results.')
parser.add_argument('--shell', action='store', help='Path to the CLI', default='build/reldebug/duckdb')
parser.add_argument('--database', action='store', help='Path to the database file to load data from')
parser.add_argument(
'--queries', action='store', help='Path to the list of queries to run (e.g. benchmark/clickbench/queries)'
)
parser.add_argument('--nrun', action='store', type=int, help='The number of runs', default=3)
args = parser.parse_args()
queries = os.listdir(args.queries)
queries.sort()
ran_queries = []
timings = []
for q in queries:
if 'load.sql' in q:
continue
command = [args.shell, args.database]
command += ['-c', '.timer on']
for i in range(args.nrun):
command += ['-c', '.read ' + os.path.join(args.queries, q)]
res = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout = res.stdout.decode('utf8').strip()
stderr = res.stderr.decode('utf8').strip()
    results = re.findall(r'Run Time \(s\): real (\d+\.\d+)', stdout)
if res.returncode != 0 or 'Error:\n' in stderr or len(results) != args.nrun:
print("------- Failed to run query -------")
print(q)
print("------- stdout -------")
print(stdout)
print("------- stderr -------")
print(stderr)
exit(1)
results = [float(x) for x in results]
print(f"Timings for {q}: " + str(results))
ran_queries.append(q)
timings.append(results[1])
print('')
sql_query = 'SELECT UNNEST(['
sql_query += ','.join(["'" + x + "'" for x in ran_queries]) + ']) as query'
sql_query += ","
sql_query += "UNNEST(["
sql_query += ','.join([str(x) for x in timings])
sql_query += "]) as timing;"
print(sql_query)
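# Illustrative output (hypothetical query names and timings): the script ends by printing
# a query such as
#   SELECT UNNEST(['q01.sql','q02.sql']) as query,UNNEST([0.123,0.456]) as timing;
# which can be pasted into a DuckDB shell to turn the collected timings into a result set.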

View File

@@ -0,0 +1,177 @@
#!/bin/bash
# Generates a set of directories used for testing the extension-update behaviour exercised by `test/extension/update_extensions_ci.test`
# Please consider your energy footprint by only running this script with ccache.
# note that subsequent runs use cached artifacts; use `make clean` or `rm -rf build/debug` to start from scratch
set -x
set -e
DUCKDB_BUILD_DIR="./build/debug"
TEST_DIR="./build/extension_metadata_test_data"
TEST_DIR_COPY="./build/extension_metadata_test_data_copy"
### Directories to use
# Used as the extension installation directory for DuckDB
export LOCAL_EXTENSION_DIR="$TEST_DIR/extension_dir"
# Repository for testing successfully updating extensions
export LOCAL_EXTENSION_REPO_UPDATED="$TEST_DIR/repository"
# Repository for testing incorrect platform
export LOCAL_EXTENSION_REPO_INCORRECT_PLATFORM="$TEST_DIR/repository_incorrect_platform"
# Repository for testing incorrect version
export LOCAL_EXTENSION_REPO_INCORRECT_DUCKDB_VERSION="$TEST_DIR/repository_incorrect_version"
# Repository where both platform and version mismatch
export LOCAL_EXTENSION_REPO_VERSION_AND_PLATFORM_INCORRECT="$TEST_DIR/repository_incorrect_version_and_platform"
# Directory containing the extensions for direct installing
export DIRECT_INSTALL_DIR="$TEST_DIR/direct_install"
# Extension dir with a malformed info file for an extension
export LOCAL_EXTENSION_DIR_MALFORMED_INFO="$TEST_DIR/extension_dir_malformed_info"
# Extension dir with a metadata install version that mismatches the files metadata
export LOCAL_EXTENSION_DIR_INFO_INCORRECT_VERSION="$TEST_DIR/extension_dir_malformed_info_incorrect_version"
if [ -d "$TEST_DIR_COPY" ]; then
# REUSE PREVIOUSLY GENERATED DATA
rm -r $TEST_DIR
cp -R $TEST_DIR_COPY $TEST_DIR
else
# GENERATE FRESH DATA
mkdir -p $TEST_DIR
mkdir -p $DIRECT_INSTALL_DIR
mkdir -p $LOCAL_EXTENSION_DIR
mkdir -p $LOCAL_EXTENSION_REPO_UPDATED
mkdir -p $LOCAL_EXTENSION_REPO_INCORRECT_PLATFORM
mkdir -p $LOCAL_EXTENSION_REPO_INCORRECT_DUCKDB_VERSION
#################################################
### First repo: successfully updating extensions.
#################################################
# Set extension config
cat > $TEST_DIR/extension_config_before.cmake <<EOL
duckdb_extension_load(json DONT_LINK EXTENSION_VERSION v0.0.1)
duckdb_extension_load(tpch DONT_LINK EXTENSION_VERSION v0.0.1)
duckdb_extension_load(tpcds DONT_LINK EXTENSION_VERSION v0.0.1)
duckdb_extension_load(icu DONT_LINK EXTENSION_VERSION v0.0.1)
EOL
# Build the extensions using the first config
LOCAL_EXTENSION_REPO=$LOCAL_EXTENSION_REPO_UPDATED EXTENSION_CONFIGS=$TEST_DIR/extension_config_before.cmake make debug
# Set the version and platform now that we have a build
DUCKDB_VERSION=`$DUCKDB_BUILD_DIR/duckdb -csv -noheader -c 'select source_id from pragma_version()'`
DUCKDB_PLATFORM=`cat $DUCKDB_BUILD_DIR/duckdb_platform_out`
# Install the extension from the initial config
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set extension_directory='$LOCAL_EXTENSION_DIR'; set custom_extension_repository='$LOCAL_EXTENSION_REPO_UPDATED'; install tpch; install json; INSTALL icu;"
# Delete the info file from the icu extension
rm $LOCAL_EXTENSION_DIR/$DUCKDB_VERSION/$DUCKDB_PLATFORM/icu.duckdb_extension.info
# Install tpcds directly
cp $DUCKDB_BUILD_DIR/extension/tpcds/tpcds.duckdb_extension $DIRECT_INSTALL_DIR/tpcds.duckdb_extension
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set extension_directory='$LOCAL_EXTENSION_DIR'; install '$DIRECT_INSTALL_DIR/tpcds.duckdb_extension';"
# Set updated extension config where we update the tpch extension but not the json extension
cat > $TEST_DIR/extension_config_after.cmake <<EOL
duckdb_extension_load(json DONT_LINK EXTENSION_VERSION v0.0.1)
duckdb_extension_load(tpch DONT_LINK EXTENSION_VERSION v0.0.2)
duckdb_extension_load(icu DONT_LINK EXTENSION_VERSION v0.0.2)
EOL
# Build the extensions using the second config
LOCAL_EXTENSION_REPO=$LOCAL_EXTENSION_REPO_UPDATED EXTENSION_CONFIGS=$TEST_DIR/extension_config_after.cmake BUILD_EXTENSIONS_ONLY=1 make debug
# For good measure, we also gzip one of the files in the repo to ensure we can handle both gzipped and non-gzipped extensions
gzip -1 $LOCAL_EXTENSION_REPO_UPDATED/$DUCKDB_VERSION/$DUCKDB_PLATFORM/icu.duckdb_extension
##########################################
### Second repo: Incorrect DuckDB platform
##########################################
rm -rf $DUCKDB_BUILD_DIR
# Set extension config
cat > $TEST_DIR/extension_config_incorrect_platform.cmake <<EOL
duckdb_extension_load(json DONT_LINK EXTENSION_VERSION v0.0.3)
EOL
# Build the extensions using the incorrect platform
DUCKDB_PLATFORM=test_platform EXTENSION_CONFIGS=$TEST_DIR/extension_config_incorrect_platform.cmake BUILD_EXTENSIONS_ONLY=1 make debug
cp $DUCKDB_BUILD_DIR/extension/json/json.duckdb_extension $DIRECT_INSTALL_DIR/json_incorrect_platform.duckdb_extension
########################################
### Third repo: Incorrect DuckDB version
########################################
rm -rf $DUCKDB_BUILD_DIR
# Set extension config
cat > $TEST_DIR/extension_config_incorrect_version.cmake <<EOL
duckdb_extension_load(json DONT_LINK EXTENSION_VERSION v0.0.4)
EOL
# Build the extensions using the incorrect DuckDB version
DUCKDB_EXPLICIT_VERSION=v1337 EXTENSION_CONFIGS=$TEST_DIR/extension_config_before.cmake BUILD_EXTENSIONS_ONLY=1 make debug
cp $DUCKDB_BUILD_DIR/extension/json/json.duckdb_extension $DIRECT_INSTALL_DIR/json_incorrect_version.duckdb_extension
####################################################
### Fourth repo: Both platform and version incorrect
####################################################
rm -rf $DUCKDB_BUILD_DIR
# Set extension config
cat > $TEST_DIR/extension_config_incorrect_version.cmake <<EOL
duckdb_extension_load(json DONT_LINK EXTENSION_VERSION v0.0.4)
EOL
# Build the extensions using both the incorrect platform and the incorrect DuckDB version
DUCKDB_PLATFORM=test_platform DUCKDB_EXPLICIT_VERSION=v1337 EXTENSION_CONFIGS=$TEST_DIR/extension_config_before.cmake BUILD_EXTENSIONS_ONLY=1 make debug
cp $DUCKDB_BUILD_DIR/extension/json/json.duckdb_extension $DIRECT_INSTALL_DIR/json_incorrect_version_and_platform.duckdb_extension
# Note that we set the "double wrong" extension to have the proper name, so we can actually load it during testing with
# SET allow_extensions_metadata_mismatch=true;
cp $DUCKDB_BUILD_DIR/extension/json/json.duckdb_extension $DIRECT_INSTALL_DIR/json.duckdb_extension
###########################
### Prepare malformed repos/dirs
###########################
# Build clean duckdb
rm -rf $DUCKDB_BUILD_DIR
make debug
# Use duckdb to install the extensions into the repositories (note that we are doing a trick here by setting the extension_directory to the local repo dir)
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set allow_extensions_metadata_mismatch=true; set extension_directory='$LOCAL_EXTENSION_REPO_INCORRECT_PLATFORM'; install '$DIRECT_INSTALL_DIR/json_incorrect_platform.duckdb_extension'"
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set allow_extensions_metadata_mismatch=true; set extension_directory='$LOCAL_EXTENSION_REPO_INCORRECT_DUCKDB_VERSION'; install '$DIRECT_INSTALL_DIR/json_incorrect_version.duckdb_extension'"
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set allow_extensions_metadata_mismatch=true; set extension_directory='$LOCAL_EXTENSION_REPO_VERSION_AND_PLATFORM_INCORRECT'; install '$DIRECT_INSTALL_DIR/json_incorrect_version_and_platform.duckdb_extension'"
# Create dir with malformed info file
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set extension_directory='$LOCAL_EXTENSION_DIR_MALFORMED_INFO'; install '$DIRECT_INSTALL_DIR/tpcds.duckdb_extension';"
echo blablablab > $LOCAL_EXTENSION_DIR_MALFORMED_INFO/$DUCKDB_VERSION/$DUCKDB_PLATFORM/tpcds.duckdb_extension.info
# Create dir with malformed info file: we install a new version from LOCAL_EXTENSION_REPO_UPDATED but preserve the old info file
$DUCKDB_BUILD_DIR/duckdb -unsigned -c "set extension_directory='$LOCAL_EXTENSION_DIR_INFO_INCORRECT_VERSION'; install 'tpch' from '$LOCAL_EXTENSION_REPO_UPDATED'"
cp $LOCAL_EXTENSION_DIR/$DUCKDB_VERSION/$DUCKDB_PLATFORM/tpch.duckdb_extension.info $LOCAL_EXTENSION_DIR_INFO_INCORRECT_VERSION/$DUCKDB_VERSION/$DUCKDB_PLATFORM/tpch.duckdb_extension.info
###################################################################
### Allow using copy instead of regenerating test data on every run
###################################################################
cp -R $TEST_DIR $TEST_DIR_COPY
fi
###########################
### Set version and platform
###########################
DUCKDB_VERSION=`$DUCKDB_BUILD_DIR/duckdb -csv -noheader -c 'select source_id from pragma_version()'`
DUCKDB_PLATFORM=`cat $DUCKDB_BUILD_DIR/duckdb_platform_out`
###########################
### Populate the minio repositories
###########################
AWS_DEFAULT_REGION=eu-west-1 AWS_ACCESS_KEY_ID=minio_duckdb_user AWS_SECRET_ACCESS_KEY=minio_duckdb_user_password aws --endpoint-url http://duckdb-minio.com:9000 s3 sync $LOCAL_EXTENSION_REPO_UPDATED s3://test-bucket-public/ci-test-repo
export REMOTE_EXTENSION_REPO_UPDATED=http://duckdb-minio.com:9000/test-bucket-public/ci-test-repo
export REMOTE_EXTENSION_REPO_DIRECT_PATH=http://duckdb-minio.com:9000/test-bucket-public/ci-test-repo/$DUCKDB_VERSION/$DUCKDB_PLATFORM
################
### Run test
################
RUN_EXTENSION_UPDATE_TEST=1 $DUCKDB_BUILD_DIR/test/unittest test/extension/update_extensions_ci.test

View File

@@ -0,0 +1,318 @@
import argparse
import sys
import subprocess
import time
import threading
import tempfile
import os
import shutil
import re
class ErrorContainer:
def __init__(self):
self._lock = threading.Lock()
self._errors = []
def append(self, item):
with self._lock:
self._errors.append(item)
def get_errors(self):
with self._lock:
return list(self._errors)
def __len__(self):
with self._lock:
return len(self._errors)
error_container = ErrorContainer()
def valid_timeout(value):
try:
timeout_float = float(value)
if timeout_float <= 0:
raise argparse.ArgumentTypeError("Timeout value must be a positive float")
return timeout_float
except ValueError:
raise argparse.ArgumentTypeError("Timeout value must be a float")
parser = argparse.ArgumentParser(description='Run tests one by one with optional flags.')
parser.add_argument('unittest_program', help='Path to the unittest program')
parser.add_argument('--no-exit', action='store_true', help='Execute all tests, without stopping on first error')
parser.add_argument('--fast-fail', action='store_true', help='Terminate on first error')
parser.add_argument('--profile', action='store_true', help='Enable profiling')
parser.add_argument('--no-assertions', action='store_false', help='Disable assertions')
parser.add_argument('--time_execution', action='store_true', help='Measure and print the execution time of each test')
parser.add_argument('--list', action='store_true', help='Print the list of tests to run')
parser.add_argument('--summarize-failures', action='store_true', help='Summarize failures', default=None)
parser.add_argument(
'--tests-per-invocation', type=int, help='The amount of tests to run per invocation of the runner', default=1
)
parser.add_argument(
'--print-interval', action='store', help='Prints "Still running..." every N seconds', default=300.0, type=float
)
parser.add_argument(
'--timeout',
action='store',
help='Add a timeout for each test (in seconds, default: 3600s - i.e. one hour)',
default=3600,
type=valid_timeout,
)
parser.add_argument('--valgrind', action='store_true', help='Run the tests with valgrind', default=False)
args, extra_args = parser.parse_known_args()
if not args.unittest_program:
parser.error('Path to unittest program is required')
# Access the arguments
unittest_program = args.unittest_program
no_exit = args.no_exit
fast_fail = args.fast_fail
tests_per_invocation = args.tests_per_invocation
if no_exit:
if fast_fail:
print("--no-exit and --fast-fail can't be combined")
exit(1)
profile = args.profile
assertions = args.no_assertions
time_execution = args.time_execution
timeout = args.timeout
summarize_failures = args.summarize_failures
if summarize_failures is None:
# get from env
summarize_failures = False
if 'SUMMARIZE_FAILURES' in os.environ:
summarize_failures = os.environ['SUMMARIZE_FAILURES'] == '1'
elif 'CI' in os.environ:
# enable by default in CI if not set explicitly
summarize_failures = True
# Use the '-l' parameter to output the list of tests to run
proc = subprocess.run([unittest_program, '-l'] + extra_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout = proc.stdout.decode('utf8').strip()
stderr = proc.stderr.decode('utf8').strip()
if len(stderr) > 0:
print("Failed to run program " + unittest_program)
print("Returncode:", proc.returncode)
print(stdout)
print(stderr)
exit(1)
# The output is in the format 'PATH\tGROUP'; we're only interested in the PATH portion
test_cases = []
first_line = True
for line in stdout.splitlines():
if first_line:
first_line = False
continue
if len(line.strip()) == 0:
continue
splits = line.rsplit('\t', 1)
test_cases.append(splits[0])
test_count = len(test_cases)
if args.list:
for test_number, test_case in enumerate(test_cases):
print(f"[{test_number}/{test_count}]: {test_case}")
all_passed = True
def fail():
global all_passed
all_passed = False
if fast_fail:
exit(1)
def parse_assertions(stdout):
for line in stdout.splitlines():
if 'All tests were skipped' in line:
return "SKIPPED"
if line == 'assertions: - none -':
return "0 assertions"
# Parse the assertion count from the runner's summary line
pos = line.find("assertion")
if pos != -1:
space_before_num = line.rfind(' ', 0, pos - 2)
return line[space_before_num + 2 : pos + 10]
return "ERROR"
is_active = False
def get_test_name_from(text):
match = re.findall(r'\((.*?)\)\!', text)
return match[0] if match else ''
def get_clean_error_message_from(text):
match = re.split(r'^=+\n', text, maxsplit=1, flags=re.MULTILINE)
return match[1] if len(match) > 1 else text
def print_interval_background(interval):
global is_active
current_ticker = 0.0
while is_active:
time.sleep(0.1)
current_ticker += 0.1
if current_ticker >= interval:
print("Still running...")
current_ticker = 0
def launch_test(test, list_of_tests=False):
global is_active
# start the background thread
is_active = True
background_print_thread = threading.Thread(target=print_interval_background, args=[args.print_interval])
background_print_thread.start()
unittest_stdout = sys.stdout if list_of_tests else subprocess.PIPE
unittest_stderr = subprocess.PIPE
start = time.time()
try:
test_cmd = [unittest_program] + test
if args.valgrind:
test_cmd = ['valgrind'] + test_cmd
# unset SUMMARIZE_FAILURES to avoid producing excessive failure logs
env = os.environ.copy()
# pass env variables globally
if list_of_tests or no_exit or tests_per_invocation:
env['SUMMARIZE_FAILURES'] = '0'
env['NO_DUPLICATING_HEADERS'] = '1'
else:
env['SUMMARIZE_FAILURES'] = '0'
res = subprocess.run(test_cmd, stdout=unittest_stdout, stderr=unittest_stderr, timeout=timeout, env=env)
except subprocess.TimeoutExpired as e:
if list_of_tests:
print("[TIMED OUT]", flush=True)
else:
print(" (TIMED OUT)", flush=True)
test_name = test[0] if not list_of_tests else str(test)
error_msg = f'TIMEOUT - exceeded specified timeout of {timeout} seconds'
new_data = {"test": test_name, "return_code": 1, "stdout": '', "stderr": error_msg}
error_container.append(new_data)
fail()
return
stdout = res.stdout.decode('utf8') if not list_of_tests else ''
stderr = res.stderr.decode('utf8')
if len(stderr) > 0:
# when running from a test list the test name gets transformed, but we can recover it from stderr
test_name = test[0] if not list_of_tests else get_test_name_from(stderr)
error_message = get_clean_error_message_from(stderr)
new_data = {"test": test_name, "return_code": res.returncode, "stdout": stdout, "stderr": error_message}
error_container.append(new_data)
end = time.time()
# join the background print thread
is_active = False
background_print_thread.join()
additional_data = ""
if assertions:
additional_data += " (" + parse_assertions(stdout) + ")"
if args.time_execution:
additional_data += f" (Time: {end - start:.4f} seconds)"
print(additional_data, flush=True)
if profile:
print(f'{test_case} {end - start}')
if res.returncode is None or res.returncode == 0:
return
print("FAILURE IN RUNNING TEST")
print(
"""--------------------
RETURNCODE
--------------------"""
)
print(res.returncode)
print(
"""--------------------
STDOUT
--------------------"""
)
print(stdout)
print(
"""--------------------
STDERR
--------------------"""
)
print(stderr)
# if a test closes unexpectedly (e.g., SEGV), test cleanup doesn't happen,
# causing us to run out of space on subsequent tests in GH Actions (not much disk space there)
duckdb_unittest_tempdir = os.path.join(
os.path.dirname(unittest_program), '..', '..', '..', 'duckdb_unittest_tempdir'
)
if os.path.exists(duckdb_unittest_tempdir) and os.listdir(duckdb_unittest_tempdir):
shutil.rmtree(duckdb_unittest_tempdir)
fail()
def run_tests_one_by_one():
for test_number, test_case in enumerate(test_cases):
if not profile:
print(f"[{test_number}/{test_count}]: {test_case}", end="", flush=True)
launch_test([test_case])
def escape_test_case(test_case):
return test_case.replace(',', '\\,')
def run_tests_batched(batch_count):
tmp = tempfile.NamedTemporaryFile()
# write the test list to a temporary file
with open(tmp.name, 'w') as f:
for test_case in test_cases:
f.write(escape_test_case(test_case) + '\n')
# use start_offset/end_offset to cycle through the test list
test_number = 0
while test_number < len(test_cases):
# gather test cases
next_entry = test_number + batch_count
if next_entry > len(test_cases):
next_entry = len(test_cases)
launch_test(['-f', tmp.name, '--start-offset', str(test_number), '--end-offset', str(next_entry)], True)
test_number = next_entry
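# Illustrative invocation (assumed unittest path): with --tests-per-invocation 100 the loop
# above launches commands of the form
#   build/debug/test/unittest -f /tmp/tmpXXXX --start-offset 0 --end-offset 100
# where the temporary file holds one escaped test name per line.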
if args.tests_per_invocation == 1:
run_tests_one_by_one()
else:
assertions = False
run_tests_batched(args.tests_per_invocation)
if all_passed:
exit(0)
if summarize_failures and len(error_container):
print(
'''\n\n====================================================
================ FAILURES SUMMARY ================
====================================================\n
'''
)
for i, error in enumerate(error_container.get_errors(), start=1):
print(f"\n{i}:", error["test"], "\n")
print(error["stderr"])
exit(1)

View File

@@ -0,0 +1,4 @@
from .parse_and_sort_settings_in_json import add_all_settings_to_global_list as parse_and_sort_json_file
from .update_settings_header_file import generate as update_header_file
from .update_settings_scopes import generate as update_scopes
from .update_settings_src_code import generate as update_src_code

View File

@@ -0,0 +1,197 @@
import os
import re
import subprocess
import tempfile
from pathlib import Path
from typing import Set, List
from functools import total_ordering
# define file paths and global variables
DUCKDB_DIR = Path(__file__).resolve().parent.parent.parent
DUCKDB_SETTINGS_HEADER_FILE = os.path.join(DUCKDB_DIR, "src/include/duckdb/main", "settings.hpp")
DUCKDB_AUTOGENERATED_SETTINGS_FILE = os.path.join(DUCKDB_DIR, "src/main/settings", "autogenerated_settings.cpp")
DUCKDB_SETTINGS_SCOPE_FILE = os.path.join(DUCKDB_DIR, "src/main", "config.cpp")
JSON_PATH = os.path.join(DUCKDB_DIR, "src/common", "settings.json")
# define scope values
VALID_SCOPE_VALUES = ["GLOBAL", "LOCAL", "GLOBAL_LOCAL"]
INVALID_SCOPE_VALUE = "INVALID"
SQL_TYPE_MAP = {"UBIGINT": "idx_t", "BIGINT": "int64_t", "BOOLEAN": "bool", "DOUBLE": "double", "VARCHAR": "string"}
# global Setting structure
@total_ordering
class Setting:
# track names of written settings to prevent duplicates
__written_settings: Set[str] = set()
def __init__(
self,
name: str,
description: str,
sql_type: str,
scope: str,
internal_setting: str,
on_callbacks: List[str],
custom_implementation,
struct_name: str,
aliases: List[str],
default_scope: str,
default_value: str,
):
self.name = self._get_valid_name(name)
self.description = description
self.sql_type = self._get_sql_type(sql_type)
self.return_type = self._get_setting_type(sql_type)
self.is_enum = sql_type.startswith('ENUM')
self.internal_setting = internal_setting
self.scope = self._get_valid_scope(scope) if scope is not None else None
self.on_set, self.on_reset = self._get_on_callbacks(on_callbacks)
self.is_generic_setting = self.scope is None
if self.is_enum and self.is_generic_setting:
self.on_set = True
custom_callbacks = ['set', 'reset', 'get']
if type(custom_implementation) is bool:
self.all_custom = custom_implementation
self.custom_implementation = custom_callbacks if custom_implementation else []
else:
for entry in custom_implementation:
if entry not in custom_callbacks:
raise ValueError(
f"Setting {self.name} - incorrect input for custom_implementation - expected set/reset/get, got {entry}"
)
self.all_custom = len(set(custom_implementation)) == 3
self.custom_implementation = custom_implementation
self.aliases = self._get_aliases(aliases)
self.struct_name = self._get_struct_name() if len(struct_name) == 0 else struct_name
self.default_scope = self._get_valid_default_scope(default_scope) if default_scope is not None else None
self.default_value = default_value
# define all comparisons to be based on the setting's name attribute
def __eq__(self, other) -> bool:
return isinstance(other, Setting) and self.name == other.name
def __lt__(self, other) -> bool:
return isinstance(other, Setting) and self.name < other.name
def __hash__(self) -> int:
return hash(self.name)
def __repr__(self):
return f"struct {self.struct_name} -> {self.name}, {self.sql_type}, {self.type}, {self.scope}, {self.description} {self.aliases}"
# validate setting name for correct format and uniqueness
def _get_valid_name(self, name: str) -> str:
if not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', name):
raise ValueError(f"'{name}' cannot be used as setting name - invalid character")
if name in Setting.__written_settings:
raise ValueError(f"'{name}' cannot be used as setting name - already exists")
Setting.__written_settings.add(name)
return name
# ensure the setting scope is valid based on the accepted values
def _get_valid_scope(self, scope: str) -> str:
scope = scope.upper()
if scope in VALID_SCOPE_VALUES:
return scope
return INVALID_SCOPE_VALUE
def _get_valid_default_scope(self, scope: str) -> str:
scope = scope.upper()
if scope == 'GLOBAL':
return scope
elif scope == 'LOCAL':
return 'SESSION'
raise Exception(f"Invalid default scope value {scope}")
# validate and return the correct type format
def _get_sql_type(self, sql_type) -> str:
if sql_type.startswith('ENUM'):
return 'VARCHAR'
if sql_type.endswith('[]'):
# validate the child element type; the list type itself is returned unchanged
self._get_sql_type(sql_type[:-2])
return sql_type
if sql_type in SQL_TYPE_MAP:
return sql_type
raise ValueError(f"Invalid SQL type: '{sql_type}' - supported types are {', '.join(SQL_TYPE_MAP.keys())}")
# validate and return the cpp input type
def _get_setting_type(self, type) -> str:
if type.startswith('ENUM'):
return type[len('ENUM<') : -1]
if type.endswith('[]'):
subtype = self._get_setting_type(type[:-2])
return "vector<" + subtype + ">"
return SQL_TYPE_MAP[type]
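# Illustrative mappings (derived from the rules above): 'BOOLEAN' -> 'bool',
# 'VARCHAR[]' -> 'vector<string>', 'ENUM<ErrorType>' -> 'ErrorType' (ErrorType being a
# hypothetical enum name).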
# validate and return the correct type format
def _get_on_callbacks(self, callbacks) -> (bool, bool):
set = False
reset = False
for entry in callbacks:
if entry == 'set':
set = True
elif entry == 'reset':
reset = True
else:
raise ValueError(f"Invalid entry in on_callbacks list: {entry} (expected set or reset)")
return (set, reset)
# validate and return the set of the aliases
def _get_aliases(self, aliases: List[str]) -> List[str]:
return [self._get_valid_name(alias) for alias in aliases]
# generate a function name
def _get_struct_name(self) -> str:
camel_case_name = ''.join(word.capitalize() for word in re.split(r'[-_]', self.name))
if camel_case_name.endswith("Setting"):
return f"{camel_case_name}"
return f"{camel_case_name}Setting"
# this global list (accessible across all files) stores all the settings definitions in the json file
SettingsList: List[Setting] = []
# global method that finds the indexes of a start and an end marker in a file
def find_start_end_indexes(source_code, start_marker, end_marker, file_path):
start_matches = list(re.finditer(start_marker, source_code))
if len(start_matches) == 0:
raise ValueError(f"Couldn't find start marker {start_marker} in {file_path}")
elif len(start_matches) > 1:
raise ValueError(f"Start marker found more than once in {file_path}")
start_index = start_matches[0].end()
end_matches = list(re.finditer(end_marker, source_code[start_index:]))
if len(end_matches) == 0:
raise ValueError(f"Couldn't find end marker {end_marker} in {file_path}")
elif len(end_matches) > 1:
raise ValueError(f"End marker found more than once in {file_path}")
end_index = start_index + end_matches[0].start()
return start_index, end_index
# global markers
SEPARATOR = "//===----------------------------------------------------------------------===//\n"
SRC_CODE_START_MARKER = "namespace duckdb {"
SRC_CODE_END_MARKER = "} // namespace duckdb"
# global method
def write_content_to_file(new_content, path):
with open(path, 'w') as source_file:
source_file.write("".join(new_content))
def get_setting_heading(setting_struct_name):
struct_name_wt_Setting = re.sub(r'Setting$', '', setting_struct_name)
heading_name = re.sub(r'(?<!^)(?=[A-Z])', ' ', struct_name_wt_Setting)
heading = SEPARATOR + f"// {heading_name}\n" + SEPARATOR
return heading
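# Illustrative example (hypothetical struct name): get_setting_heading("MaxMemorySetting")
# returns a "// Max Memory" line wrapped between two SEPARATOR lines.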
def make_format():
os.system(f"python3 scripts/format.py {DUCKDB_SETTINGS_HEADER_FILE} --fix --force --noconfirm")
os.system(f"python3 scripts/format.py {DUCKDB_SETTINGS_SCOPE_FILE} --fix --force --noconfirm")
os.system(f"python3 scripts/format.py {DUCKDB_AUTOGENERATED_SETTINGS_FILE} --fix --force --noconfirm")

View File

@@ -0,0 +1,58 @@
import json
from .config import Setting, SettingsList, JSON_PATH
# sort settings in json by name
def sort_json_data(path):
with open(path, 'r') as file:
data = json.load(file)
sorted_data = sorted(data, key=lambda x: x['name'])
with open(path, 'w') as file:
json.dump(sorted_data, file, indent=4)
return sorted_data
# parses the JSON data and stores each entry as a Setting object in the global SettingsList
def add_all_settings_to_global_list():
valid_entries = [
'name',
'description',
'type',
'scope',
'internal_setting',
'on_callbacks',
'custom_implementation',
'struct',
'aliases',
'default_scope',
'default_value',
]
print(f"Parsing and sorting the settings data in {JSON_PATH}")
clear_global_settings_list()
json_data = sort_json_data(JSON_PATH)
# store all the settings in the SettingsList
for entry in json_data:
for field_entry in entry:
if field_entry not in valid_entries:
raise ValueError(
f"Found entry unexpected entry \"{field_entry}\" in setting, expected entry to be in {', '.join(valid_entries)}"
)
setting = Setting(
name=entry['name'],
description=entry['description'],
sql_type=entry['type'],
internal_setting=entry.get('internal_setting', entry['name']),
scope=entry.get('scope', None),
struct_name=entry.get('struct', ''),
on_callbacks=entry.get('on_callbacks', []),
custom_implementation=entry.get('custom_implementation', False),
aliases=entry.get('aliases', []),
default_scope=entry.get('default_scope', None),
default_value=entry.get('default_value', None),
)
SettingsList.append(setting)
def clear_global_settings_list():
SettingsList.clear()
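# Illustrative settings.json entry (hypothetical values; 'name', 'description' and 'type'
# are accessed directly above, the remaining fields fall back to defaults):
#   {
#       "name": "my_flag",
#       "description": "Example boolean setting",
#       "type": "BOOLEAN",
#       "scope": "GLOBAL"
#   }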

View File

@@ -0,0 +1,132 @@
from .config import (
SRC_CODE_START_MARKER,
SRC_CODE_END_MARKER,
SettingsList,
find_start_end_indexes,
get_setting_heading,
)
def generate_create_value(setting):
if setting.sql_type == 'VARCHAR':
return 'Value'
else:
return f'Value::{setting.sql_type}'
def add_autogenerated_global_functions(setting):
cpp_code = ""
if 'set' not in setting.custom_implementation:
cpp_code += (
f"void {setting.struct_name}::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) {{\n"
)
if setting.on_set:
cpp_code += f"\tif (!OnGlobalSet(db, config, input)) {{\n"
cpp_code += f"\t\treturn;\n\t}}\n"
if setting.is_enum:
cpp_code += f"\tauto str_input = StringUtil::Upper(input.GetValue<string>());\n"
cpp_code += f"\tconfig.options.{setting.internal_setting} = EnumUtil::FromString<{setting.return_type}>(str_input);\n"
else:
cpp_code += f"\tconfig.options.{setting.internal_setting} = input.GetValue<{setting.return_type}>();\n"
cpp_code += f"}}\n\n"
if 'reset' not in setting.custom_implementation:
cpp_code += f"void {setting.struct_name}::ResetGlobal(DatabaseInstance *db, DBConfig &config) {{\n"
if setting.on_reset:
cpp_code += f"\tif (!OnGlobalReset(db, config)) {{\n"
cpp_code += f"\t\treturn;\n\t}}\n"
cpp_code += f"\tconfig.options.{setting.internal_setting} = DBConfigOptions().{setting.internal_setting};\n"
cpp_code += f"}}\n\n"
if 'get' not in setting.custom_implementation:
cpp_code += f"Value {setting.struct_name}::GetSetting(const ClientContext &context) {{\n"
cpp_code += f"\tauto &config = DBConfig::GetConfig(context);\n"
if setting.is_enum:
cpp_code += f"\treturn {generate_create_value(setting)}(StringUtil::Lower(EnumUtil::ToString(config.options.{setting.internal_setting})));\n"
else:
cpp_code += f"\treturn {generate_create_value(setting)}(config.options.{setting.internal_setting});\n"
cpp_code += f"}}\n\n"
return cpp_code
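# Illustrative output (hypothetical setting): for a non-enum setting with struct_name
# 'MyFlagSetting', internal_setting 'my_flag', return_type 'bool' and no callbacks or
# custom implementations, the generated SetGlobal is:
#   void MyFlagSetting::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) {
#       config.options.my_flag = input.GetValue<bool>();
#   }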
def add_autogenerated_local_functions(setting):
cpp_code = ""
if 'set' not in setting.custom_implementation:
cpp_code += f"void {setting.struct_name}::SetLocal(ClientContext &context, const Value &input) {{\n"
if setting.on_set:
cpp_code += f"\tif (!OnLocalSet(context, input)) {{\n"
cpp_code += f"\t\treturn;\n\t}}\n"
cpp_code += f"\tauto &config = ClientConfig::GetConfig(context);\n"
if setting.is_enum:
cpp_code += f"\tauto str_input = StringUtil::Upper(input.GetValue<string>());\n"
cpp_code += (
f"\tconfig.{setting.internal_setting} = EnumUtil::FromString<{setting.return_type}>(str_input);\n"
)
else:
cpp_code += f"\tconfig.{setting.internal_setting} = input.GetValue<{setting.return_type}>();\n"
cpp_code += f"}}\n\n"
if 'reset' not in setting.custom_implementation:
cpp_code += f"void {setting.struct_name}::ResetLocal(ClientContext &context) {{\n"
if setting.on_reset:
cpp_code += f"\tif (!OnLocalReset(context)) {{\n"
cpp_code += f"\t\treturn;\n\t}}\n"
cpp_code += f"\tClientConfig::GetConfig(context).{setting.internal_setting} = ClientConfig().{setting.internal_setting};\n"
cpp_code += f"}}\n\n"
if 'get' not in setting.custom_implementation:
cpp_code += f"Value {setting.struct_name}::GetSetting(const ClientContext &context) {{\n"
cpp_code += f"\tauto &config = ClientConfig::GetConfig(context);\n"
if setting.is_enum:
cpp_code += f"\treturn {generate_create_value(setting)}(StringUtil::Lower(EnumUtil::ToString(config.{setting.internal_setting})));\n"
else:
cpp_code += f"\treturn {generate_create_value(setting)}(config.{setting.internal_setting});\n"
cpp_code += f"}}\n\n"
return cpp_code
def add_autogenerated_enum_set(setting):
if not setting.on_set:
return ""
if not setting.is_enum:
return ""
if 'set' in setting.custom_implementation:
return ""
cpp_code = ""
cpp_code += f"void {setting.struct_name}::OnSet(SettingCallbackInfo &info, Value &parameter) {{\n"
cpp_code += f"\tEnumUtil::FromString<{setting.return_type}>(StringValue::Get(parameter));\n"
cpp_code += f"}}\n\n"
return cpp_code
def add_autogenerated_functions(path):
with open(path, 'r') as source_file:
source_code = source_file.read()
# find start and end indexes of the auto-generated section
start_index, end_index = find_start_end_indexes(source_code, SRC_CODE_START_MARKER, SRC_CODE_END_MARKER, path)
# split source code into sections
start_section = source_code[: start_index + 1] + "\n"
end_section = source_code[end_index:]
new_content = ""
added = 0
for setting in SettingsList:
# if the setting doesn't need custom implementation, an autogenerated one will be included
if not setting.all_custom:
header = get_setting_heading(setting.struct_name)
content = ""
if setting.is_generic_setting:
content += add_autogenerated_enum_set(setting)
else:
if setting.scope == "GLOBAL" or setting.scope == "GLOBAL_LOCAL":
content += add_autogenerated_global_functions(setting)
if setting.scope == "LOCAL" or setting.scope == "GLOBAL_LOCAL":
content += add_autogenerated_local_functions(setting)
if len(content) > 0:
new_content += header
new_content += content
added += 1
return start_section + new_content + end_section, added
if __name__ == '__main__':
raise ValueError("Please use 'generate_settings.py' instead of running the individual script(s)")

View File

@@ -0,0 +1,73 @@
from .config import SEPARATOR, SettingsList, find_start_end_indexes, write_content_to_file
# markers
START_MARKER = (
f"//===----------------------------------------------------------------------===//\n"
f"// This code is autogenerated from 'update_settings_header_file.py'.\n"
f"// Please do not make any changes directly here, as they will be overwritten.\n//\n"
f"// Start of the auto-generated list of settings structures\n"
f"//===----------------------------------------------------------------------===//\n"
)
END_MARKER = "// End of the auto-generated list of settings structures"
def extract_declarations(setting) -> str:
definition = (
f"struct {setting.struct_name} {{\n"
f" using RETURN_TYPE = {setting.return_type};\n"
f" static constexpr const char *Name = \"{setting.name}\";\n"
f" static constexpr const char *Description = \"{setting.description}\";\n"
f" static constexpr const char *InputType = \"{setting.sql_type}\";\n"
)
if setting.scope == "GLOBAL" or setting.scope == "GLOBAL_LOCAL":
definition += f" static void SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &parameter);\n"
definition += f" static void ResetGlobal(DatabaseInstance *db, DBConfig &config);\n"
if setting.on_set:
definition += f"static bool OnGlobalSet(DatabaseInstance *db, DBConfig &config, const Value &input);\n"
if setting.on_reset:
definition += f"static bool OnGlobalReset(DatabaseInstance *db, DBConfig &config);\n"
if setting.scope == "LOCAL" or setting.scope == "GLOBAL_LOCAL":
definition += f" static void SetLocal(ClientContext &context, const Value &parameter);\n"
definition += f" static void ResetLocal(ClientContext &context);\n"
if setting.on_set:
definition += f"static bool OnLocalSet(ClientContext &context, const Value &input);\n"
if setting.on_reset:
definition += f"static bool OnLocalReset(ClientContext &context);\n"
if setting.scope is not None:
definition += f" static Value GetSetting(const ClientContext &context);\n"
if setting.is_generic_setting:
definition += f" static constexpr const char *DefaultValue = \"{setting.default_value}\";\n"
definition += f" static constexpr SetScope DefaultScope = SetScope::{setting.default_scope};\n"
if setting.on_set:
definition += f" static void OnSet(SettingCallbackInfo &info, Value &input);\n"
definition += f"}};\n\n"
return definition
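# Illustrative output (hypothetical setting): a GLOBAL-scoped bool setting named 'my_flag'
# with struct 'MyFlagSetting' and no callbacks yields roughly:
#   struct MyFlagSetting {
#       using RETURN_TYPE = bool;
#       static constexpr const char *Name = "my_flag";
#       static constexpr const char *Description = "Example boolean setting";
#       static constexpr const char *InputType = "BOOLEAN";
#       static void SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &parameter);
#       static void ResetGlobal(DatabaseInstance *db, DBConfig &config);
#       static Value GetSetting(const ClientContext &context);
#   };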
# generate code for all the settings in the header file
def generate_content(header_file_path):
with open(header_file_path, 'r') as source_file:
source_code = source_file.read()
# find start and end indexes of the auto-generated section
start_index, end_index = find_start_end_indexes(source_code, START_MARKER, END_MARKER, header_file_path)
# split source code into sections
start_section = source_code[: start_index + 1]
end_section = SEPARATOR + source_code[end_index:]
new_content = "".join(extract_declarations(setting) for setting in SettingsList)
return start_section + new_content + end_section
def generate():
from .config import DUCKDB_SETTINGS_HEADER_FILE
print(f"Updating {DUCKDB_SETTINGS_HEADER_FILE}")
new_content = generate_content(DUCKDB_SETTINGS_HEADER_FILE)
write_content_to_file(new_content, DUCKDB_SETTINGS_HEADER_FILE)
if __name__ == '__main__':
raise ValueError("Please use 'generate_settings.py' instead of running the individual script(s)")

View File

@@ -0,0 +1,61 @@
from .config import SettingsList, VALID_SCOPE_VALUES, find_start_end_indexes, write_content_to_file
# markers
START_MARKER = r'static const ConfigurationOption internal_options\[\] = \{\n'
END_MARKER = r',\s*FINAL_ALIAS};'
# generate the scope code for the ConfigurationOption array and insert into the config file
def generate_scope_code(file):
with open(file, 'r') as source_file:
source_code = source_file.read()
# find the start and end indexes of the settings' scope array
start_index, end_index = find_start_end_indexes(source_code, START_MARKER, END_MARKER, file)
# split source code into sections
before_array = source_code[:start_index] + "\n "
after_array = source_code[end_index:]
# generate new entries for the settings array
new_entries = []
new_aliases = []
for setting in SettingsList:
if setting.is_generic_setting:
if setting.on_set:
new_entries.append([setting.name, f"DUCKDB_SETTING_CALLBACK({setting.struct_name})"])
else:
new_entries.append([setting.name, f"DUCKDB_SETTING({setting.struct_name})"])
elif setting.scope in VALID_SCOPE_VALUES: # valid setting_scope values
new_entries.append([setting.name, f"DUCKDB_{setting.scope}({setting.struct_name})"])
else:
raise ValueError(f"Setting {setting.name} has invalid input scope value")
for alias in setting.aliases:
new_aliases.append([alias, setting.name])
new_entries.sort(key=lambda x: x[0])
new_aliases.sort(key=lambda x: x[0])
entry_indexes = {}
for i in range(len(new_entries)):
entry_indexes[new_entries[i][0]] = i
for alias in new_aliases:
alias_index = entry_indexes[alias[1]]
alias.append(f"DUCKDB_SETTING_ALIAS(\"{alias[0]}\", {alias_index})")
new_array_section = ',\n '.join([x[1] for x in new_entries])
new_array_section += ', FINAL_SETTING};\n\n'
new_array_section += 'static const ConfigurationAlias setting_aliases[] = {'
new_array_section += ',\n '.join([x[2] for x in new_aliases])
return before_array + new_array_section + after_array
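# Illustrative entries (hypothetical settings): a GLOBAL-scoped 'MyFlagSetting' becomes
# 'DUCKDB_GLOBAL(MyFlagSetting)', a generic setting without an on_set callback becomes
# 'DUCKDB_SETTING(MyOtherSetting)', and an alias 'old_flag' for 'my_flag' becomes
# 'DUCKDB_SETTING_ALIAS("old_flag", <index of my_flag in the sorted entries>)'.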
def generate():
from .config import DUCKDB_SETTINGS_SCOPE_FILE
print(f"Updating {DUCKDB_SETTINGS_SCOPE_FILE}")
new_content = generate_scope_code(DUCKDB_SETTINGS_SCOPE_FILE)
write_content_to_file(new_content, DUCKDB_SETTINGS_SCOPE_FILE)
if __name__ == '__main__':
raise ValueError("Please use 'generate_settings.py' instead of running the individual script(s)")

View File

@@ -0,0 +1,18 @@
import re
from .config import SettingsList, write_content_to_file, find_start_end_indexes
from .update_autogenerated_functions import add_autogenerated_functions
def generate():
from .config import DUCKDB_AUTOGENERATED_SETTINGS_FILE
print(f"Updating {DUCKDB_AUTOGENERATED_SETTINGS_FILE}")
new_autogenerated_content, generated = add_autogenerated_functions(DUCKDB_AUTOGENERATED_SETTINGS_FILE)
write_content_to_file(new_autogenerated_content, DUCKDB_AUTOGENERATED_SETTINGS_FILE)
# NOTE: for debugging purposes
# print(f"The total number of settings is {len(SettingsList)}, of which {generated} were autogenerated into {DUCKDB_AUTOGENERATED_SETTINGS_FILE}")
if __name__ == '__main__':
raise ValueError("Please use 'generate_settings.py' instead of running the individual script(s)")

25
external/duckdb/scripts/setup_ubuntu1804.sh vendored Executable file
View File

@@ -0,0 +1,25 @@
#!/bin/bash
# base build and packaging dependencies
apt-get update -y -qq
apt-get install -y -qq software-properties-common
add-apt-repository ppa:git-core/ppa
apt-get update -y -qq
apt-get install -y -qq --fix-missing ninja-build make gcc-multilib g++-multilib libssl-dev wget openjdk-8-jdk zip maven unixodbc-dev libc6-dev-i386 lib32readline6-dev libcurl4-gnutls-dev libexpat1-dev gettext unzip build-essential checkinstall libffi-dev curl libz-dev openssh-client pkg-config
# cross compilation stuff
apt-get install -y -qq gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
# git
wget https://github.com/git/git/archive/refs/tags/v2.18.5.tar.gz
tar xvf v2.18.5.tar.gz
cd git-2.18.5
make
make prefix=/usr install
git --version
# cmake
wget https://github.com/Kitware/CMake/releases/download/v3.21.3/cmake-3.21.3-linux-x86_64.sh
chmod +x cmake-3.21.3-linux-x86_64.sh
./cmake-3.21.3-linux-x86_64.sh --skip-license --prefix=/usr/local
cmake --version

86
external/duckdb/scripts/test_compile.py vendored Normal file
View File

@@ -0,0 +1,86 @@
import os
import sys
import amalgamation
import pickle
import subprocess
# where to cache which files have already been compiled
cache_file = 'amalgamation.cache'
ignored_files = ['utf8proc_data.cpp']
RESUME_AUTO = 0
RESUME_ALWAYS = 1
RESUME_NEVER = 2
# resume behavior
# by default, we resume if the previous test_compile was run on the same commit hash as this one
resume = RESUME_AUTO
for arg in sys.argv:
if arg == '--resume':
resume = RESUME_ALWAYS
elif arg == '--restart':
resume = RESUME_NEVER
if resume == RESUME_NEVER:
try:
os.remove(cache_file)
except:
pass
def get_git_hash():
proc = subprocess.Popen(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE)
return proc.stdout.read().strip()
current_hash = get_git_hash()
# load the cache, and check the commit hash
try:
with open(cache_file, 'rb') as cf:
cache = pickle.load(cf)
if resume == RESUME_AUTO:
# auto resume, check
if cache['commit_hash'] != current_hash:
cache = {}
except:
cache = {}
cache['commit_hash'] = current_hash
def try_compilation(fpath, cache):
if fpath in cache:
return
print(fpath)
cmd = (
'clang++ -std=c++11 -Wno-deprecated -Wno-writable-strings -S -MMD -MF dependencies.d -o deps.s '
+ fpath
+ ' '
+ ' '.join(["-I" + x for x in amalgamation.include_paths])
)
ret = os.system(cmd)
if ret != 0:
raise Exception('Failed compilation of file "' + fpath + '"!\n Command: ' + cmd)
cache[fpath] = True
with open(cache_file, 'wb') as cf:
pickle.dump(cache, cf)
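# Illustrative command (assumed file and include paths): for fpath='src/main/database.cpp'
# the call above runs something like
#   clang++ -std=c++11 -Wno-deprecated -Wno-writable-strings -S -MMD -MF dependencies.d \
#       -o deps.s src/main/database.cpp -Isrc/include ...
# with one -I flag per entry in amalgamation.include_paths.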
def compile_dir(dir, cache):
files = os.listdir(dir)
files.sort()
for fname in files:
if fname in amalgamation.excluded_compilation_files or fname in ignored_files:
continue
fpath = os.path.join(dir, fname)
if os.path.isdir(fpath):
compile_dir(fpath, cache)
elif fname.endswith('.cpp') or fname.endswith('.hpp') or fname.endswith('.c') or fname.endswith('.cc'):
try_compilation(fpath, cache)
# compile all files in the src directory (including headers!) individually
for cdir in amalgamation.compile_directories:
compile_dir(cdir, cache)

22
external/duckdb/scripts/test_docker_images.sh vendored Executable file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bash
TEST="./build/release/duckdb -c 'PRAGMA platform;' && make clean && echo 'DOCKER TEST RESULT: SUCCESS' || (echo 'DOCKER TEST RESULT: FAILURE' && make clean)"
make clean
# Currently not working due to cmake version being too low
# docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb amazonlinux:2 <<< "yum install gcc gcc-c++ git make cmake ninja-build -y && GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb alpine:latest <<< "apk add g++ git make cmake ninja python3 && cmake -Bbuild . && cmake --build build && cmake --install build && g++ -std=c++11 examples/embedded-c++/main.cpp"
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb amazonlinux:latest <<< "yum install clang git make cmake ninja-build -y && GEN=ninja make && $TEST" 2>&1
docker run -i --platform linux/arm64 --rm -v $(pwd):/duckdb --workdir /duckdb alpine:latest <<< "apk add g++ git make cmake ninja && GEN=ninja make && $TEST" 2>&1
docker run -i --platform linux/amd64 --rm -v $(pwd):/duckdb --workdir /duckdb alpine:latest <<< "apk add g++ git make cmake ninja && GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb alpine:latest <<< "apk add g++ git make cmake ninja && GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb alpine:latest <<< "apk add g++ git make cmake ninja python3 && GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb alpine:latest <<< "apk add g++ git make cmake ninja && CXX_STANDARD=23 GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb ubuntu:20.04 <<< "apt-get update && export DEBIAN_FRONTEND=noninteractive && apt-get install g++ git make cmake ninja-build -y && GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb ubuntu:devel <<< "apt-get update && export DEBIAN_FRONTEND=noninteractive && apt-get install g++ git make cmake ninja-build -y && GEN=ninja make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb centos <<< "sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && yum install git make cmake clang -y && make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb fedora <<< "dnf install make cmake ninja-build gcc g++ -y && make && $TEST" 2>&1
docker run -i --rm -v $(pwd):/duckdb --workdir /duckdb ghcr.io/mocusez/duckdb-riscv-ci/duckdb-riscv-ci <<< "apt-get update && export DEBIAN_FRONTEND=noninteractive && apt-get install cmake ninja-build libssl-dev g++-riscv64-linux-gnu -y && GEN=ninja CC='riscv64-linux-gnu-gcc -march=rv64gcv_zicsr_zifencei_zihintpause_zvl256b' CXX='riscv64-linux-gnu-g++ -march=rv64gcv_zicsr_zifencei_zihintpause_zvl256b' DUCKDB_PLATFORM=linux_riscv make && cd / && ./start_qemu.sh && cd /duckdb && make clean && echo 'DOCKER TEST RESULT: SUCCESS' || (echo 'DOCKER TEST RESULT: FAILURE' && make clean)" 2>&1

View File

@@ -0,0 +1,230 @@
import argparse
import os
import sqllogictest
from sqllogictest import SQLParserException, SQLLogicParser, SQLLogicTest
import subprocess
import multiprocessing
import tempfile
import re
from typing import Optional
parser = argparse.ArgumentParser(description="Test serialization")
parser.add_argument("--shell", type=str, help="Shell binary to run", default=os.path.join('build', 'debug', 'duckdb'))
parser.add_argument("--offset", type=int, help="File offset", default=None)
parser.add_argument("--count", type=int, help="File count", default=None)
parser.add_argument('--no-exit', action='store_true', help='Do not exit after a test fails', default=False)
parser.add_argument('--print-failing-only', action='store_true', help='Print failing tests only', default=False)
parser.add_argument(
'--include-extensions', action='store_true', help='Include test files of out-of-tree extensions', default=False
)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--test-file", type=str, help="Path to the SQL logic file", default='')
group.add_argument(
"--test-list", type=str, help="Path to the file that contains a newline separated list of test files", default=''
)
group.add_argument("--all-tests", action='store_true', help="Run all tests", default=False)
args = parser.parse_args()
def extract_git_urls(script: str):
pattern = r'GIT_URL\s+(https?://\S+)'
return re.findall(pattern, script)
import os
import requests
from urllib.parse import urlparse
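# Helpers for --include-extensions: the GIT_URL entries in .github/config/out_of_tree_extensions.cmake are used to
# fetch each extension's test/sql directory through the GitHub contents API into the extension-test-files/ folder.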
def download_directory_contents(api_url, local_path, headers):
response = requests.get(api_url, headers=headers)
if response.status_code != 200:
print(f"⚠️ Could not access {api_url}: {response.status_code}")
return
os.makedirs(local_path, exist_ok=True)
for item in response.json():
item_type = item.get("type")
item_name = item.get("name")
if item_type == "file":
download_url = item.get("download_url")
if not download_url:
continue
file_path = os.path.join(local_path, item_name)
file_resp = requests.get(download_url)
if file_resp.status_code == 200:
with open(file_path, "wb") as f:
f.write(file_resp.content)
print(f" - Downloaded {file_path}")
else:
print(f" - Failed to download {file_path}")
elif item_type == "dir":
subdir_api_url = item.get("url")
subdir_local_path = os.path.join(local_path, item_name)
download_directory_contents(subdir_api_url, subdir_local_path, headers)
def download_test_sql_folder(repo_url, base_folder="extension-test-files"):
repo_name = urlparse(repo_url).path.strip("/").split("/")[-1]
target_folder = os.path.join(base_folder, repo_name)
if os.path.exists(target_folder):
print(f"✓ Skipping {repo_name}, already exists.")
return
print(f"⬇️ Downloading test/sql from {repo_name}...")
api_url = f"https://api.github.com/repos/duckdb/{repo_name}/contents/test/sql?ref=main"
GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
headers = {"Accept": "application/vnd.github.v3+json", "Authorization": f"Bearer {GITHUB_TOKEN}"}
download_directory_contents(api_url, target_folder, headers)
def batch_download_all_test_sql():
filename = ".github/config/out_of_tree_extensions.cmake"
if not os.path.isfile(filename):
raise Exception(f"File {filename} not found")
with open(filename, "r") as f:
content = f.read()
urls = extract_git_urls(content)
if urls == []:
print("No URLs found.")
for url in urls:
download_test_sql_folder(url)
def find_tests_recursive(dir, excluded_paths):
test_list = []
for f in os.listdir(dir):
path = os.path.join(dir, f)
if path in excluded_paths:
continue
if os.path.isdir(path):
test_list += find_tests_recursive(path, excluded_paths)
elif path.endswith('.test') or path.endswith('.test_slow'):
test_list.append(path)
return test_list
def parse_test_file(filename):
if not os.path.isfile(filename):
raise Exception(f"File {filename} not found")
parser = SQLLogicParser()
try:
out: Optional[SQLLogicTest] = parser.parse(filename)
if not out:
raise SQLParserException(f"Test {filename} could not be parsed")
except:
return []
loop_count = 0
statements = []
for stmt in out.statements:
if type(stmt) is sqllogictest.statement.skip.Skip:
# mode skip - just skip entire test
break
if type(stmt) is sqllogictest.statement.loop.Loop or type(stmt) is sqllogictest.statement.foreach.Foreach:
loop_count += 1
if type(stmt) is sqllogictest.statement.endloop.Endloop:
loop_count -= 1
if loop_count > 0:
# loops are ignored currently
continue
if not (
type(stmt) is sqllogictest.statement.query.Query or type(stmt) is sqllogictest.statement.statement.Statement
):
# only handle query and statement nodes for now
continue
if type(stmt) is sqllogictest.statement.statement.Statement:
# skip expected errors
if stmt.expected_result.type == sqllogictest.ExpectedResult.Type.ERROR:
if any(
"parser error" in line.lower() or "syntax error" in line.lower()
for line in stmt.expected_result.lines
):
continue
query = ' '.join(stmt.lines)
statements.append(query)
return statements
def run_test_case(args_tuple):
i, file, shell, print_failing_only = args_tuple
results = []
if not print_failing_only:
print(f"Run test {i}: {file}")
statements = parse_test_file(file)
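# Each statement is dollar-quoted and written to an -init script; a non-zero exit code or an ' Error:' marker
# in stderr means the PEG parser failed on this statement, and the test file is reported as failing.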
for statement in statements:
with tempfile.TemporaryDirectory() as tmpdir:
peg_sql_path = os.path.join(tmpdir, 'peg_test.sql')
with open(peg_sql_path, 'w') as f:
f.write(f'CALL check_peg_parser($TEST_PEG_PARSER${statement}$TEST_PEG_PARSER$);\n')
proc = subprocess.run([shell, '-init', peg_sql_path, '-c', '.exit'], capture_output=True)
stderr = proc.stderr.decode('utf8')
if proc.returncode == 0 and ' Error:' not in stderr:
continue
if print_failing_only:
print(f"Failed test {i}: {file}")
else:
print(f'Failed')
print(f'-- STDOUT --')
print(proc.stdout.decode('utf8'))
print(f'-- STDERR --')
print(stderr)
results.append((file, statement))
break
return results
if __name__ == "__main__":
files = []
excluded_tests = {
'test/sql/peg_parser', # Fails for some reason
'test/sql/prepared/parameter_variants.test', # PostgreSQL parser bug with ?1
'test/sql/copy/s3/download_config.test', # Unknown why this passes in SQLLogicTest
'test/sql/function/list/lambdas/arrow/lambda_scope_deprecated.test', # Error in the tokenization of *+*
'test/sql/catalog/function/test_simple_macro.test', # Bug when mixing named parameters and non-named
}
if args.all_tests:
# run all tests
test_dir = os.path.join('test', 'sql')
files = find_tests_recursive(test_dir, excluded_tests)
if args.include_extensions:
batch_download_all_test_sql()
extension_files = find_tests_recursive('extension-test-files', {})
files = files + extension_files
elif len(args.test_list) > 0:
with open(args.test_list, 'r') as f:
files = [x.strip() for x in f.readlines() if x.strip() not in excluded_tests]
else:
# run a single test
files.append(args.test_file)
files.sort()
start = args.offset if args.offset is not None else 0
end = start + args.count if args.count is not None else len(files)
work_items = [(i, files[i], args.shell, args.print_failing_only) for i in range(start, end)]
if not args.no_exit:
# Run tests sequentially (no multiprocessing) so we can exit immediately after the first failure
failed_test_list = []
for item in work_items:
res = run_test_case(item)
if res:
failed_test_list.extend(res)
exit(1)
else:
with multiprocessing.Pool() as pool:
results = pool.map(run_test_case, work_items)
failed_test_list = [item for sublist in results for item in sublist]
failed_tests = len(failed_test_list)
print("List of failed tests: ")
for test, statement in failed_test_list:
print(f"{test}\n{statement}\n\n")
print(f"Total of {failed_tests} out of {len(files)} failed ({round(failed_tests/len(files) * 100,2)}%). ")

View File

@@ -0,0 +1,226 @@
import sqllogictest
from sqllogictest import SQLParserException, SQLLogicParser, SQLLogicTest
import duckdb
from typing import Optional
import argparse
import shutil
import os
import subprocess
# example usage: python3 scripts/test_serialization_bwc.py --old-source ../duckdb-bugfix --test-file test/sql/aggregate/aggregates/test_median.test
serialized_path = os.path.join('test', 'api', 'serialized_plans')
db_load_path = os.path.join(serialized_path, 'db_load.sql')
queries_path = os.path.join(serialized_path, 'queries.sql')
result_binary = os.path.join(serialized_path, 'serialized_plans.binary')
unittest_binary = os.path.join('build', 'debug', 'test', 'unittest')
def complete_query(q):
q = q.strip()
if q.endswith(';'):
return q
return q + ';'
def parse_test_file(filename):
parser = SQLLogicParser()
try:
out: Optional[SQLLogicTest] = parser.parse(filename)
if not out:
raise SQLParserException(f"Test {filename} could not be parsed")
except:
return {'load': [], 'query': []}
loop_count = 0
load_statements = []
query_statements = []
for stmt in out.statements:
if type(stmt) is sqllogictest.statement.skip.Skip:
# mode skip - just skip entire test
break
if type(stmt) is sqllogictest.statement.loop.Loop or type(stmt) is sqllogictest.statement.foreach.Foreach:
loop_count += 1
if type(stmt) is sqllogictest.statement.endloop.Endloop:
loop_count -= 1
if loop_count > 0:
# loops are ignored currently
continue
if not (
type(stmt) is sqllogictest.statement.query.Query or type(stmt) is sqllogictest.statement.statement.Statement
):
# only handle query and statement nodes for now
continue
if type(stmt) is sqllogictest.statement.statement.Statement:
# skip expected errors
if stmt.expected_result.type == sqllogictest.ExpectedResult.Type.ERROR:
continue
query = ' '.join(stmt.lines)
try:
sql_stmt_list = duckdb.extract_statements(query)
except KeyboardInterrupt:
raise
except:
continue
for sql_stmt in sql_stmt_list:
if sql_stmt.type == duckdb.StatementType.SELECT:
query_statements.append(query)
elif sql_stmt.type == duckdb.StatementType.PRAGMA:
continue
else:
load_statements.append(query)
return {'load': load_statements, 'query': query_statements}
def build_sources(old_source, new_source):
# generate the sources
current_path = os.getcwd()
os.chdir(old_source)
# build if not yet built
if not os.path.isfile(unittest_binary):
res = subprocess.run(['make', 'debug']).returncode
if res != 0:
raise Exception("Failed to build old sources")
# run the verification
os.chdir(current_path)
os.chdir(new_source)
# build if not yet built
if not os.path.isfile(unittest_binary):
res = subprocess.run(['make', 'debug']).returncode
if res != 0:
raise Exception("Failed to build new sources")
os.chdir(current_path)
def run_test(filename, old_source, new_source, no_exit):
statements = parse_test_file(filename)
# generate the sources
current_path = os.getcwd()
os.chdir(old_source)
# write the files
with open(os.path.join(old_source, db_load_path), 'w+') as f:
for stmt in statements['load']:
f.write(complete_query(stmt) + '\n')
with open(os.path.join(old_source, queries_path), 'w+') as f:
for stmt in statements['query']:
f.write(complete_query(stmt) + '\n')
# generate the serialization
my_env = os.environ.copy()
my_env['GEN_PLAN_STORAGE'] = '1'
res = subprocess.run(['build/debug/test/unittest', 'Generate serialized plans file'], env=my_env).returncode
if res != 0:
print(f"SKIPPING TEST {filename}")
return True
os.chdir(current_path)
# copy over the files
for f in [db_load_path, queries_path, result_binary]:
shutil.copy(os.path.join(old_source, f), os.path.join(new_source, f))
# run the verification
os.chdir(new_source)
res = subprocess.run(['build/debug/test/unittest', "Test deserialized plans from file"]).returncode
if res != 0:
if no_exit:
print("BROKEN TEST")
with open('broken_tests.list', 'a') as f:
f.write(filename + '\n')
return False
raise Exception("Deserialization failure")
os.chdir(current_path)
return True
def parse_excluded_tests(path):
exclusion_list = {}
with open(path) as f:
for line in f:
if len(line.strip()) == 0 or line[0] == '#':
continue
exclusion_list[line.strip()] = True
return exclusion_list
def find_tests_recursive(dir, excluded_paths):
test_list = []
for f in os.listdir(dir):
path = os.path.join(dir, f)
if path in excluded_paths:
continue
if os.path.isdir(path):
test_list += find_tests_recursive(path, excluded_paths)
elif path.endswith('.test'):
test_list.append(path)
return test_list
def main():
parser = argparse.ArgumentParser(description="Test serialization")
parser.add_argument("--new-source", type=str, help="Path to the new source", default='.')
parser.add_argument("--old-source", type=str, help="Path to the old source")
parser.add_argument("--start-at", type=str, help="Start running tests at this specific test", default=None)
parser.add_argument("--no-exit", action="store_true", help="Keep running even if a test fails", default=False)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--test-file", type=str, help="Path to the SQL logic file", default='')
group.add_argument("--all-tests", action='store_true', help="Run all tests", default=False)
group.add_argument("--test-list", type=str, help="Load tests to run from a file list", default=None)
args = parser.parse_args()
old_source = args.old_source
new_source = args.new_source
files = []
if args.all_tests:
# run all tests
excluded_tests = parse_excluded_tests(
os.path.join(new_source, 'test', 'api', 'serialized_plans', 'excluded_tests.list')
)
test_dir = os.path.join('test', 'sql')
if new_source != '.':
test_dir = os.path.join(new_source, test_dir)
files = find_tests_recursive(test_dir, excluded_tests)
elif args.test_list is not None:
with open(args.test_list, 'r') as f:
for line in f:
if len(line.strip()) == 0:
continue
files.append(line.strip())
else:
# run a single test
files.append(args.test_file)
files.sort()
current_path = os.getcwd()
try:
build_sources(old_source, new_source)
all_succeeded = True
started = False
if args.start_at is None:
started = True
for filename in files:
if not started:
if filename == args.start_at:
started = True
else:
continue
print(f"Run test {filename}")
os.chdir(current_path)
if not run_test(filename, old_source, new_source, args.no_exit):
all_succeeded = False
if not all_succeeded:
exit(1)
except:
raise
finally:
os.chdir(current_path)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,229 @@
import argparse
import os
import subprocess
import re
import csv
from pathlib import Path
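# Checks storage backwards compatibility: each test is run with the new unittest binary writing to --db-name,
# and the resulting database file is then opened and queried with the older DuckDB CLIs given via --old-cli or --versions.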
parser = argparse.ArgumentParser(description='Run storage backwards-compatibility tests: write databases with the new build and read them back with older DuckDB CLIs.')
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--old-cli', action='store', help='Path to the CLI of the old DuckDB version to test')
group.add_argument('--versions', type=str, action='store', help='DuckDB versions to test')
parser.add_argument('--new-unittest', action='store', help='Path to the new unittester to run', required=True)
parser.add_argument('--new-cli', action='store', help='Path to the new CLI to run', default=None)
parser.add_argument('--compatibility', action='store', help='Storage compatibility version', default='v1.0.0')
parser.add_argument(
'--test-config', action='store', help='Test config script to run', default='test/configs/storage_compatibility.json'
)
parser.add_argument('--db-name', action='store', help='Database name to write to', default='bwc_storage_test.db')
parser.add_argument('--abort-on-failure', action='store_true', help='Abort on first failure', default=False)
parser.add_argument('--start-offset', type=int, action='store', help='Test start offset', default=None)
parser.add_argument('--end-offset', type=int, action='store', help='Test end offset', default=None)
parser.add_argument('--no-summarize-failures', action='store_true', help='Skip failure summary', default=False)
parser.add_argument('--list-versions', action='store_true', help='Only list versions to test', default=False)
parser.add_argument(
'--run-empty-tests',
action='store_true',
help="Run tests that don't have a CREATE TABLE or CREATE VIEW statement",
default=False,
)
args, extra_args = parser.parse_known_args()
programs_to_test = []
if args.versions is not None:
version_splits = args.versions.split('|')
for version in version_splits:
cli_path = os.path.join(Path.home(), '.duckdb', 'cli', version, 'duckdb')
if not os.path.isfile(cli_path):
os.system(f'curl https://install.duckdb.org | DUCKDB_VERSION={version} sh')
programs_to_test.append(cli_path)
else:
programs_to_test.append(args.old_cli)
unittest_program = args.new_unittest
db_name = args.db_name
new_cli = args.new_unittest.replace('test/unittest', 'duckdb') if args.new_cli is None else args.new_cli
summarize_failures = not args.no_summarize_failures
# Use the '-l' parameter to output the list of tests to run
proc = subprocess.run(
[unittest_program, '--test-config', args.test_config, '-l'] + extra_args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout = proc.stdout.decode('utf8').strip()
stderr = proc.stderr.decode('utf8').strip()
if len(stderr) > 0:
print("Failed to run program " + unittest_program)
print("Returncode:", proc.returncode)
print(stdout)
print(stderr)
exit(1)
# The output is in the format of 'PATH\tGROUP', we're only interested in the PATH portion
test_cases = []
first_line = True
for line in stdout.splitlines():
if first_line:
first_line = False
continue
if len(line.strip()) == 0:
continue
splits = line.rsplit('\t', 1)
test_cases.append(splits[0])
test_cases.sort()
if args.compatibility != 'v1.0.0':
raise Exception("Only v1.0.0 is supported for now (FIXME)")
def escape_cmd_arg(arg):
if '"' in arg or '\'' in arg or ' ' in arg or '\\' in arg:
arg = arg.replace('\\', '\\\\')
arg = arg.replace('"', '\\"')
arg = arg.replace("'", "\\'")
return f'"{arg}"'
return arg
error_container = []
def handle_failure(test, cmd, msg, stdout, stderr, returncode):
print(f"==============FAILURE============")
print(test)
print(f"==============MESSAGE============")
print(msg)
print(f"==============COMMAND============")
cmd_str = ''
for entry in cmd:
cmd_str += escape_cmd_arg(entry) + ' '
print(cmd_str.strip())
print(f"==============RETURNCODE=========")
print(str(returncode))
print(f"==============STDOUT=============")
print(stdout)
print(f"==============STDERR=============")
print(stderr)
print(f"=================================")
if args.abort_on_failure:
exit(1)
else:
error_container.append({'test': test, 'stderr': stderr})
def run_program(cmd, description):
proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout = proc.stdout.decode('utf8').strip()
stderr = proc.stderr.decode('utf8').strip()
if proc.returncode != 0:
return {
'test': test,
'cmd': cmd,
'msg': f'Failed to {description}',
'stdout': stdout,
'stderr': stderr,
'returncode': proc.returncode,
}
return None
def try_run_program(cmd, description):
result = run_program(cmd, description)
if result is None:
return True
handle_failure(**result)
return False
index = 0
start = 0 if args.start_offset is None else args.start_offset
end = len(test_cases) if args.end_offset is None else args.end_offset
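# Per-test flow: run the test with the new unittest binary, list the tables/views in the produced database with
# SHOW ALL TABLES, then query every table with each old CLI. A failure is only reported when the new CLI can still
# query the same tables, since old CLIs may legitimately fail on e.g. views that reference deleted files.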
for i in range(start, end):
test = test_cases[i]
skipped = ''
if not args.run_empty_tests:
with open(test, 'r') as f:
test_contents = f.read().lower()
if 'create table' not in test_contents and 'create view' not in test_contents:
skipped = ' (SKIPPED)'
print(f'[{i}/{len(test_cases)}]: {test}{skipped}')
if skipped != '':
continue
# remove the old db
try:
os.remove(db_name)
except:
pass
cmd = [unittest_program, '--test-config', args.test_config, test]
if not try_run_program(cmd, 'Run Test'):
continue
if not os.path.isfile(db_name):
# db not created
continue
cmd = [
programs_to_test[-1],
db_name,
'-c',
'.headers off',
'-csv',
'-c',
'.output table_list.csv',
'-c',
'SHOW ALL TABLES',
]
if not try_run_program(cmd, 'List Tables'):
continue
tables = []
with open('table_list.csv', newline='') as f:
reader = csv.reader(f)
for row in reader:
tables.append((row[1], row[2]))
# no tables / views
if len(tables) == 0:
continue
# read all tables / views
failures = []
for cli in programs_to_test:
cmd = [cli, db_name]
for table in tables:
schema_name = table[0].replace('"', '""')
table_name = table[1].replace('"', '""')
cmd += ['-c', f'FROM "{schema_name}"."{table_name}"']
failure = run_program(cmd, 'Query Tables')
if failure is not None:
failures.append(failure)
if len(failures) > 0:
# we failed to query the tables
# this MIGHT be expected - e.g. we might have views that reference stale state (e.g. files that are deleted)
# try to run it with the new CLI - if this succeeds we have a problem
new_cmd = [new_cli] + cmd[1:]
new_failure = run_program(new_cmd, 'Query Tables (New)')
if new_failure is None:
# we succeeded with the new CLI - report the failure
for failure in failures:
handle_failure(**failure)
continue
if len(error_container) == 0:
exit(0)
if summarize_failures:
print(
'''\n\n====================================================
================ FAILURES SUMMARY ================
====================================================\n
'''
)
for i, error in enumerate(error_container, start=1):
print(f"\n{i}:", error["test"], "\n")
print(error["stderr"])
exit(1)

View File

@@ -0,0 +1,162 @@
import os
import argparse
import subprocess
import shutil
parser = argparse.ArgumentParser(
description='''Runs storage tests once with explicit one-initialization and once with explicit zero-initialization, and verifies that the resulting storage files are identical.
This verifies that all memory is initialized before it is written to disk, which prevents in-memory data from leaking into storage files through uninitialized memory.'''
)
parser.add_argument('--unittest', default='build/debug/test/unittest', help='path to unittest', dest='unittest')
parser.add_argument(
'--zero_init_dir',
default='test_zero_init_db',
help='directory to write zero-initialized databases to',
dest='zero_init_dir',
)
parser.add_argument(
'--standard_dir', default='test_standard_db', help='directory to write regular databases to', dest='standard_dir'
)
args = parser.parse_args()
test_list = [
'test/sql/index/art/storage/test_art_checkpoint.test',
'test/sql/storage/compression/simple_compression.test',
'test/sql/storage/delete/test_store_deletes.test',
'test/sql/storage/mix/test_update_delete_string.test',
'test/sql/storage/nested/struct_of_lists_unaligned.test',
'test/sql/storage/test_store_integers.test',
'test/sql/storage/test_store_nulls_strings.test',
'test/sql/storage/update/test_store_null_updates.test',
]
def run_test(args):
res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout = res.stdout.decode('utf8').strip()
stderr = res.stderr.decode('utf8').strip()
if res.returncode != 0:
print("Failed to run test!")
print("----------COMMAND-----------")
print(' '.join(args))
print("----------STDOUT-----------")
print(stdout)
print("----------STDERR-----------")
print(stderr)
print("---------------------")
exit(1)
header_size = 4096 * 3
block_size = 262144
checksum_size = 8
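# These constants are assumed to match DuckDB's default storage layout: three 4 KiB headers at the start of the
# file, followed by 256 KiB blocks that each begin with an 8-byte checksum. Adjust them if the block size differs.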
def handle_error(i, standard_db, zero_init_db, standard_data, zero_data):
print("------------------------------------------------------------------")
print(f"FAIL - Mismatch between one-initialized and zero-initialized databases at byte position {i}")
print("------------------------------------------------------------------")
print(f"One-initialized database {standard_db} - byte value {standard_data}")
print(f"Zero-initialized database {zero_init_db} - byte value {zero_data}")
if i < header_size:
print("This byte is in the initial headers of the file")
else:
byte_pos = (i - header_size) % block_size
if byte_pos >= checksum_size:
print(
f"This byte is in block id {(i - header_size) // block_size} at byte position {byte_pos - checksum_size} (position {byte_pos} including the block checksum)"
)
else:
print(f"This byte is in block id {(i - header_size) // block_size} at byte position {byte_pos}")
print("This is in the checksum part of the block")
print("------------------------------------------------------------------")
print(
"This error likely means that memory was not correctly zero-initialized in a block before being written out to disk."
)
def compare_database(standard_db, zero_init_db):
with open(standard_db, 'rb') as f:
standard_data = f.read()
with open(zero_init_db, 'rb') as f:
zero_data = f.read()
if len(standard_data) != len(zero_data):
print(
f"FAIL - Length mismatch between database {standard_db} ({str(len(standard_data))}) and {zero_init_db} ({str(len(zero_data))})"
)
return False
found_error = None
for i in range(len(standard_data)):
if standard_data[i] != zero_data[i]:
if i > header_size:
byte_pos = (i - header_size) % block_size
if byte_pos <= 8:
# different checksum, skip because it does not tell us anything!
if found_error is None:
found_error = i
continue
handle_error(i, standard_db, zero_init_db, standard_data[i], zero_data[i])
return False
if found_error is not None:
i = found_error
handle_error(i, standard_db, zero_init_db, standard_data[i], zero_data[i])
return False
print("Success!")
return True
def compare_files(standard_dir, zero_init_dir):
standard_list = os.listdir(standard_dir)
zero_init_list = os.listdir(zero_init_dir)
standard_list.sort()
zero_init_list.sort()
if standard_list != zero_init_list:
print(
f"FAIL - Directories contain mismatching files (standard - {str(standard_list)}, zero init - {str(zero_init_list)})"
)
return False
if len(standard_list) == 0:
print("FAIL - Directory is empty!")
return False
success = True
for entry in standard_list:
if not compare_database(os.path.join(standard_dir, entry), os.path.join(zero_init_dir, entry)):
success = False
return success
def clear_directories(directories):
for dir in directories:
try:
shutil.rmtree(dir)
except FileNotFoundError as e:
pass
test_dirs = [args.standard_dir, args.zero_init_dir]
success = True
for test in test_list:
print(f"Running test {test}")
clear_directories(test_dirs)
standard_args = [args.unittest, '--test-temp-dir', args.standard_dir, '--one-initialize', '--single-threaded', test]
zero_init_args = [
args.unittest,
'--test-temp-dir',
args.zero_init_dir,
'--zero-initialize',
'--single-threaded',
test,
]
print(f"Running test in one-initialize mode")
run_test(standard_args)
print(f"Running test in zero-initialize mode")
run_test(zero_init_args)
if not compare_files(args.standard_dir, args.zero_init_dir):
success = False
clear_directories(test_dirs)
if not success:
exit(1)

48
external/duckdb/scripts/try_timeout.py vendored Normal file
View File

@@ -0,0 +1,48 @@
import os
import sys
import subprocess
import threading
if len(sys.argv) < 3:
print("Expected python3 scripts/try_timeout.py --timeout=[timeout] --retry=[retries] [cmd] [options...]")
print("Timeout should be given in seconds")
exit(1)
timeout = int(sys.argv[1].replace("--timeout=", ""))
retries = int(sys.argv[2].replace("--retry=", ""))
cmd = sys.argv[3:]
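# Command runs the given command in a background thread and terminates the process once the timeout expires;
# a non-zero return code (including a timeout) leads to another attempt, up to the requested number of retries.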
class Command(object):
def __init__(self, cmd):
self.cmd = cmd
self.process = None
def run(self, timeout):
self.process = None
def target():
self.process = subprocess.Popen(self.cmd)
self.process.communicate()
thread = threading.Thread(target=target)
thread.start()
thread.join(timeout)
if thread.is_alive():
print('Terminating process: process exceeded timeout of ' + str(timeout) + ' seconds')
self.process.terminate()
thread.join()
if self.process is None:
return 1
return self.process.returncode
for i in range(retries):
print("Attempting to run command \"" + ' '.join(cmd) + '"')
command = Command(cmd)
returncode = command.run(timeout)
if returncode == 0:
exit(0)
exit(1)

View File

@@ -0,0 +1,64 @@
#!/bin/bash
# Main extension uploading script
# Usage: ./scripts/upload-staging-asset.sh <folder> <file>*
# <folder> : Folder to upload to
# <file> : File to be uploaded
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Usage: ./scripts/upload-staging-asset.sh <folder> <file1> [... <fileN>]"
exit 1
fi
set -e
# skip if repo is not in duckdb organization
if [ "$GITHUB_REPOSITORY_OWNER" != "duckdb" ]; then
echo "Repository is $GITHUB_REPOSITORY_OWNER (not duckdb)"
exit 0
fi
FOLDER="$1"
DRY_RUN_PARAM=""
# dryrun if repo is not duckdb/duckdb
if [ "$GITHUB_REPOSITORY" != "duckdb/duckdb" ]; then
echo "Repository is $GITHUB_REPOSITORY (not duckdb/duckdb)"
DRY_RUN_PARAM="--dryrun"
fi
# dryrun if we are not in main
if [ "$GITHUB_REF" != "refs/heads/main" ]; then
echo "git ref is $GITHUB_REF (not refs/heads/main)"
DRY_RUN_PARAM="--dryrun"
fi
if [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then
echo "overriding DRY_RUN_PARAM, forcing upload"
DRY_RUN_PARAM=""
fi
# dryrun if AWS key is not set
if [ -z "$AWS_ACCESS_KEY_ID" ]; then
echo "No access key available"
DRY_RUN_PARAM="--dryrun"
fi
TARGET=$(git log -1 --format=%h)
if [ "$UPLOAD_ASSETS_TO_STAGING_TARGET" ]; then
TARGET="$UPLOAD_ASSETS_TO_STAGING_TARGET"
fi
# decide target for staging
if [ "$OVERRIDE_GIT_DESCRIBE" ]; then
TARGET="$TARGET/$OVERRIDE_GIT_DESCRIBE"
fi
python3 -m pip install awscli
for var in "${@: 2}"
do
aws s3 cp "$var" "s3://duckdb-staging/$TARGET/$GITHUB_REPOSITORY/$FOLDER/" $DRY_RUN_PARAM --region us-east-2
done

View File

@@ -0,0 +1,62 @@
from cxxheaderparser.parser import CxxParser, ParserOptions
from cxxheaderparser.visitor import CxxVisitor
from cxxheaderparser.preprocessor import make_pcpp_preprocessor
from cxxheaderparser.parserstate import NamespaceBlockState
from cxxheaderparser.types import EnumDecl
import textwrap
import os
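# Visitor is invoked by cxxheaderparser for every enum declaration (all other visitor callbacks are no-ops via
# __getattr__); it rejects anonymous enums, constants without an explicit value, and duplicate values within an enum.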
class Visitor:
def on_enum(self, state: NamespaceBlockState, cursor: EnumDecl) -> None:
enum_name = cursor.typename.segments[0].format()
if '<' in enum_name:
raise Exception(
"Enum '{}' is an anonymous enum, please name it\n".format(cursor.doxygen[3:] if cursor.doxygen else '')
)
enum_constants = dict()
for enum_const in cursor.values:
name = enum_const.name.format()
if enum_const.value is None:
raise Exception(f"Enum constant '{name}' in '{enum_name}' does not have an explicit value assignment.")
value = enum_const.value.format()
if value in enum_constants:
other_constant = enum_constants[value]
error = f"""
Enum '{enum_name}' contains a duplicate value:
Value {value} is defined for both '{other_constant}' and '{name}'
"""
error = textwrap.dedent(error)
raise Exception(error)
enum_constants[value] = name
print(f"Successfully verified the integrity of enum {enum_name} ({len(enum_constants)} entries)")
def __getattr__(self, name):
return lambda *args, **kwargs: True
def parse_enum(file_path):
# Create index
parser = CxxParser(
file_path,
None,
visitor=Visitor(),
options=ParserOptions(preprocessor=make_pcpp_preprocessor()),
)
parser.parse()
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Parse a C header file and check enum integrity.")
parser.add_argument("file_path", type=str, help="Path to the C header file")
args = parser.parse_args()
file_path = args.file_path
if not os.path.exists(file_path):
raise Exception(f"Error: file '{file_path}' does not exist")
parse_enum(file_path)

21
external/duckdb/scripts/windows_ci.py vendored Normal file
View File

@@ -0,0 +1,21 @@
import os
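# Patches duckdb/common/common.hpp for the Windows CI build so that every file in the main library pulls in
# duckdb/common/windows.hpp right after '#pragma once'.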
common_path = os.path.join('src', 'include', 'duckdb', 'common', 'common.hpp')
with open(common_path, 'r') as f:
text = f.read()
text = text.replace(
'#pragma once',
'''#pragma once
#ifdef _WIN32
#ifdef DUCKDB_MAIN_LIBRARY
#include "duckdb/common/windows.hpp"
#endif
#endif
''',
)
with open(common_path, 'w+') as f:
f.write(text)