Files
email-tracker/external/duckdb/scripts/amalgamation.py
2025-10-24 19:21:19 -05:00

609 lines
22 KiB
Python

# this script creates a single header + source file combination out of the DuckDB sources
import os
import re
import sys
import shutil
import subprocess
from python_helpers import open_utf8, normalize_path
# directory (relative to the working directory) that receives the amalgamation output
amal_dir = os.path.join('src', 'amalgamation')
# default output paths; the __main__ driver overrides them for --header= / --source=
header_file = os.path.join(amal_dir, "duckdb.hpp")
source_file = os.path.join(amal_dir, "duckdb.cpp")
# scratch files the generators write first; copy_if_different then moves them into place
temp_header = 'duckdb.hpp.tmp'
temp_source = 'duckdb.cpp.tmp'
# when True, get_includes ignores every include whose path contains 'duckdb'
skip_duckdb_includes = False
# root of the sources to amalgamate and of the public headers
src_dir = 'src'
include_dir = os.path.join('src', 'include')
# files included in the amalgamated "duckdb.hpp" file
# (generate_duckdb_hpp passes each entry to write_file, in this order)
main_header_files = [
os.path.join(include_dir, 'duckdb.hpp'),
os.path.join(include_dir, 'duckdb.h'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'date.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'adbc', 'adbc.h'),
os.path.join(include_dir, 'duckdb', 'common', 'adbc', 'adbc.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'arrow', 'arrow.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'arrow', 'arrow_converter.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'arrow', 'arrow_wrapper.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'blob.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'decimal.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'hugeint.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'uhugeint.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'uuid.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'interval.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'timestamp.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'time.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'buffered_file_writer.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'memory_stream.hpp'),
os.path.join(include_dir, 'duckdb', 'main', 'appender.hpp'),
os.path.join(include_dir, 'duckdb', 'main', 'client_context.hpp'),
os.path.join(include_dir, 'duckdb', 'main', 'extension', 'extension_loader.hpp'),
os.path.join(include_dir, 'duckdb', 'function', 'function.hpp'),
os.path.join(include_dir, 'duckdb', 'function', 'table_function.hpp'),
os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_table_function_info.hpp'),
os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_copy_function_info.hpp'),
]
# set to True by the --extended command-line switch; generate_duckdb_hpp then also
# emits DUCKDB_AMALGAMATION_EXTENDED and the extra headers appended below
extended_amalgamation = False
if '--extended' in sys.argv:

    def add_include_dir(dirpath):
        """Return the path of every entry directly inside ``dirpath``, sorted by name.

        ``os.listdir`` yields entries in arbitrary order; sorting keeps the header
        order (and therefore the generated file) stable between runs, matching the
        other directory walkers in this script, which all sort their listings.
        """
        return [os.path.join(dirpath, x) for x in sorted(os.listdir(dirpath))]

    extended_amalgamation = True
    main_header_files += [
        os.path.join(include_dir, x)
        for x in [
            'duckdb/planner/expression/bound_constant_expression.hpp',
            'duckdb/planner/expression/bound_function_expression.hpp',
            'duckdb/catalog/catalog_entry/scalar_function_catalog_entry.hpp',
            'duckdb/parser/parsed_data/create_table_info.hpp',
            'duckdb/planner/parsed_data/bound_create_table_info.hpp',
            'duckdb/parser/constraints/not_null_constraint.hpp',
            'duckdb/storage/data_table.hpp',
            'duckdb/function/pragma_function.hpp',
            'duckdb/parser/qualified_name.hpp',
            'duckdb/parser/parser.hpp',
            'duckdb/planner/binder.hpp',
            'duckdb/storage/object_cache.hpp',
            'duckdb/planner/table_filter.hpp',
            "duckdb/storage/statistics/base_statistics.hpp",
            "duckdb/planner/filter/conjunction_filter.hpp",
            "duckdb/planner/filter/constant_filter.hpp",
            "duckdb/common/types/vector_cache.hpp",
            "duckdb/common/string_map_set.hpp",
            "duckdb/planner/filter/null_filter.hpp",
            "duckdb/common/arrow/arrow_wrapper.hpp",
            "duckdb/common/hive_partitioning.hpp",
            "duckdb/common/multi_file/union_by_name.hpp",
            "duckdb/planner/operator/logical_get.hpp",
            "duckdb/common/compressed_file_system.hpp",
        ]
    ]
    # whole header directories that are pulled in for the extended amalgamation
    main_header_files += add_include_dir(os.path.join(include_dir, 'duckdb/parser/expression'))
    main_header_files += add_include_dir(os.path.join(include_dir, 'duckdb/parser/parsed_data'))
    main_header_files += add_include_dir(os.path.join(include_dir, 'duckdb/parser/tableref'))
# applied in both modes, so later comparisons see normalized paths
main_header_files = normalize_path(main_header_files)
import package_build
# include paths for where to search for include files during amalgamation
# (get_includes tries them in order and uses the first directory that has the file)
include_paths = [include_dir] + package_build.third_party_includes()
# paths of where to look for files to compile and include to the final amalgamation
compile_directories = [src_dir] + package_build.third_party_sources()
# files always excluded
# (full paths; need_to_write_file rejects them regardless of ignore_excluded)
always_excluded = normalize_path(
[
'src/amalgamation/duckdb.cpp',
'src/amalgamation/duckdb.hpp',
'src/amalgamation/parquet-amalgamation.cpp',
'src/amalgamation/parquet-amalgamation.hpp',
]
)
# files excluded from the amalgamation
# (bare file names, compared against the last path component)
excluded_files = ['grammar.cpp', 'grammar.hpp', 'symbols.cpp']
# files excluded from individual file compilation during test_compile
excluded_compilation_files = excluded_files + ['gram.hpp', 'kwlist.hpp', "duckdb-c.cpp"]
# when True, write_file and gather_file emit "#line" directives into the output;
# toggled by the --linenumbers / --no-linenumbers command-line switches
linenumbers = False
def get_includes(fpath, text):
    """Collect the quoted ``#include "..."`` directives of ``text``.

    ``fpath`` is the path the text was read from; it is used for the
    per-file skip rules and for error messages.

    Returns a pair ``(include_statements, include_files)`` of equal length:
    the literal directive text as it appears in ``text`` and the resolved
    path of the included file (first hit in ``include_paths``).

    Raises ``Exception`` when the same directive appears twice or when an
    included file cannot be found in any include path.
    """
    # find all the includes referred to in the directory
    matches = re.findall("(^[\t ]*[#][\t ]*include[\t ]+[\"]([^\"]+)[\"])", text, flags=re.MULTILINE)
    include_statements = []
    include_files = []
    # figure out where they are located
    for statement, included_file in matches:
        if skip_duckdb_includes and 'duckdb' in included_file:
            continue
        # NOTE: only the '_extension.hpp' rule is tied to extension_helper.cpp;
        # the two generated_* headers are skipped no matter which file includes them
        extension_header_in_helper = 'extension_helper.cpp' in fpath and included_file.endswith('_extension.hpp')
        if extension_header_in_helper or included_file in (
            'generated_extension_loader.hpp',
            'generated_extension_headers.hpp',
        ):
            continue
        if 'allocator.cpp' in fpath and included_file.endswith('jemalloc_extension.hpp'):
            continue
        if statement in include_statements:
            raise Exception(f"duplicate include {statement} in file {fpath}")
        include_statements.append(statement)
        # includes always use '/', convert to the platform separator before probing
        included_file = os.sep.join(included_file.split('/'))
        for include_path in include_paths:
            ipath = os.path.join(include_path, included_file)
            if os.path.isfile(ipath):
                include_files.append(ipath)
                break
        else:
            # no include path contained the file
            raise Exception('Could not find include file "' + included_file + '", included from file "' + fpath + '"')
    return (include_statements, include_files)
def cleanup_file(text):
    """Return ``text`` with every literal ``#pragma once`` occurrence removed."""
    # the directive contains no regex metacharacters, so a plain replace is equivalent
    return text.replace('#pragma once', '')
# recursively get all includes and write them
# (maps every path already emitted by write_file/gather_file to True;
# need_to_write_file consults it so that no file is written twice)
written_files = {}
# licenses
# (LICENSE paths recorded by find_license; position + 1 is the number used in the output)
licenses = []
def need_to_write_file(current_file, ignore_excluded=False):
    """Decide whether ``current_file`` still has to be emitted.

    Returns False for anything inside the amalgamation directory, for the
    always-excluded paths, for files whose name is in ``excluded_files``
    (unless ``ignore_excluded`` is set) and for files already written.
    """
    if amal_dir in current_file or current_file in always_excluded:
        return False
    file_name = current_file.split(os.sep)[-1]
    if not ignore_excluded and file_name in excluded_files:
        # file is in ignored files set
        return False
    # only files that were not written before remain to be written
    return current_file not in written_files
def find_license(original_file):
    """Locate the LICENSE file that covers ``original_file`` and return its index.

    Every ancestor directory of ``original_file`` is probed for a file named
    ``LICENSE``; when several ancestors have one, the outermost wins. The
    license path is appended to the module-level ``licenses`` list on first
    use, and its (0-based) position in that list is returned.

    Raises ``Exception`` when no ancestor directory contains a LICENSE file.
    """
    license_path = ""
    current = original_file
    while True:
        parent = os.path.dirname(current)
        # stop at the top of a relative path ("") or at a filesystem root, where
        # dirname() returns its own argument and the walk would never end
        if parent == "" or parent == current:
            break
        current = parent
        potential_license = os.path.join(current, "LICENSE")
        if os.path.exists(potential_license):
            license_path = potential_license
    if license_path == "":
        # raising a plain string is itself a TypeError in Python 3
        raise Exception("Could not find license for %s" % original_file)
    if license_path not in licenses:
        licenses.append(license_path)
    return licenses.index(license_path)
def write_file(current_file, ignore_excluded=False):
    """Return the text of ``current_file`` with its quoted includes inlined.

    Files that need_to_write_file rejects yield the empty string. Otherwise
    the file is marked as written, third-party sources are wrapped in
    LICENSE_CHANGE markers, each quoted include directive is replaced by the
    (recursively expanded) text of the included file, and ``#pragma once``
    is stripped. With ``linenumbers`` enabled, ``#line`` directives are added.
    """
    if not need_to_write_file(current_file, ignore_excluded):
        return ""
    written_files[current_file] = True
    # first read this file
    with open_utf8(current_file, 'r') as f:
        text = f.read()
    needs_license_banner = current_file.startswith("third_party") and not current_file.endswith("LICENSE")
    if needs_license_banner:
        license_number = find_license(current_file) + 1
        banner = (
            "\n\n// LICENSE_CHANGE_BEGIN\n// The following code up to LICENSE_CHANGE_END is subject to THIRD PARTY LICENSE #%s\n// See the end of this file for a list\n\n"
            % str(license_number)
        )
        text = banner + text + "\n\n// LICENSE_CHANGE_END\n"
    statements, includes = get_includes(current_file, text)
    # find the linenr of the final #include statement we parsed
    if statements:
        text_before_last_include = text[: text.find(statements[-1])]
        linenr = len(text_before_last_include.split('\n'))
    # now write all the dependencies of this header first
    last_position = len(includes) - 1
    for position, include_path in enumerate(includes):
        include_text = write_file(include_path)
        if linenumbers and position == last_position:
            # for the last include statement, we also include a #line directive
            include_text += '\n#line %d "%s"\n' % (linenr, current_file)
        text = text.replace(statements[position], include_text)
    # add the initial line here
    if linenumbers:
        text = '\n#line 1 "%s"\n' % (current_file,) + text
    # now read the header and write it
    return cleanup_file(text)
def write_dir(dir):
    """Return the concatenated write_file output of every source file below ``dir``.

    Entries are visited in sorted name order; sub-directories are descended
    into, names listed in ``excluded_files`` are skipped, and only files
    ending in .cpp, .c or .cc are written.
    """
    chunks = []
    for fname in sorted(os.listdir(dir)):
        if fname in excluded_files:
            continue
        fpath = os.path.join(dir, fname)
        if os.path.isdir(fpath):
            chunks.append(write_dir(fpath))
        elif fname.endswith(('.cpp', '.c', '.cc')):
            chunks.append(write_file(fpath))
    return "".join(chunks)
def copy_if_different(src, dest):
    """Copy ``src`` over ``dest`` unless ``dest`` already holds the same text."""

    def read_text(path):
        with open_utf8(path, 'r') as f:
            return f.read()

    # dest exists and is identical: leave it untouched
    if os.path.isfile(dest) and read_text(src) == read_text(dest):
        return
    shutil.copyfile(src, dest)
def git_commit_hash():
    """Return the third dash-separated field of the git describe string.

    Leading 'g' characters are stripped from that field; the describe string
    itself comes from ``package_build.get_git_describe()``.
    """
    describe_fields = package_build.get_git_describe().split('-')
    return describe_fields[2].lstrip('g')
######
# MAIN_BRANCH_VERSIONING default should be 'True' for main branch and feature branches
# MAIN_BRANCH_VERSIONING default should be 'False' for release branches
# MAIN_BRANCH_VERSIONING default value needs to keep in sync between:
# - CMakeLists.txt
# - scripts/amalgamation.py
# - scripts/package_build.py
######
# Default is True; only the environment value "0" switches it off
# ("1", any other value, or an unset variable all leave it True).
MAIN_BRANCH_VERSIONING = os.getenv('MAIN_BRANCH_VERSIONING') != "0"
def git_dev_version():
    """Derive the version string from ``package_build.get_git_describe()``.

    The describe string is split on '-': the first field (minus leading 'v'
    characters) is the dotted version, the second the distance from the tag.
    With a distance of 0 the plain version is returned. Otherwise the minor
    component (``MAIN_BRANCH_VERSIONING`` truthy) or the patch component is
    incremented and a ``-dev<distance>`` suffix is appended.

    Returns "v0.0.0" when the version cannot be determined for any reason.
    """
    try:
        long_version = package_build.get_git_describe()
        describe_fields = long_version.split('-')
        version_splits = describe_fields[0].lstrip('v').split('.')
        dev_version = describe_fields[1]
        if int(dev_version) == 0:
            # directly on a tag: emit the regular version
            return "v" + '.'.join(version_splits)
        # not on a tag: increment the version by one and add a -devX suffix
        # this needs to keep in sync with changes to CMakeLists.txt
        if MAIN_BRANCH_VERSIONING:
            # increment minor version
            version_splits[1] = str(int(version_splits[1]) + 1)
        else:
            # increment patch version
            version_splits[2] = str(int(version_splits[2]) + 1)
        return "v" + '.'.join(version_splits) + "-dev" + dev_version
    except Exception:
        # best-effort fallback; unlike a bare except this lets
        # KeyboardInterrupt and SystemExit propagate
        return "v0.0.0"
def generate_duckdb_hpp(header_file):
    """Write the amalgamated header into the temporary file ``temp_header``.

    ``header_file`` is only used in the progress message; moving the
    temporary file to its final location is left to the caller. The output
    consists of the top-level LICENSE in a comment, the amalgamation and
    version defines, and the expanded text of every ``main_header_files`` entry.
    """
    rule = "-----------------------"
    print(rule)
    print("-- Writing " + header_file + " --")
    print(rule)
    with open_utf8(temp_header, 'w+') as hfile:
        emit = hfile.write
        # the license goes first, wrapped in a block comment
        emit("/*\n")
        emit(write_file("LICENSE"))
        emit("*/\n\n")
        emit("#pragma once\n")
        emit("#define DUCKDB_AMALGAMATION 1\n")
        if extended_amalgamation:
            emit("#define DUCKDB_AMALGAMATION_EXTENDED 1\n")
        emit("#define DUCKDB_SOURCE_ID \"%s\"\n" % git_commit_hash())
        dev_version = git_dev_version()
        major, minor, patch = dev_version.lstrip('v').split('.')[:3]
        emit("#define DUCKDB_VERSION \"%s\"\n" % dev_version)
        emit("#define DUCKDB_MAJOR_VERSION %d\n" % int(major))
        emit("#define DUCKDB_MINOR_VERSION %d\n" % int(minor))
        # the patch component stays a string: it may carry a -dev suffix
        emit("#define DUCKDB_PATCH_VERSION \"%s\"\n" % patch)
        for fpath in main_header_files:
            emit(write_file(fpath))
def generate_amalgamation(source_file, header_file):
    """Generate the single-file amalgamation (one header plus one source file).

    The header is produced by generate_duckdb_hpp. The source file includes
    that header, then the expanded text of every directory in
    ``compile_directories``, and ends with a block comment that lists every
    third-party license collected while writing. Both files are built as
    temporaries and only copied over ``header_file`` / ``source_file`` when
    their content differs.
    """
    # construct duckdb.hpp from these headers
    generate_duckdb_hpp(header_file)
    # now construct duckdb.cpp
    print("------------------------")
    print("-- Writing " + source_file + " --")
    print("------------------------")
    # scan all the .cpp files
    with open_utf8(temp_source, 'w+') as sfile:
        header_file_name = header_file.split(os.sep)[-1]
        sfile.write('#include "' + header_file_name + '"\n\n')
        sfile.write("#ifndef DUCKDB_AMALGAMATION\n#error header mismatch\n#endif\n\n")
        sfile.write("#if (!defined(DEBUG) && !defined NDEBUG)\n#define NDEBUG\n#endif\n\n")
        for compile_dir in compile_directories:
            sfile.write(write_dir(compile_dir))
        # third-party licenses are appended inside a trailing block comment,
        # numbered to match the LICENSE_CHANGE markers emitted by write_file
        sfile.write('\n\n/*\n')
        for license_number, license_path in enumerate(licenses, start=1):
            sfile.write("\n\n\n### THIRD PARTY LICENSE #%s ###\n\n" % str(license_number))
            sfile.write(write_file(license_path))
        sfile.write('\n\n*/\n')
    copy_if_different(temp_header, header_file)
    copy_if_different(temp_source, source_file)
    # best-effort cleanup: each temporary is removed independently so that a
    # failure on one does not leave the other behind
    for temp_path in (temp_header, temp_source):
        try:
            os.remove(temp_path)
        except OSError:
            pass
def list_files(dname, file_list):
    """Append to ``file_list`` every source file below ``dname`` that would be written.

    Entries are visited in sorted name order and sub-directories are
    descended into. Names in ``excluded_files`` are skipped; only .cpp, .c
    and .cc files accepted by need_to_write_file are collected.
    """
    for fname in sorted(os.listdir(dname)):
        if fname in excluded_files:
            continue
        fpath = os.path.join(dname, fname)
        if os.path.isdir(fpath):
            list_files(fpath, file_list)
            continue
        if fname.endswith(('.cpp', '.c', '.cc')) and need_to_write_file(fpath):
            file_list.append(fpath)
def list_sources():
    """Return the source files found below every entry of ``compile_directories``."""
    collected = []
    for directory in compile_directories:
        list_files(directory, collected)
    return collected
def list_include_files_recursive(dname, file_list):
    """Append to ``file_list`` every header-like file found below ``dname``.

    Entries are visited in sorted name order and sub-directories are
    descended into. Names in ``excluded_files`` are skipped; files ending in
    .hpp, .ipp, .h, .hh, .tcc or .inc are collected.
    """
    header_suffixes = ('.hpp', '.ipp', '.h', '.hh', '.tcc', '.inc')
    for fname in sorted(os.listdir(dname)):
        if fname in excluded_files:
            continue
        fpath = os.path.join(dname, fname)
        if os.path.isdir(fpath):
            list_include_files_recursive(fpath, file_list)
            continue
        if fname.endswith(header_suffixes):
            file_list.append(fpath)
def list_includes_files(include_dirs):
    """Return the header-like files found below each directory in ``include_dirs``."""
    headers = []
    for directory in include_dirs:
        list_include_files_recursive(directory, headers)
    return headers
def list_includes():
    """Return the header-like files found below every configured include path."""
    headers = list_includes_files(include_paths)
    return headers
def gather_file(current_file, source_files, header_files):
    """Expand ``current_file`` for the split amalgamation.

    Returns "" without doing anything when need_to_write_file rejects the
    file. Otherwise the file is marked as written and its quoted includes
    are expanded with write_file: the text of included source files (.cpp,
    .cc, .c) is inlined in place, while the text of any other include is
    removed from the source and appended to ``header_files``. The resulting
    source text is appended to ``source_files``.
    """
    if not need_to_write_file(current_file, False):
        return ""
    written_files[current_file] = True
    # first read this file
    with open_utf8(current_file, 'r') as f:
        text = f.read()
    statements, includes = get_includes(current_file, text)
    # find the linenr of the final #include statement we parsed
    if statements:
        text_before_last_include = text[: text.find(statements[-1])]
        linenr = len(text_before_last_include.split('\n'))
    # now write all the dependencies of this header first
    last_position = len(includes) - 1
    for position, include_path in enumerate(includes):
        statement = statements[position]
        include_text = write_file(include_path)
        if linenumbers and position == last_position:
            # for the last include statement, we also include a #line directive
            include_text += '\n#line %d "%s"\n' % (linenr, current_file)
        if include_path.endswith(('.cpp', '.cc', '.c')):
            # source file inclusions are inlined into the main text
            text = text.replace(statement, include_text)
        else:
            # header inclusions move to the shared internal header
            text = text.replace(statement, '')
            header_files.append(include_text)
    # add the initial line here
    if linenumbers:
        text = '\n#line 1 "%s"\n' % (current_file,) + text
    source_files.append(cleanup_file(text))
def gather_files(dir, source_files, header_files):
    """Run gather_file on every .cpp, .c and .cc file below ``dir``.

    Entries are visited in sorted name order, sub-directories are descended
    into and names in ``excluded_files`` are skipped.
    """
    for fname in sorted(os.listdir(dir)):
        if fname in excluded_files:
            continue
        fpath = os.path.join(dir, fname)
        if os.path.isdir(fpath):
            gather_files(fpath, source_files, header_files)
        elif fname.endswith(('.cpp', '.c', '.cc')):
            gather_file(fpath, source_files, header_files)
def write_license(hfile):
    """Write the one-line licensing notice (plus a blank line) to ``hfile``."""
    notice = "// See https://raw.githubusercontent.com/duckdb/duckdb/main/LICENSE for licensing information\n\n"
    hfile.write(notice)
def generate_amalgamation_splits(source_file, header_file, nsplits):
    """Generate the amalgamation as one header, one internal header and several sources.

    The sources below ``src_dir`` are expanded with gather_files: their header
    includes end up in ``<header>-internal.hpp`` (or ``-internal.h``) and the
    remaining source texts are distributed over numbered partitions of
    roughly ``total size / nsplits`` characters each. Every other entry of
    ``compile_directories`` becomes one additional partition named after the
    directory. Each partition is written to ``source_file`` with
    ``-<partition name>`` inserted before ``.cpp``. All outputs are built as
    temporaries and only copied into place when their content differs.

    Raises ``Exception`` when ``header_file`` contains neither '.hpp' nor '.h'.
    """
    # work out the internal header name first, so an unusable header name is
    # reported before any file is generated
    if '.hpp' in header_file:
        internal_header_file = header_file.replace('.hpp', '-internal.hpp')
    elif '.h' in header_file:
        internal_header_file = header_file.replace('.h', '-internal.h')
    else:
        # raising a plain string is itself a TypeError in Python 3
        raise Exception("Unknown extension of header file")
    temp_internal_header = internal_header_file + '.tmp'

    # construct duckdb.hpp from these headers
    generate_duckdb_hpp(header_file)

    # gather all files to read and write
    source_files = []
    header_files = []
    for compile_dir in compile_directories:
        if compile_dir != src_dir:
            continue
        gather_files(compile_dir, source_files, header_files)

    # write duckdb-internal.hpp
    with open_utf8(temp_internal_header, 'w+') as f:
        write_license(f)
        for hfile in header_files:
            f.write(hfile)

    # count the total size of the source files
    total_bytes = sum(len(sfile) for sfile in source_files)

    # now write the individual splits
    # we approximate the splitting up by making every file have roughly the same size
    split_bytes = total_bytes / nsplits
    current_bytes = 0
    partitions = []
    partition_names = []
    current_partition = []
    current_partition_idx = 1
    for sfile in source_files:
        current_partition.append(sfile)
        current_bytes += len(sfile)
        if current_bytes >= split_bytes:
            partition_names.append(str(current_partition_idx))
            partitions.append(current_partition)
            current_partition = []
            current_bytes = 0
            current_partition_idx += 1
    if current_partition:
        # whatever is left over forms the final numbered partition
        partition_names.append(str(current_partition_idx))
        partitions.append(current_partition)

    # generate partitions from the third party libraries
    for compile_dir in compile_directories:
        if compile_dir != src_dir:
            partition_names.append(compile_dir.split(os.sep)[-1])
            # wrap the text in a list: a bare string would be iterated (and
            # written) one character at a time below
            partitions.append([write_dir(compile_dir)])

    header_file_name = header_file.split(os.sep)[-1]
    internal_header_file_name = internal_header_file.split(os.sep)[-1]

    partition_fnames = []
    for name, partition in zip(partition_names, partitions):
        partition_name = source_file.replace('.cpp', '-%s.cpp' % (name,))
        temp_partition_name = partition_name + '.tmp'
        partition_fnames.append([partition_name, temp_partition_name])
        with open_utf8(temp_partition_name, 'w+') as f:
            write_license(f)
            f.write('#include "%s"\n#include "%s"' % (header_file_name, internal_header_file_name))
            f.write("\n#ifndef DUCKDB_AMALGAMATION\n#error header mismatch\n#endif\n")
            for sfile in partition:
                f.write(sfile)

    copy_if_different(temp_header, header_file)
    copy_if_different(temp_internal_header, internal_header_file)
    # best-effort cleanup: each temporary is removed independently so that a
    # failure on one does not leave the others behind
    for temp_path in (temp_header, temp_internal_header):
        try:
            os.remove(temp_path)
        except OSError:
            pass
    for final_name, temp_name in partition_fnames:
        copy_if_different(temp_name, final_name)
        try:
            os.remove(temp_name)
        except OSError:
            pass
def list_include_dirs():
    """Return the module-level ``include_paths`` list (the list itself, not a copy)."""
    return include_paths
if __name__ == "__main__":
    # Command-line driver. Options adjust the module-level settings; the
    # listing options print their result and stop immediately. Without a
    # listing option the amalgamation directory is recreated from scratch
    # and the amalgamation is generated (split when --splits is above 1).
    nsplits = 1
    for arg in sys.argv:
        if arg == '--linenumbers':
            linenumbers = True
        elif arg == '--no-linenumbers':
            linenumbers = False
        elif arg.startswith('--header='):
            # paths are given with '/' and converted to the platform separator
            header_file = os.path.join(*arg.partition('=')[2].split('/'))
        elif arg.startswith('--source='):
            source_file = os.path.join(*arg.partition('=')[2].split('/'))
        elif arg.startswith('--splits='):
            nsplits = int(arg.partition('=')[2])
        elif arg.startswith('--list-sources'):
            print('\n'.join(list_sources()))
            # NOTE(review): the listing options exit with status 1 even though
            # they succeed; kept as-is because callers may rely on it
            exit(1)
        elif arg.startswith('--list-objects'):
            object_files = [x.rsplit('.', 1)[0] + '.o' for x in list_sources()]
            print(' '.join(object_files))
            exit(1)
        elif arg.startswith('--includes'):
            print(' '.join(['-I' + x for x in list_include_dirs()]))
            exit(1)
        elif arg.startswith('--include-directories'):
            print('\n'.join(list_include_dirs()))
            exit(1)
    # start from an empty output directory
    if os.path.exists(amal_dir):
        shutil.rmtree(amal_dir)
    os.makedirs(amal_dir)
    if nsplits > 1:
        generate_amalgamation_splits(source_file, header_file, nsplits)
    else:
        generate_amalgamation(source_file, header_file)