should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion

import duckdb
import pandas as pd
import pyarrow as pa
import time
import argparse
from typing import Dict, List, Any
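# Fetch the TPC-H benchmark queries from DuckDB's tpch extension once, up front.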
TPCH_QUERIES = []
res = duckdb.execute(
"""
select query from tpch_queries()
"""
).fetchall()
for x in res:
TPCH_QUERIES.append(x[0])
parser = argparse.ArgumentParser()
parser.add_argument("--verbose", action="store_true", help="Enable verbose mode", default=False)
parser.add_argument("--threads", type=int, help="Number of threads", default=None)
parser.add_argument("--nruns", type=int, help="Number of runs", default=10)
parser.add_argument("--out-file", type=str, help="Output file path", default=None)
parser.add_argument("--scale-factor", type=float, help="Set the scale factor TPCH is generated at", default=1.0)
args, unknown_args = parser.parse_known_args()
verbose = args.verbose
threads = args.threads
nruns = args.nruns
out_file = args.out_file
scale_factor = args.scale_factor
if unknown_args:
parser.error(f"Unrecognized parameter(s): {', '.join(unknown_args)}")
def print_msg(message: str):
if not verbose:
return
print(message)
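# Lazily opens the output file on first call and caches the handle as a
# function attribute, so results stream to a single file across all benchmarks.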
def write_result(benchmark_name, nrun, t):
bench_result = f"{benchmark_name}\t{nrun}\t{t}"
if out_file is not None:
if not hasattr(write_result, 'file'):
write_result.file = open(out_file, 'w+')
write_result.file.write(bench_result)
write_result.file.write('\n')
else:
print_msg(bench_result)
def close_result():
if not hasattr(write_result, 'file'):
return
write_result.file.close()
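# Collects one duration per run for a named benchmark and emits them via write_result.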
class BenchmarkResult:
def __init__(self, name):
self.name = name
self.runs: List[float] = []
def add(self, duration: float):
self.runs.append(duration)
def write(self):
for i, run in enumerate(self.runs):
write_result(self.name, i, run)
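# Generates TPC-H data at the given scale factor in an in-memory DuckDB database.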
class TPCHData:
TABLES = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"]
def __init__(self, scale_factor):
self.conn = duckdb.connect()
self.conn.execute(f'CALL dbgen(sf={scale_factor})')
def get_tables(self, convertor) -> Dict[str, Any]:
res = {}
for table in self.TABLES:
res[table] = convertor(self.conn, table)
return res
def load_lineitem(self, collector, benchmark_name) -> BenchmarkResult:
query = 'SELECT * FROM lineitem'
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
start = time.time()
rel = self.conn.sql(query)
res = collector(rel)
end = time.time()
duration = float(end - start)
del res
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
return result
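# Runs the TPC-H queries on tables registered from an external format (pandas or Arrow).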
class TPCHBenchmarker:
def __init__(self, name: str):
self.initialize_connection()
self.name = name
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def register_tables(self, tables: Dict[str, Any]):
for name, table in tables.items():
self.con.register(name, table)
def run_tpch(self, collector, benchmark_name) -> BenchmarkResult:
print_msg("")
print_msg(TPCH_QUERIES)
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
# Execute all queries
for i, query in enumerate(TPCH_QUERIES):
start = time.time()
rel = self.con.sql(query)
if rel:
res = collector(rel)
del res
else:
print_msg(f"Query '{query}' did not produce output")
end = time.time()
query_time = float(end - start)
print_msg(f"Q{str(i).ljust(len(str(nruns)), ' ')}: {query_time}")
duration += float(end - start)
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
return result
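# End-to-end TPC-H benchmark: first time materializing lineitem into each client
# format, then time the full query set on pandas- and Arrow-backed tables.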
def test_tpch():
print_msg(f"Generating TPCH (sf={scale_factor})")
tpch = TPCHData(scale_factor)
## -------- Benchmark converting LineItem to different formats ---------
def fetch_native(rel: duckdb.DuckDBPyRelation):
return rel.fetchall()
def fetch_pandas(rel: duckdb.DuckDBPyRelation):
return rel.df()
def fetch_arrow(rel: duckdb.DuckDBPyRelation):
return rel.arrow()
COLLECTORS = {'native': fetch_native, 'pandas': fetch_pandas, 'arrow': fetch_arrow}
    # For every collector, load lineitem 'nruns' times
for collector in COLLECTORS:
result: BenchmarkResult = tpch.load_lineitem(COLLECTORS[collector], collector + "_load_lineitem")
print_msg(result.name)
print_msg(collector)
result.write()
## ------- Benchmark running TPCH queries on top of different formats --------
def convert_pandas(conn: duckdb.DuckDBPyConnection, table_name: str):
return conn.execute(f"SELECT * FROM {table_name}").df()
def convert_arrow(conn: duckdb.DuckDBPyConnection, table_name: str):
df = convert_pandas(conn, table_name)
return pa.Table.from_pandas(df)
CONVERTORS = {'pandas': convert_pandas, 'arrow': convert_arrow}
# Convert TPCH data to the right format, then run TPCH queries on that data
for convertor in CONVERTORS:
tables = tpch.get_tables(CONVERTORS[convertor])
tester = TPCHBenchmarker(convertor)
tester.register_tables(tables)
collector = COLLECTORS[convertor]
result: BenchmarkResult = tester.run_tpch(collector, f"{convertor}tpch")
result.write()
def generate_string(seed: int) -> str:
    # Encode the seed as a fixed-width base-26 string of uppercase letters.
    output = ''
    for _ in range(10):
        output += chr(ord('A') + seed % 26)
        seed //= 26  # integer division: '/=' would silently turn the seed into a float
    return output
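# Pre-computes a pool of unique dictionary strings to draw values from.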
class ArrowDictionary:
def __init__(self, unique_values):
self.size = unique_values
self.dict = [generate_string(x) for x in range(unique_values)]
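# Benchmarks scanning a dictionary-encoded Arrow table through DuckDB and checks
# the fetched rows against the expected values.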
class ArrowDictionaryBenchmark:
def __init__(self, unique_values, values, arrow_dict: ArrowDictionary):
assert unique_values <= arrow_dict.size
self.initialize_connection()
self.generate(unique_values, values, arrow_dict)
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def generate(self, unique_values, values, arrow_dict: ArrowDictionary):
self.input = []
self.expected = []
for x in range(values):
value = arrow_dict.dict[x % unique_values]
self.input.append(value)
self.expected.append((value,))
array = pa.array(
self.input,
type=pa.dictionary(pa.int64(), pa.string()),
)
self.table = pa.table([array], names=["x"])
def benchmark(self, benchmark_name) -> BenchmarkResult:
self.con.register('arrow_table', self.table)
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
start = time.time()
res = self.con.execute(
"""
select * from arrow_table
"""
).fetchall()
end = time.time()
duration = float(end - start)
assert self.expected == res
del res
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
return result
class SelectAndCallBenchmark:
def __init__(self):
"""
SELECT statements become QueryRelations, any other statement type becomes a MaterializedRelation.
We use SELECT and CALL here because their execution plans are identical
"""
self.initialize_connection()
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def benchmark(self, name, query) -> List[BenchmarkResult]:
results: List[BenchmarkResult] = []
methods = {'select': 'select * from ', 'call': 'call '}
for key, value in methods.items():
for rowcount in [2048, 50000, 2500000]:
result = BenchmarkResult(f'{key}_{name}_{rowcount}')
query_string = query.format(rows=rowcount)
query_string = value + query_string
rel = self.con.sql(query_string)
print_msg(rel.type)
for _ in range(nruns):
duration = 0.0
start = time.time()
rel.fetchall()
end = time.time()
duration = float(end - start)
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
results.append(result)
return results
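# Measures repeated DuckDB scans of a wide (300+ column) pandas DataFrame that is
# materialized once per run from a CSV file.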
class PandasDFLoadBenchmark:
def __init__(self):
self.initialize_connection()
self.generate()
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
    def generate(self):
        self.con.execute("call dbgen(sf=0.1)")
        # Widen lineitem by duplicating l_shipdate 300 times, then round-trip
        # the result through a CSV file so pandas can load it.
        select_list = "*, " + ", ".join(["l_shipdate"] * 300)
        self.con.execute(f"create table wide as select {select_list} from lineitem limit 500")
        self.con.execute("copy wide to 'wide_table.csv' (FORMAT CSV)")
def benchmark(self, benchmark_name) -> BenchmarkResult:
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
pandas_df = pd.read_csv('wide_table.csv')
start = time.time()
for _ in range(30):
res = self.con.execute("""select * from pandas_df""").df()
end = time.time()
duration = float(end - start)
del res
result.add(duration)
return result
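# Stresses pandas object-column analysis: the single non-null value sits at the
# end of a ~10M-row object column, forcing the analyzer to look deep into the
# column before it can settle on a type.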
class PandasAnalyzerBenchmark:
def __init__(self):
self.initialize_connection()
self.generate()
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def generate(self):
return
def benchmark(self, benchmark_name) -> BenchmarkResult:
result = BenchmarkResult(benchmark_name)
data = [None] * 9999999 + [1] # Last element is 1, others are None
# Create the DataFrame with the specified data and column type as object
pandas_df = pd.DataFrame(data, columns=['Column'], dtype=object)
for _ in range(nruns):
duration = 0.0
start = time.time()
for _ in range(30):
res = self.con.execute("""select * from pandas_df""").df()
end = time.time()
duration = float(end - start)
del res
result.add(duration)
return result
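# Scan dictionary-encoded Arrow data at several dictionary cardinalities.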
def test_arrow_dictionaries_scan():
DICT_SIZE = 26 * 1000
print_msg(f"Generating a unique dictionary of size {DICT_SIZE}")
arrow_dict = ArrowDictionary(DICT_SIZE)
DATASET_SIZE = 10000000
for unique_values in [2, 1000, DICT_SIZE]:
test = ArrowDictionaryBenchmark(unique_values, DATASET_SIZE, arrow_dict)
benchmark_name = f"arrow_dict_unique_{unique_values}_total_{DATASET_SIZE}"
result = test.benchmark(benchmark_name)
result.write()
def test_loading_pandas_df_many_times():
test = PandasDFLoadBenchmark()
benchmark_name = f"load_pandas_df_many_times"
result = test.benchmark(benchmark_name)
result.write()
def test_pandas_analyze():
test = PandasAnalyzerBenchmark()
benchmark_name = f"pandas_analyze"
result = test.benchmark(benchmark_name)
result.write()
def test_call_and_select_statements():
test = SelectAndCallBenchmark()
queries = {
'repeat_row': "repeat_row(42, 'test', True, 'this is a long string', num_rows={rows})",
}
for key, value in queries.items():
results = test.benchmark(key, value)
for res in results:
res.write()
def main():
test_tpch()
test_arrow_dictionaries_scan()
test_loading_pandas_df_many_times()
test_pandas_analyze()
test_call_and_select_statements()
close_result()
if __name__ == '__main__':
main()