403 lines
13 KiB
Python
403 lines
13 KiB
Python
import os
|
|
import sys
|
|
import duckdb
|
|
import pandas as pd
|
|
import pyarrow as pa
|
|
import time
|
|
import argparse
|
|
from typing import Dict, List, Any
|
|
import numpy as np
|
|
|
|
# Text of every query returned by DuckDB's `tpch_queries()` table function,
# kept in the order the function yields them. Each fetched row is a tuple
# whose first field is the query string.
TPCH_QUERIES = [
    row[0]
    for row in duckdb.execute(
        """
    select query from tpch_queries()
"""
    ).fetchall()
]
|
|
|
|
# Command-line interface. Every option is optional and has a default.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--verbose",
    action="store_true",
    help="Enable verbose mode",
    default=False,
)
parser.add_argument(
    "--threads",
    type=int,
    help="Number of threads",
    default=None,
)
parser.add_argument(
    "--nruns",
    type=int,
    help="Number of runs",
    default=10,
)
parser.add_argument(
    "--out-file",
    type=str,
    help="Output file path",
    default=None,
)
parser.add_argument(
    "--scale-factor",
    type=float,
    help="Set the scale factor TPCH is generated at",
    default=1.0,
)
args, unknown_args = parser.parse_known_args()

# Anything the parser did not recognise is a usage error; `parser.error`
# prints the message and exits.
if unknown_args:
    parser.error(f"Unrecognized parameter(s): {', '.join(unknown_args)}")

# Module-level settings read by the benchmark code below.
verbose, threads, nruns = args.verbose, args.threads, args.nruns
out_file, scale_factor = args.out_file, args.scale_factor
|
|
|
|
|
|
def print_msg(message: str):
    """Print ``message`` to stdout when verbose mode is on; otherwise do nothing.

    Verbosity is read from the module-level ``verbose`` flag on every call.
    """
    if verbose:
        print(message)
|
|
|
|
|
|
def write_result(benchmark_name, nrun, t):
    """Record one timing as a ``name<TAB>run<TAB>time`` line.

    When the module-level ``out_file`` is set, the line (plus a newline) is
    written to that file. The handle is opened on first use and cached on the
    function itself as ``write_result.file``. Without an output file the line
    is handed to ``print_msg`` instead.

    Args:
        benchmark_name: Name of the benchmark the timing belongs to.
        nrun: Index of the run.
        t: Measured time for that run.
    """
    line = f"{benchmark_name}\t{nrun}\t{t}"
    if out_file is None:
        print_msg(line)
        return

    try:
        handle = write_result.file
    except AttributeError:
        # First write: open the file and remember the handle for later calls.
        handle = write_result.file = open(out_file, 'w+')
    handle.write(line)
    handle.write('\n')
|
|
|
|
|
|
def close_result():
    """Close the file cached on ``write_result``, if one was ever opened."""
    if hasattr(write_result, 'file'):
        write_result.file.close()
|
|
|
|
|
|
class BenchmarkResult:
    """Durations recorded for one named benchmark, one entry per run."""

    def __init__(self, name):
        """Start an empty result for the benchmark called ``name``."""
        self.runs: List[float] = []
        self.name = name

    def add(self, duration: float):
        """Append the duration of one more run."""
        self.runs.append(duration)

    def write(self):
        """Pass every recorded run to ``write_result``, numbered from 0."""
        for run_index, seconds in enumerate(self.runs):
            write_result(self.name, run_index, seconds)
|
|
|
|
|
|
class TPCHData:
    """Generated TPC-H data held in a dedicated DuckDB connection.

    The data is created once in ``__init__``; the methods convert the tables
    to other formats or time how long it takes to fetch ``lineitem``.
    """

    # Names of the tables handed to the convertor in ``get_tables``.
    TABLES = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"]

    def __init__(self, scale_factor):
        """Open a new connection and generate the data with ``dbgen``.

        Args:
            scale_factor: Value substituted into ``CALL dbgen(sf=...)``.
        """
        self.conn = duckdb.connect()
        self.conn.execute(f'CALL dbgen(sf={scale_factor})')

    def get_tables(self, convertor) -> Dict[str, Any]:
        """Convert every table listed in ``TABLES``.

        Args:
            convertor: Callable invoked as ``convertor(connection, table_name)``.

        Returns:
            Mapping from table name to whatever ``convertor`` returned for it.
        """
        return {table: convertor(self.conn, table) for table in self.TABLES}

    def load_lineitem(self, collector, benchmark_name) -> BenchmarkResult:
        """Time fetching the whole ``lineitem`` table, ``nruns`` times.

        Args:
            collector: Callable that receives the relation for
                ``SELECT * FROM lineitem`` and materializes it.
            benchmark_name: Name given to the returned result.

        Returns:
            A ``BenchmarkResult`` with one duration (in seconds) per run.
        """
        query = 'SELECT * FROM lineitem'
        result = BenchmarkResult(benchmark_name)
        # Loop-invariant label padding for the verbose output.
        padding = " " * len(str(nruns))
        for _ in range(nruns):
            # perf_counter is monotonic and high-resolution; time.time() follows
            # the wall clock and can jump when the system clock is adjusted.
            start = time.perf_counter()
            rel = self.conn.sql(query)
            res = collector(rel)
            duration = time.perf_counter() - start
            # Release the materialized result before the next run; this
            # happens outside the timed region.
            del res
            print_msg(f"T{padding}: {duration}s")
            result.add(duration)
        return result
|
|
|
|
|
|
class TPCHBenchmarker:
    """Runs the queries in ``TPCH_QUERIES`` against tables registered on its own connection."""

    def __init__(self, name: str):
        """Open the connection and remember ``name``."""
        self.initialize_connection()
        self.name = name

    def initialize_connection(self):
        """Create ``self.con`` and apply the module-level ``threads`` limit, if any."""
        self.con = duckdb.connect()
        if not threads:
            return
        print_msg(f'Limiting threads to {threads}')
        self.con.execute(f"SET threads={threads}")

    def register_tables(self, tables: Dict[str, Any]):
        """Register every ``name -> table`` pair on the connection.

        Args:
            tables: Mapping from the name to register to the object to register.
        """
        for name, table in tables.items():
            self.con.register(name, table)

    def run_tpch(self, collector, benchmark_name) -> BenchmarkResult:
        """Run all queries ``nruns`` times and record the total time of each run.

        Args:
            collector: Callable that receives each query's relation and
                materializes it.
            benchmark_name: Name given to the returned result.

        Returns:
            A ``BenchmarkResult`` whose entries are the summed per-query
            durations (in seconds) of each run.
        """
        print_msg("")
        print_msg(TPCH_QUERIES)
        result = BenchmarkResult(benchmark_name)
        # Loop-invariant label padding for the verbose output.
        padding = " " * len(str(nruns))
        for _ in range(nruns):
            duration = 0.0
            # Execute all queries
            for i, query in enumerate(TPCH_QUERIES):
                # perf_counter is monotonic and high-resolution, unlike the
                # wall clock returned by time.time().
                start = time.perf_counter()
                rel = self.con.sql(query)
                # Test for None explicitly. A bare truthiness test on a
                # relation object falls back to its length, which would
                # evaluate the query an extra time inside the timed region
                # and report an empty result as "no output".
                # NOTE(review): relies on `sql()` returning None when there
                # is no result set - confirm against the duckdb version in use.
                if rel is not None:
                    res = collector(rel)
                    del res
                else:
                    print_msg(f"Query '{query}' did not produce output")
                query_time = time.perf_counter() - start
                print_msg(f"Q{str(i).ljust(len(str(nruns)), ' ')}: {query_time}")
                duration += query_time
            print_msg(f"T{padding}: {duration}s")
            result.add(duration)
        return result
|
|
|
|
|
|
def test_tpch():
    """Run the two TPC-H based benchmarks and write their results.

    First, fetching ``lineitem`` is timed once per collector (native tuples,
    pandas, Arrow). Then the TPC-H tables are converted to pandas and Arrow,
    registered on a fresh ``TPCHBenchmarker``, and the TPC-H queries are timed
    on top of each format.
    """
    print_msg(f"Generating TPCH (sf={scale_factor})")
    tpch = TPCHData(scale_factor)

    ## -------- Benchmark converting LineItem to different formats ---------

    def fetch_native(rel: duckdb.DuckDBPyRelation):
        """Materialize the relation with ``fetchall()``."""
        return rel.fetchall()

    def fetch_pandas(rel: duckdb.DuckDBPyRelation):
        """Materialize the relation with ``df()``."""
        return rel.df()

    def fetch_arrow(rel: duckdb.DuckDBPyRelation):
        """Materialize the relation with ``arrow()``."""
        return rel.arrow()

    COLLECTORS = {'native': fetch_native, 'pandas': fetch_pandas, 'arrow': fetch_arrow}

    # Each collector loads lineitem `nruns` times.
    for label, fetch in COLLECTORS.items():
        result: BenchmarkResult = tpch.load_lineitem(fetch, label + "_load_lineitem")
        print_msg(result.name)
        print_msg(label)
        result.write()

    ## ------- Benchmark running TPCH queries on top of different formats --------

    def convert_pandas(conn: duckdb.DuckDBPyConnection, table_name: str):
        """Read the whole table through ``df()``."""
        return conn.execute(f"SELECT * FROM {table_name}").df()

    def convert_arrow(conn: duckdb.DuckDBPyConnection, table_name: str):
        """Read the table via pandas, then build an Arrow table from the frame."""
        return pa.Table.from_pandas(convert_pandas(conn, table_name))

    CONVERTORS = {'pandas': convert_pandas, 'arrow': convert_arrow}

    # Convert the data to each format, then run the TPC-H queries on it,
    # collecting results with the collector of the same name.
    for fmt, convert in CONVERTORS.items():
        tables = tpch.get_tables(convert)
        tester = TPCHBenchmarker(fmt)
        tester.register_tables(tables)
        result: BenchmarkResult = tester.run_tpch(COLLECTORS[fmt], f"{fmt}tpch")
        result.write()
|
|
|
|
|
|
def generate_string(seed: int):
    """Encode ``seed`` as a 10-letter string over ``A``-``Z``.

    The result spells the ten least-significant base-26 digits of ``seed``,
    least significant first, with digit 0 mapped to ``'A'`` and 25 to ``'Z'``.

    Args:
        seed: Number to encode.

    Returns:
        A string of exactly 10 uppercase letters.
    """
    letters = []
    for _ in range(10):
        # divmod keeps integer seeds in integer arithmetic. True division
        # (`/`) would turn the seed into a float after the first step and
        # round away low-order digits for large seeds.
        seed, digit = divmod(seed, 26)
        letters.append(chr(ord('A') + int(digit)))
    return ''.join(letters)
|
|
|
|
|
|
class ArrowDictionary:
    """A list of generated strings, one per index below ``unique_values``."""

    def __init__(self, unique_values):
        """Build ``generate_string(i)`` for every ``i`` in ``range(unique_values)``.

        Args:
            unique_values: Number of entries to generate; also stored as ``size``.
        """
        self.size = unique_values
        self.dict = list(map(generate_string, range(unique_values)))
|
|
|
|
|
|
class ArrowDictionaryBenchmark:
    """Times scanning a dictionary-typed Arrow column through DuckDB."""

    def __init__(self, unique_values, values, arrow_dict: ArrowDictionary):
        """Open a connection and build the Arrow table to scan.

        Args:
            unique_values: How many entries of ``arrow_dict`` to cycle through;
                must not exceed ``arrow_dict.size``.
            values: Total number of rows to generate.
            arrow_dict: Source of the string values.
        """
        assert unique_values <= arrow_dict.size
        self.initialize_connection()
        self.generate(unique_values, values, arrow_dict)

    def initialize_connection(self):
        """Create ``self.con`` and apply the module-level ``threads`` limit, if any."""
        self.con = duckdb.connect()
        if not threads:
            return
        print_msg(f'Limiting threads to {threads}')
        self.con.execute(f"SET threads={threads}")

    def generate(self, unique_values, values, arrow_dict: ArrowDictionary):
        """Build the input column, the expected rows and the Arrow table.

        Row ``x`` holds ``arrow_dict.dict[x % unique_values]``. ``self.expected``
        holds the same values wrapped in 1-tuples, the shape ``benchmark``
        compares the ``fetchall()`` output against.
        """
        self.input = [arrow_dict.dict[x % unique_values] for x in range(values)]
        self.expected = [(value,) for value in self.input]

        array = pa.array(
            self.input,
            type=pa.dictionary(pa.int64(), pa.string()),
        )
        self.table = pa.table([array], names=["x"])

    def benchmark(self, benchmark_name) -> BenchmarkResult:
        """Time ``select * from arrow_table`` + ``fetchall()``, ``nruns`` times.

        Args:
            benchmark_name: Name given to the returned result.

        Returns:
            A ``BenchmarkResult`` with one duration (in seconds) per run.

        Raises:
            AssertionError: If a run returns rows different from ``self.expected``.
        """
        self.con.register('arrow_table', self.table)
        result = BenchmarkResult(benchmark_name)
        # Loop-invariant label padding for the verbose output.
        padding = " " * len(str(nruns))
        for _ in range(nruns):
            # perf_counter is monotonic and high-resolution, unlike the wall
            # clock returned by time.time().
            start = time.perf_counter()
            res = self.con.execute(
                """
                select * from arrow_table
            """
            ).fetchall()
            duration = time.perf_counter() - start
            # Verification and cleanup stay outside the timed region.
            assert self.expected == res
            del res
            print_msg(f"T{padding}: {duration}s")
            result.add(duration)
        return result
|
|
|
|
|
|
class SelectAndCallBenchmark:
    """Times fetching the same table function through ``select * from`` and ``call``."""

    def __init__(self):
        """
        SELECT statements become QueryRelations, any other statement type becomes a MaterializedRelation.
        We use SELECT and CALL here because their execution plans are identical
        """
        self.initialize_connection()

    def initialize_connection(self):
        """Create ``self.con`` and apply the module-level ``threads`` limit, if any."""
        self.con = duckdb.connect()
        if not threads:
            return
        print_msg(f'Limiting threads to {threads}')
        self.con.execute(f"SET threads={threads}")

    def benchmark(self, name, query) -> List[BenchmarkResult]:
        """Time ``fetchall()`` for each statement kind and row count.

        Args:
            name: Label embedded in each result name.
            query: Table-function call containing a ``{rows}`` placeholder,
                which is filled with each row count in turn.

        Returns:
            One ``BenchmarkResult`` per (statement kind, row count) pair, named
            ``<kind>_<name>_<rowcount>``, each holding ``nruns`` durations in
            seconds.
        """
        results: List[BenchmarkResult] = []
        methods = {'select': 'select * from ', 'call': 'call '}
        # Loop-invariant label padding for the verbose output.
        padding = " " * len(str(nruns))
        for key, value in methods.items():
            for rowcount in [2048, 50000, 2500000]:
                result = BenchmarkResult(f'{key}_{name}_{rowcount}')
                query_string = value + query.format(rows=rowcount)
                # The relation is created once and fetched repeatedly below.
                rel = self.con.sql(query_string)
                print_msg(rel.type)
                for _ in range(nruns):
                    # perf_counter is monotonic and high-resolution, unlike
                    # the wall clock returned by time.time().
                    start = time.perf_counter()
                    rel.fetchall()
                    duration = time.perf_counter() - start
                    print_msg(f"T{padding}: {duration}s")
                    result.add(duration)
                results.append(result)
        return results
|
|
|
|
|
|
class PandasDFLoadBenchmark:
    """Times repeatedly scanning a wide pandas DataFrame through DuckDB."""

    def __init__(self):
        """Open a connection and write the CSV file the benchmark reads."""
        self.initialize_connection()
        self.generate()

    def initialize_connection(self):
        """Create ``self.con`` and apply the module-level ``threads`` limit, if any."""
        self.con = duckdb.connect()
        if not threads:
            return
        print_msg(f'Limiting threads to {threads}')
        self.con.execute(f"SET threads={threads}")

    def generate(self):
        """Create the ``wide`` table and export it to ``wide_table.csv``.

        The table selects 500 ``lineitem`` rows (generated at ``sf=0.1``) with
        all original columns plus ``l_shipdate`` repeated 300 times. The CSV
        is written to the current working directory and is not removed.
        """
        self.con.execute("call dbgen(sf=0.1)")
        new_table = "*, " + ", ".join(["l_shipdate"] * 300)
        self.con.execute(f"create table wide as select {new_table} from lineitem limit 500")
        self.con.execute("copy wide to 'wide_table.csv' (FORMAT CSV)")

    def benchmark(self, benchmark_name) -> BenchmarkResult:
        """Time 30 consecutive scans of the DataFrame, ``nruns`` times.

        Args:
            benchmark_name: Name given to the returned result.

        Returns:
            A ``BenchmarkResult`` where each entry is the time (in seconds)
            of 30 ``select * from pandas_df`` + ``df()`` round trips.
        """
        result = BenchmarkResult(benchmark_name)
        for _ in range(nruns):
            # Reading the CSV is not part of the measurement. The local name
            # `pandas_df` must be kept: the SQL text below refers to it.
            # NOTE(review): presumably resolved by DuckDB's replacement scan
            # of Python variables - confirm.
            pandas_df = pd.read_csv('wide_table.csv')
            # perf_counter is monotonic and high-resolution, unlike the wall
            # clock returned by time.time().
            start = time.perf_counter()
            for _ in range(30):
                res = self.con.execute("""select * from pandas_df""").df()
            duration = time.perf_counter() - start
            del res
            result.add(duration)
        return result
|
|
|
|
|
|
class PandasAnalyzerBenchmark:
    """Times scanning an object-dtype DataFrame that is almost entirely ``None``."""

    def __init__(self):
        """Open a connection; ``generate`` is called but does nothing."""
        self.initialize_connection()
        self.generate()

    def initialize_connection(self):
        """Create ``self.con`` and apply the module-level ``threads`` limit, if any."""
        self.con = duckdb.connect()
        if not threads:
            return
        print_msg(f'Limiting threads to {threads}')
        self.con.execute(f"SET threads={threads}")

    def generate(self):
        """No-op; the input data is built inside ``benchmark``."""
        return

    def benchmark(self, benchmark_name) -> BenchmarkResult:
        """Time 30 consecutive scans of the DataFrame, ``nruns`` times.

        Args:
            benchmark_name: Name given to the returned result.

        Returns:
            A ``BenchmarkResult`` where each entry is the time (in seconds)
            of 30 ``select * from pandas_df`` + ``df()`` round trips.
        """
        result = BenchmarkResult(benchmark_name)
        data = [None] * 9999999 + [1]  # Last element is 1, others are None

        # Create the DataFrame with the specified data and column type as object.
        # The local name `pandas_df` must be kept: the SQL text below refers
        # to it.
        # NOTE(review): presumably resolved by DuckDB's replacement scan of
        # Python variables - confirm.
        pandas_df = pd.DataFrame(data, columns=['Column'], dtype=object)
        for _ in range(nruns):
            # perf_counter is monotonic and high-resolution, unlike the wall
            # clock returned by time.time().
            start = time.perf_counter()
            for _ in range(30):
                res = self.con.execute("""select * from pandas_df""").df()
            duration = time.perf_counter() - start
            del res
            result.add(duration)
        return result
|
|
|
|
|
|
def test_arrow_dictionaries_scan():
    """Benchmark Arrow dictionary scans for several numbers of distinct values.

    A dictionary of 26 * 1000 strings is generated once. For each distinct-value
    count (2, 1000, and the full dictionary) a 10,000,000-row column is built
    and scanned, and the timings are written out.
    """
    dict_size = 26 * 1000
    dataset_size = 10000000

    print_msg(f"Generating a unique dictionary of size {dict_size}")
    arrow_dict = ArrowDictionary(dict_size)

    for unique_values in (2, 1000, dict_size):
        bench = ArrowDictionaryBenchmark(unique_values, dataset_size, arrow_dict)
        bench.benchmark(f"arrow_dict_unique_{unique_values}_total_{dataset_size}").write()
|
|
|
|
|
|
def test_loading_pandas_df_many_times():
    """Run ``PandasDFLoadBenchmark`` and write its timings."""
    PandasDFLoadBenchmark().benchmark("load_pandas_df_many_times").write()
|
|
|
|
|
|
def test_pandas_analyze():
    """Run ``PandasAnalyzerBenchmark`` and write its timings."""
    PandasAnalyzerBenchmark().benchmark("pandas_analyze").write()
|
|
|
|
|
|
def test_call_and_select_statements():
    """Run ``SelectAndCallBenchmark`` for each query template and write the timings.

    Each template is a table-function call with a ``{rows}`` placeholder that
    the benchmark fills in.
    """
    bench = SelectAndCallBenchmark()
    templates = {
        'repeat_row': "repeat_row(42, 'test', True, 'this is a long string', num_rows={rows})",
    }
    for label, template in templates.items():
        for outcome in bench.benchmark(label, template):
            outcome.write()
|
|
|
|
|
|
def main():
    """Run every benchmark suite in order, then close the result file.

    ``close_result`` runs in a ``finally`` clause, so the output file is
    closed even when one of the suites raises. The exception still propagates.
    """
    try:
        test_tpch()
        test_arrow_dictionaries_scan()
        test_loading_pandas_df_many_times()
        test_pandas_analyze()
        test_call_and_select_statements()
    finally:
        close_result()
|
|
|
|
|
|
# Run the benchmarks only when this file is executed as a script.
if __name__ == "__main__":
    main()
|