should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion

import duckdb
import pandas as pd
import pyarrow as pa
import time
import argparse
from typing import Dict, List, Any
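# Fetch the TPC-H benchmark queries from DuckDB's tpch extension once, up front.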
TPCH_QUERIES = []
res = duckdb.execute(
"""
select query from tpch_queries()
"""
).fetchall()
for x in res:
TPCH_QUERIES.append(x[0])
parser = argparse.ArgumentParser()
parser.add_argument("--verbose", action="store_true", help="Enable verbose mode", default=False)
parser.add_argument("--threads", type=int, help="Number of threads", default=None)
parser.add_argument("--nruns", type=int, help="Number of runs", default=10)
parser.add_argument("--out-file", type=str, help="Output file path", default=None)
parser.add_argument("--scale-factor", type=float, help="Set the scale factor TPCH is generated at", default=1.0)
args, unknown_args = parser.parse_known_args()
verbose = args.verbose
threads = args.threads
nruns = args.nruns
out_file = args.out_file
scale_factor = args.scale_factor
if unknown_args:
parser.error(f"Unrecognized parameter(s): {', '.join(unknown_args)}")
def print_msg(message: str):
if not verbose:
return
print(message)
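# Lazily opens the output file on first call and caches the handle as a
# function attribute, so results stream to a single file across all benchmarks.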
def write_result(benchmark_name, nrun, t):
bench_result = f"{benchmark_name}\t{nrun}\t{t}"
if out_file is not None:
if not hasattr(write_result, 'file'):
write_result.file = open(out_file, 'w+')
write_result.file.write(bench_result)
write_result.file.write('\n')
else:
print_msg(bench_result)
def close_result():
if not hasattr(write_result, 'file'):
return
write_result.file.close()
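# Collects one duration per run for a named benchmark and emits them via write_result.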
class BenchmarkResult:
def __init__(self, name):
self.name = name
self.runs: List[float] = []
def add(self, duration: float):
self.runs.append(duration)
def write(self):
for i, run in enumerate(self.runs):
write_result(self.name, i, run)
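# Generates TPC-H data at the given scale factor in an in-memory DuckDB database.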
class TPCHData:
TABLES = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"]
def __init__(self, scale_factor):
self.conn = duckdb.connect()
self.conn.execute(f'CALL dbgen(sf={scale_factor})')
def get_tables(self, convertor) -> Dict[str, Any]:
res = {}
for table in self.TABLES:
res[table] = convertor(self.conn, table)
return res
def load_lineitem(self, collector, benchmark_name) -> BenchmarkResult:
query = 'SELECT * FROM lineitem'
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
start = time.time()
rel = self.conn.sql(query)
res = collector(rel)
end = time.time()
duration = float(end - start)
del res
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
return result
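# Runs the TPC-H queries on tables registered from an external format (pandas or Arrow).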
class TPCHBenchmarker:
def __init__(self, name: str):
self.initialize_connection()
self.name = name
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def register_tables(self, tables: Dict[str, Any]):
for name, table in tables.items():
self.con.register(name, table)
def run_tpch(self, collector, benchmark_name) -> BenchmarkResult:
print_msg("")
print_msg(TPCH_QUERIES)
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
# Execute all queries
for i, query in enumerate(TPCH_QUERIES):
start = time.time()
rel = self.con.sql(query)
if rel:
res = collector(rel)
del res
else:
print_msg(f"Query '{query}' did not produce output")
end = time.time()
query_time = float(end - start)
print_msg(f"Q{str(i).ljust(len(str(nruns)), ' ')}: {query_time}")
duration += float(end - start)
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
return result
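# End-to-end TPC-H benchmark: first time materializing lineitem into each client
# format, then time the full query set on pandas- and Arrow-backed tables.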
def test_tpch():
print_msg(f"Generating TPCH (sf={scale_factor})")
tpch = TPCHData(scale_factor)
## -------- Benchmark converting LineItem to different formats ---------
def fetch_native(rel: duckdb.DuckDBPyRelation):
return rel.fetchall()
def fetch_pandas(rel: duckdb.DuckDBPyRelation):
return rel.df()
def fetch_arrow(rel: duckdb.DuckDBPyRelation):
return rel.arrow()
COLLECTORS = {'native': fetch_native, 'pandas': fetch_pandas, 'arrow': fetch_arrow}
    # For every collector, load lineitem 'nruns' times
for collector in COLLECTORS:
result: BenchmarkResult = tpch.load_lineitem(COLLECTORS[collector], collector + "_load_lineitem")
print_msg(result.name)
print_msg(collector)
result.write()
## ------- Benchmark running TPCH queries on top of different formats --------
def convert_pandas(conn: duckdb.DuckDBPyConnection, table_name: str):
return conn.execute(f"SELECT * FROM {table_name}").df()
def convert_arrow(conn: duckdb.DuckDBPyConnection, table_name: str):
df = convert_pandas(conn, table_name)
return pa.Table.from_pandas(df)
CONVERTORS = {'pandas': convert_pandas, 'arrow': convert_arrow}
# Convert TPCH data to the right format, then run TPCH queries on that data
for convertor in CONVERTORS:
tables = tpch.get_tables(CONVERTORS[convertor])
tester = TPCHBenchmarker(convertor)
tester.register_tables(tables)
collector = COLLECTORS[convertor]
result: BenchmarkResult = tester.run_tpch(collector, f"{convertor}tpch")
result.write()
def generate_string(seed: int) -> str:
    # Encode the seed as a fixed-width base-26 string of uppercase letters.
    output = ''
    for _ in range(10):
        output += chr(ord('A') + seed % 26)
        seed //= 26  # integer division: '/=' would silently turn the seed into a float
    return output
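# Pre-computes a pool of unique dictionary strings to draw values from.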
class ArrowDictionary:
def __init__(self, unique_values):
self.size = unique_values
self.dict = [generate_string(x) for x in range(unique_values)]
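# Benchmarks scanning a dictionary-encoded Arrow table through DuckDB and checks
# the fetched rows against the expected values.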
class ArrowDictionaryBenchmark:
def __init__(self, unique_values, values, arrow_dict: ArrowDictionary):
assert unique_values <= arrow_dict.size
self.initialize_connection()
self.generate(unique_values, values, arrow_dict)
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def generate(self, unique_values, values, arrow_dict: ArrowDictionary):
self.input = []
self.expected = []
for x in range(values):
value = arrow_dict.dict[x % unique_values]
self.input.append(value)
self.expected.append((value,))
array = pa.array(
self.input,
type=pa.dictionary(pa.int64(), pa.string()),
)
self.table = pa.table([array], names=["x"])
def benchmark(self, benchmark_name) -> BenchmarkResult:
self.con.register('arrow_table', self.table)
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
start = time.time()
res = self.con.execute(
"""
select * from arrow_table
"""
).fetchall()
end = time.time()
duration = float(end - start)
assert self.expected == res
del res
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
return result
class SelectAndCallBenchmark:
def __init__(self):
"""
SELECT statements become QueryRelations, any other statement type becomes a MaterializedRelation.
We use SELECT and CALL here because their execution plans are identical
"""
self.initialize_connection()
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def benchmark(self, name, query) -> List[BenchmarkResult]:
results: List[BenchmarkResult] = []
methods = {'select': 'select * from ', 'call': 'call '}
for key, value in methods.items():
for rowcount in [2048, 50000, 2500000]:
result = BenchmarkResult(f'{key}_{name}_{rowcount}')
query_string = query.format(rows=rowcount)
query_string = value + query_string
rel = self.con.sql(query_string)
print_msg(rel.type)
for _ in range(nruns):
duration = 0.0
start = time.time()
rel.fetchall()
end = time.time()
duration = float(end - start)
padding = " " * len(str(nruns))
print_msg(f"T{padding}: {duration}s")
result.add(duration)
results.append(result)
return results
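# Measures repeated DuckDB scans of a wide (300+ column) pandas DataFrame that is
# materialized once per run from a CSV file.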
class PandasDFLoadBenchmark:
def __init__(self):
self.initialize_connection()
self.generate()
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
    def generate(self):
        self.con.execute("call dbgen(sf=0.1)")
        # Widen lineitem by duplicating l_shipdate 300 times, then round-trip
        # the result through a CSV file so pandas can load it.
        select_list = "*, " + ", ".join(["l_shipdate"] * 300)
        self.con.execute(f"create table wide as select {select_list} from lineitem limit 500")
        self.con.execute("copy wide to 'wide_table.csv' (FORMAT CSV)")
def benchmark(self, benchmark_name) -> BenchmarkResult:
result = BenchmarkResult(benchmark_name)
for _ in range(nruns):
duration = 0.0
pandas_df = pd.read_csv('wide_table.csv')
start = time.time()
for _ in range(30):
res = self.con.execute("""select * from pandas_df""").df()
end = time.time()
duration = float(end - start)
del res
result.add(duration)
return result
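# Stresses pandas object-column analysis: the single non-null value sits at the
# end of a ~10M-row object column, forcing the analyzer to look deep into the
# column before it can settle on a type.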
class PandasAnalyzerBenchmark:
def __init__(self):
self.initialize_connection()
self.generate()
def initialize_connection(self):
self.con = duckdb.connect()
if not threads:
return
print_msg(f'Limiting threads to {threads}')
self.con.execute(f"SET threads={threads}")
def generate(self):
return
def benchmark(self, benchmark_name) -> BenchmarkResult:
result = BenchmarkResult(benchmark_name)
data = [None] * 9999999 + [1] # Last element is 1, others are None
# Create the DataFrame with the specified data and column type as object
pandas_df = pd.DataFrame(data, columns=['Column'], dtype=object)
for _ in range(nruns):
duration = 0.0
start = time.time()
for _ in range(30):
res = self.con.execute("""select * from pandas_df""").df()
end = time.time()
duration = float(end - start)
del res
result.add(duration)
return result
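# Scan dictionary-encoded Arrow data at several dictionary cardinalities.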
def test_arrow_dictionaries_scan():
DICT_SIZE = 26 * 1000
print_msg(f"Generating a unique dictionary of size {DICT_SIZE}")
arrow_dict = ArrowDictionary(DICT_SIZE)
DATASET_SIZE = 10000000
for unique_values in [2, 1000, DICT_SIZE]:
test = ArrowDictionaryBenchmark(unique_values, DATASET_SIZE, arrow_dict)
benchmark_name = f"arrow_dict_unique_{unique_values}_total_{DATASET_SIZE}"
result = test.benchmark(benchmark_name)
result.write()
def test_loading_pandas_df_many_times():
test = PandasDFLoadBenchmark()
benchmark_name = f"load_pandas_df_many_times"
result = test.benchmark(benchmark_name)
result.write()
def test_pandas_analyze():
test = PandasAnalyzerBenchmark()
benchmark_name = f"pandas_analyze"
result = test.benchmark(benchmark_name)
result.write()
def test_call_and_select_statements():
test = SelectAndCallBenchmark()
queries = {
'repeat_row': "repeat_row(42, 'test', True, 'this is a long string', num_rows={rows})",
}
for key, value in queries.items():
results = test.benchmark(key, value)
for res in results:
res.write()
def main():
test_tpch()
test_arrow_dictionaries_scan()
test_loading_pandas_df_many_times()
test_pandas_analyze()
test_call_and_select_statements()
close_result()
if __name__ == '__main__':
main()