# email-tracker/external/duckdb/scripts/regression_test_python.py

import os
import sys
import duckdb
import pandas as pd
import pyarrow as pa
import time
import argparse
from typing import Dict, List, Any
import numpy as np

# Fetch the text of every query exposed by tpch_queries() once, when the
# module is loaded; the TPC-H benchmark replays this list on each run.
_tpch_query_rows = duckdb.execute(
    """
    select query from tpch_queries()
"""
).fetchall()
# Each fetched row is a tuple whose first element is the query text.
TPCH_QUERIES = [row[0] for row in _tpch_query_rows]

# Command-line options of the benchmark script.
parser = argparse.ArgumentParser()
parser.add_argument("--verbose", default=False, action="store_true", help="Enable verbose mode")
parser.add_argument("--threads", default=None, type=int, help="Number of threads")
parser.add_argument("--nruns", default=10, type=int, help="Number of runs")
parser.add_argument("--out-file", default=None, type=str, help="Output file path")
parser.add_argument("--scale-factor", default=1.0, type=float, help="Set the scale factor TPCH is generated at")
args, unknown_args = parser.parse_known_args()

# parse_known_args() collects unrecognised arguments instead of failing,
# so reject them explicitly (parser.error() exits the process).
if unknown_args:
    unknown_list = ', '.join(unknown_args)
    parser.error(f"Unrecognized parameter(s): {unknown_list}")

# Module-level settings read by the functions and classes below.
verbose, threads, nruns = args.verbose, args.threads, args.nruns
out_file, scale_factor = args.out_file, args.scale_factor


def print_msg(message: str):
    """Print ``message`` to stdout when the module-level ``verbose`` flag is set.

    Does nothing when ``verbose`` is falsy.
    """
    if verbose:
        print(message)


def write_result(benchmark_name, nrun, t):
    """Emit one tab-separated result line: benchmark name, run number, value.

    When the module-level ``out_file`` is set, the line is written to that
    file; the handle is opened on first use and cached as the function
    attribute ``write_result.file``. Otherwise the line goes through
    ``print_msg`` and is therefore only shown in verbose mode.
    """
    line = f"{benchmark_name}\t{nrun}\t{t}"
    if out_file is None:
        print_msg(line)
        return
    # Lazily open the output file the first time a result is written.
    if not hasattr(write_result, 'file'):
        write_result.file = open(out_file, 'w+')
    sink = write_result.file
    sink.write(line)
    sink.write('\n')


def close_result():
    """Close the cached results file, if ``write_result`` ever opened one."""
    # write_result stores its handle as a function attribute on first write;
    # when that attribute is absent there is nothing to close.
    if hasattr(write_result, 'file'):
        write_result.file.close()


class BenchmarkResult:
    """Named list of run durations collected for one benchmark."""

    def __init__(self, name):
        # Durations in the order the runs were recorded.
        self.runs: List[float] = []
        self.name = name

    def add(self, duration: float):
        """Record the duration of one more run."""
        self.runs += [duration]

    def write(self):
        """Pass every recorded run to ``write_result``, numbered from 0."""
        for run_index, elapsed in enumerate(self.runs):
            write_result(self.name, run_index, elapsed)


class TPCHData:
    """DuckDB connection holding generated TPC-H data, plus helpers to read it."""

    # Table names handed to the convertor in get_tables().
    TABLES = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"]

    def __init__(self, scale_factor):
        """Open a new connection and run ``CALL dbgen`` on it.

        Args:
            scale_factor: Value passed as ``sf`` to ``dbgen``.
        """
        self.conn = duckdb.connect()
        self.conn.execute(f'CALL dbgen(sf={scale_factor})')

    def get_tables(self, convertor) -> Dict[str, Any]:
        """Convert every table listed in ``TABLES``.

        Args:
            convertor: Callable invoked as ``convertor(conn, table_name)``.

        Returns:
            Mapping from table name to the value ``convertor`` returned for it,
            in ``TABLES`` order.
        """
        return {table: convertor(self.conn, table) for table in self.TABLES}

    def load_lineitem(self, collector, benchmark_name) -> BenchmarkResult:
        """Time ``nruns`` full reads of the ``lineitem`` table.

        Each run times creating the relation and passing it to ``collector``.

        Args:
            collector: Callable that takes the relation and materializes it.
            benchmark_name: Name given to the returned result.

        Returns:
            A ``BenchmarkResult`` with one duration (in seconds) per run.
        """
        query = 'SELECT * FROM lineitem'
        result = BenchmarkResult(benchmark_name)
        # Loop-invariant: padding that aligns the "T" label in verbose output.
        padding = " " * len(str(nruns))
        for _ in range(nruns):
            # perf_counter() is the clock intended for measuring intervals;
            # time.time() can jump when the system clock is adjusted.
            start = time.perf_counter()
            rel = self.conn.sql(query)
            res = collector(rel)
            duration = time.perf_counter() - start
            # Drop the collected data before the next run starts.
            del res
            print_msg(f"T{padding}: {duration}s")
            result.add(duration)
        return result


class TPCHBenchmarker:
    """Runs the queries in ``TPCH_QUERIES`` against tables registered on its own connection."""

    def __init__(self, name: str):
        self.initialize_connection()
        self.name = name

    def initialize_connection(self):
        """Open a new connection; apply the thread limit when ``threads`` is set."""
        self.con = duckdb.connect()
        if not threads:
            return
        print_msg(f'Limiting threads to {threads}')
        self.con.execute(f"SET threads={threads}")

    def register_tables(self, tables: Dict[str, Any]):
        """Register every ``name -> table`` pair on this benchmarker's connection."""
        for name, table in tables.items():
            self.con.register(name, table)

    def run_tpch(self, collector, benchmark_name) -> BenchmarkResult:
        """Time ``nruns`` passes over all queries in ``TPCH_QUERIES``.

        Args:
            collector: Callable that takes a query's relation and materializes it.
            benchmark_name: Name given to the returned result.

        Returns:
            A ``BenchmarkResult`` with one entry per run: the summed time
            (in seconds) of all queries in that run.
        """
        print_msg("")
        print_msg(TPCH_QUERIES)
        result = BenchmarkResult(benchmark_name)
        # Loop-invariant widths used to align the verbose "Q"/"T" labels.
        label_width = len(str(nruns))
        padding = " " * label_width
        for _ in range(nruns):
            duration = 0.0
            # Execute all queries
            for i, query in enumerate(TPCH_QUERIES):
                # perf_counter() is the clock intended for measuring intervals.
                start = time.perf_counter()
                rel = self.con.sql(query)
                # Test for "no relation" by identity: truthiness of a relation
                # object is not the same question and may itself be costly.
                # NOTE(review): presumes sql() returns None for statements
                # without a result set -- confirm against the DuckDB API.
                if rel is not None:
                    res = collector(rel)
                    del res
                else:
                    print_msg(f"Query '{query}' did not produce output")
                query_time = time.perf_counter() - start
                print_msg(f"Q{str(i).ljust(label_width, ' ')}: {query_time}")
                duration += query_time
            # Report the run total once per run, not after every query.
            print_msg(f"T{padding}: {duration}s")
            result.add(duration)
        return result


def test_tpch():
    """Benchmark reading lineitem into several formats, then TPC-H over pandas/arrow tables.

    Every ``BenchmarkResult`` produced is emitted through its ``write()`` method.
    """
    print_msg(f"Generating TPCH (sf={scale_factor})")
    tpch = TPCHData(scale_factor)

    ## -------- Benchmark converting LineItem to different formats ---------

    def fetch_native(rel: duckdb.DuckDBPyRelation):
        return rel.fetchall()

    def fetch_pandas(rel: duckdb.DuckDBPyRelation):
        return rel.df()

    def fetch_arrow(rel: duckdb.DuckDBPyRelation):
        return rel.arrow()

    COLLECTORS = {'native': fetch_native, 'pandas': fetch_pandas, 'arrow': fetch_arrow}
    # For every collector, load lineitem 'nrun' times
    for fmt, fetch in COLLECTORS.items():
        load_result: BenchmarkResult = tpch.load_lineitem(fetch, f"{fmt}_load_lineitem")
        print_msg(load_result.name)
        print_msg(fmt)
        load_result.write()

    ## ------- Benchmark running TPCH queries on top of different formats --------

    def convert_pandas(conn: duckdb.DuckDBPyConnection, table_name: str):
        return conn.execute(f"SELECT * FROM {table_name}").df()

    def convert_arrow(conn: duckdb.DuckDBPyConnection, table_name: str):
        # The arrow table is built from the pandas conversion of the same table.
        return pa.Table.from_pandas(convert_pandas(conn, table_name))

    CONVERTORS = {'pandas': convert_pandas, 'arrow': convert_arrow}
    # Convert TPCH data to the right format, then run TPCH queries on that data
    for fmt, convert in CONVERTORS.items():
        tables = tpch.get_tables(convert)
        tester = TPCHBenchmarker(fmt)
        tester.register_tables(tables)
        # Results are collected in the same format the input tables use.
        query_result: BenchmarkResult = tester.run_tpch(COLLECTORS[fmt], f"{fmt}tpch")
        query_result.write()


def generate_string(seed: int):
    """Encode ``seed`` as a 10-letter string over ``A``-``Z``.

    The letters are the base-26 digits of ``seed``, least-significant digit
    first (``A`` = 0 ... ``Z`` = 25). Digits beyond the tenth are dropped.

    Args:
        seed: Number to encode.

    Returns:
        A string of exactly 10 uppercase letters.
    """
    letters = []
    for _ in range(10):
        # Floor division keeps an integer seed exact; true division would
        # turn it into a float and lose precision for large values.
        seed, digit = divmod(seed, 26)
        letters.append(chr(ord('A') + int(digit)))
    return ''.join(letters)


class ArrowDictionary:
    """Pool of generated strings, one per index in ``range(unique_values)``."""

    def __init__(self, unique_values):
        # Entry i is generate_string(i).
        entries = [generate_string(seed) for seed in range(unique_values)]
        self.dict = entries
        self.size = unique_values


class ArrowDictionaryBenchmark:
    """Times scanning a dictionary-typed Arrow string column through DuckDB."""

    def __init__(self, unique_values, values, arrow_dict: ArrowDictionary):
        """Open a connection and build the input table.

        Args:
            unique_values: How many entries of ``arrow_dict`` to cycle through;
                must not exceed ``arrow_dict.size``.
            values: Total number of rows to generate.
            arrow_dict: Pool the row values are taken from.
        """
        assert unique_values <= arrow_dict.size
        self.initialize_connection()
        self.generate(unique_values, values, arrow_dict)

    def initialize_connection(self):
        """Open a new connection; apply the thread limit when ``threads`` is set."""
        self.con = duckdb.connect()
        if not threads:
            return
        print_msg(f'Limiting threads to {threads}')
        self.con.execute(f"SET threads={threads}")

    def generate(self, unique_values, values, arrow_dict: ArrowDictionary):
        """Build ``self.input``, ``self.expected`` and the Arrow table ``self.table``.

        Row ``x`` holds ``arrow_dict.dict[x % unique_values]``; ``self.expected``
        holds the same values wrapped in 1-tuples, the shape ``benchmark``
        compares against the ``fetchall()`` result.
        """
        self.input = [arrow_dict.dict[x % unique_values] for x in range(values)]
        self.expected = [(value,) for value in self.input]

        array = pa.array(
            self.input,
            type=pa.dictionary(pa.int64(), pa.string()),
        )
        self.table = pa.table([array], names=["x"])

    def benchmark(self, benchmark_name) -> BenchmarkResult:
        """Time ``nruns`` full scans of the table and verify each result.

        Args:
            benchmark_name: Name given to the returned result.

        Returns:
            A ``BenchmarkResult`` with one duration (in seconds) per run.
        """
        self.con.register('arrow_table', self.table)
        result = BenchmarkResult(benchmark_name)
        # Loop-invariant: padding that aligns the "T" label in verbose output.
        padding = " " * len(str(nruns))
        for _ in range(nruns):
            # perf_counter() is the clock intended for measuring intervals;
            # time.time() can jump when the system clock is adjusted.
            start = time.perf_counter()
            res = self.con.execute(
                """
                select * from arrow_table
            """
            ).fetchall()
            duration = time.perf_counter() - start
            # The correctness check is kept outside the timed region.
            assert self.expected == res
            del res
            print_msg(f"T{padding}: {duration}s")
            result.add(duration)
        return result


class SelectAndCallBenchmark:
    """Times fetching the same table function via ``select * from`` and via ``call``."""

    def __init__(self):
        """
        SELECT statements become QueryRelations, any other statement type becomes a MaterializedRelation.
        We use SELECT and CALL here because their execution plans are identical
        """
        self.initialize_connection()

    def initialize_connection(self):
        """Open a new connection; apply the thread limit when ``threads`` is set."""
        self.con = duckdb.connect()
        if not threads:
            return
        print_msg(f'Limiting threads to {threads}')
        self.con.execute(f"SET threads={threads}")

    def benchmark(self, name, query) -> List[BenchmarkResult]:
        """Time ``rel.fetchall()`` for each statement kind and row count.

        Args:
            name: Label embedded in each result name.
            query: Template containing a ``{rows}`` placeholder; it is
                prefixed with ``select * from `` or ``call ``.

        Returns:
            One ``BenchmarkResult`` per (statement kind, row count) pair, each
            named ``{kind}_{name}_{rowcount}`` and holding ``nruns`` durations.
        """
        results: List[BenchmarkResult] = []
        methods = {'select': 'select * from ', 'call': 'call '}
        # Loop-invariant: padding that aligns the "T" label in verbose output.
        padding = " " * len(str(nruns))
        for key, value in methods.items():
            for rowcount in [2048, 50000, 2500000]:
                result = BenchmarkResult(f'{key}_{name}_{rowcount}')
                query_string = value + query.format(rows=rowcount)
                # The relation is created once; only fetchall() is timed.
                rel = self.con.sql(query_string)
                print_msg(rel.type)
                for _ in range(nruns):
                    # perf_counter() is the clock intended for measuring intervals.
                    start = time.perf_counter()
                    rel.fetchall()
                    duration = time.perf_counter() - start
                    print_msg(f"T{padding}: {duration}s")
                    result.add(duration)
                results.append(result)
        return results


class PandasDFLoadBenchmark:
    """Times repeatedly querying a wide pandas DataFrame through DuckDB."""

    def __init__(self):
        self.initialize_connection()
        self.generate()

    def initialize_connection(self):
        """Open a new connection; apply the thread limit when ``threads`` is set."""
        self.con = duckdb.connect()
        if not threads:
            return
        print_msg(f'Limiting threads to {threads}')
        self.con.execute(f"SET threads={threads}")

    def generate(self):
        """Write ``wide_table.csv``: 500 lineitem rows plus 300 extra ``l_shipdate`` columns.

        The file is created in the current working directory and is not
        removed afterwards.
        """
        self.con.execute("call dbgen(sf=0.1)")
        new_table = "*, " + ", ".join(["l_shipdate"] * 300)
        self.con.execute(f"create table wide as select {new_table} from lineitem limit 500")
        self.con.execute("copy wide to 'wide_table.csv' (FORMAT CSV)")

    def benchmark(self, benchmark_name) -> BenchmarkResult:
        """Time ``nruns`` batches of 30 ``select * from pandas_df`` queries.

        Args:
            benchmark_name: Name given to the returned result.

        Returns:
            A ``BenchmarkResult`` with one duration (in seconds) per batch.
        """
        result = BenchmarkResult(benchmark_name)
        for _ in range(nruns):
            # Reading the CSV is deliberately outside the timed region.
            # The SQL below refers to this variable by name, so it must keep
            # the name `pandas_df` and stay in this function's scope.
            pandas_df = pd.read_csv('wide_table.csv')
            # perf_counter() is the clock intended for measuring intervals;
            # time.time() can jump when the system clock is adjusted.
            start = time.perf_counter()
            for _repeat in range(30):
                res = self.con.execute("""select * from pandas_df""").df()
            duration = time.perf_counter() - start
            del res
            result.add(duration)
        return result


class PandasAnalyzerBenchmark:
    """Times querying an object-dtype DataFrame that is almost entirely ``None``."""

    def __init__(self):
        self.initialize_connection()
        self.generate()

    def initialize_connection(self):
        """Open a new connection; apply the thread limit when ``threads`` is set."""
        self.con = duckdb.connect()
        if not threads:
            return
        print_msg(f'Limiting threads to {threads}')
        self.con.execute(f"SET threads={threads}")

    def generate(self):
        """No-op: the input DataFrame is built inside ``benchmark``."""
        return

    def benchmark(self, benchmark_name) -> BenchmarkResult:
        """Time ``nruns`` batches of 30 ``select * from pandas_df`` queries.

        Args:
            benchmark_name: Name given to the returned result.

        Returns:
            A ``BenchmarkResult`` with one duration (in seconds) per batch.
        """
        result = BenchmarkResult(benchmark_name)
        data = [None] * 9999999 + [1]  # Last element is 1, others are None

        # Create the DataFrame with the specified data and column type as object.
        # The SQL below refers to this variable by name, so it must keep the
        # name `pandas_df` and stay in this function's scope.
        pandas_df = pd.DataFrame(data, columns=['Column'], dtype=object)
        for _ in range(nruns):
            # perf_counter() is the clock intended for measuring intervals;
            # time.time() can jump when the system clock is adjusted.
            start = time.perf_counter()
            for _repeat in range(30):
                res = self.con.execute("""select * from pandas_df""").df()
            duration = time.perf_counter() - start
            del res
            result.add(duration)
        return result


def test_arrow_dictionaries_scan():
    """Benchmark the Arrow dictionary scan for 2, 1000 and 26000 distinct values.

    One shared ``ArrowDictionary`` is built up front; every configuration
    scans a 10,000,000-row column and writes its result.
    """
    DATASET_SIZE = 10000000
    DICT_SIZE = 26 * 1000
    print_msg(f"Generating a unique dictionary of size {DICT_SIZE}")
    arrow_dict = ArrowDictionary(DICT_SIZE)
    for unique_values in (2, 1000, DICT_SIZE):
        bench = ArrowDictionaryBenchmark(unique_values, DATASET_SIZE, arrow_dict)
        outcome = bench.benchmark(f"arrow_dict_unique_{unique_values}_total_{DATASET_SIZE}")
        outcome.write()


def test_loading_pandas_df_many_times():
    """Run ``PandasDFLoadBenchmark`` and write its result."""
    bench = PandasDFLoadBenchmark()
    outcome = bench.benchmark("load_pandas_df_many_times")
    outcome.write()


def test_pandas_analyze():
    """Run ``PandasAnalyzerBenchmark`` and write its result."""
    bench = PandasAnalyzerBenchmark()
    outcome = bench.benchmark("pandas_analyze")
    outcome.write()


def test_call_and_select_statements():
    """Run ``SelectAndCallBenchmark`` for each query template and write all results."""
    bench = SelectAndCallBenchmark()
    # Maps a label to a template; `{rows}` is filled in by the benchmark.
    queries = {
        'repeat_row': "repeat_row(42, 'test', True, 'this is a long string', num_rows={rows})",
    }
    for label, template in queries.items():
        for outcome in bench.benchmark(label, template):
            outcome.write()


def main():
    """Run every benchmark in order, then close the results file.

    Any exception raised by a benchmark still propagates to the caller.
    """
    try:
        test_tpch()
        test_arrow_dictionaries_scan()
        test_loading_pandas_df_many_times()
        test_pandas_analyze()
        test_call_and_select_statements()
    finally:
        # Close the results file even when a benchmark raises, so results
        # already written are not left in an unclosed handle.
        close_result()


# Script entry point: call main() only when this file is executed directly.
if __name__ == '__main__':
    main()