should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

@@ -0,0 +1,193 @@
import subprocess
import statistics
from io import StringIO
import csv
from dataclasses import dataclass
import argparse
from typing import Optional, Union, Tuple, List
import functools
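# flush every print immediately so output is not buffered when run under CI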
print = functools.partial(print, flush=True)
STDERR_HEADER = '''====================================================
============== STDERR =============
====================================================
'''
STDOUT_HEADER = '''====================================================
============== STDOUT =============
====================================================
'''
# timeouts in seconds
MAX_TIMEOUT = 3600
DEFAULT_TIMEOUT = 600
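# note: even with --disable-timeout, run_benchmark still passes a timeout to
# subprocess.run; "disable" effectively raises the cap to MAX_TIMEOUT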
@dataclass
class BenchmarkRunnerConfig:
"Configuration for a BenchmarkRunner"
benchmark_runner: str
benchmark_file: str
verbose: bool = False
threads: Optional[int] = None
memory_limit: Optional[str] = None
disable_timeout: bool = False
max_timeout: int = MAX_TIMEOUT
root_dir: str = ""
no_summary: bool = False
    @classmethod
    def from_params(cls, benchmark_runner, benchmark_file, **kwargs) -> "BenchmarkRunnerConfig":
        verbose = kwargs.get("verbose", False)
        threads = kwargs.get("threads", None)
        memory_limit = kwargs.get("memory_limit", None)
        disable_timeout = kwargs.get("disable_timeout", False)
        max_timeout = kwargs.get("max_timeout", MAX_TIMEOUT)
        root_dir = kwargs.get("root_dir", "")
        no_summary = kwargs.get("no_summary", False)
        config = cls(
            benchmark_runner=benchmark_runner,
            benchmark_file=benchmark_file,
            verbose=verbose,
            threads=threads,
            memory_limit=memory_limit,
            disable_timeout=disable_timeout,
            max_timeout=max_timeout,
            root_dir=root_dir,
            no_summary=no_summary,
        )
        return config
    @classmethod
    def from_args(cls) -> "BenchmarkRunnerConfig":
        parser = argparse.ArgumentParser(description="Benchmark script with old and new runners.")
        # Define the arguments
        parser.add_argument("--path", type=str, help="Path to the benchmark_runner executable.", required=True)
        parser.add_argument("--benchmarks", type=str, help="Path to the benchmark file.", required=True)
        parser.add_argument("--verbose", action="store_true", help="Enable verbose output.")
        parser.add_argument("--threads", type=int, help="Number of threads to use.")
        parser.add_argument("--memory_limit", type=str, help="Memory limit to use.")
        parser.add_argument("--disable-timeout", action="store_true", help="Disable timeout.")
        parser.add_argument(
            "--max-timeout", type=int, default=MAX_TIMEOUT, help=f"Set maximum timeout in seconds (default: {MAX_TIMEOUT})."
        )
        parser.add_argument("--root-dir", type=str, default="", help="Root directory.")
        parser.add_argument(
            "--no-summary", action="store_true", help="Do not output the failures summary."
        )
        # Parse arguments
        parsed_args = parser.parse_args()
        # Create an instance of BenchmarkRunnerConfig using parsed arguments
        config = cls(
            benchmark_runner=parsed_args.path,
            benchmark_file=parsed_args.benchmarks,
            verbose=parsed_args.verbose,
            threads=parsed_args.threads,
            memory_limit=parsed_args.memory_limit,
            disable_timeout=parsed_args.disable_timeout,
            max_timeout=parsed_args.max_timeout,
            root_dir=parsed_args.root_dir,
            no_summary=parsed_args.no_summary,
        )
        return config
class BenchmarkRunner:
    def __init__(self, config: BenchmarkRunnerConfig):
        self.config = config
        self.complete_timings = []
        self.benchmark_list: List[str] = []
        with open(self.config.benchmark_file, 'r') as f:
            self.benchmark_list = [x.strip() for x in f.read().splitlines() if len(x.strip()) > 0]
    def construct_args(self, benchmark_path):
        benchmark_args = [self.config.benchmark_runner, benchmark_path]
        if self.config.root_dir:
            benchmark_args.extend(['--root-dir', self.config.root_dir])
        if self.config.threads:
            benchmark_args.append(f"--threads={self.config.threads}")
        if self.config.memory_limit:
            benchmark_args.append(f"--memory_limit={self.config.memory_limit}")
        if self.config.disable_timeout:
            benchmark_args.append("--disable-timeout")
        if self.config.no_summary:
            benchmark_args.append("--no-summary")
        return benchmark_args
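    # e.g. with threads=8 and a 4GB memory limit, the result looks like
    #   ['build/benchmark_runner', 'benchmark/micro/sum.benchmark', '--threads=8', '--memory_limit=4GB']
    # (both paths here are illustrative, not taken from this repository)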
    def run_benchmark(self, benchmark) -> Tuple[Union[float, str], Optional[str]]:
        benchmark_args = self.construct_args(benchmark)
        timeout_seconds = DEFAULT_TIMEOUT
        if self.config.disable_timeout:
            timeout_seconds = self.config.max_timeout
        try:
            proc = subprocess.run(
                benchmark_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout_seconds
            )
            out = proc.stdout.decode('utf8')
            err = proc.stderr.decode('utf8')
            returncode = proc.returncode
        except subprocess.TimeoutExpired:
            print("Failed to run benchmark " + benchmark)
            print(f"Aborted due to exceeding the limit of {timeout_seconds} seconds")
            return (
                'Failed to run benchmark ' + benchmark,
                f"Aborted due to exceeding the limit of {timeout_seconds} seconds",
            )
        if returncode != 0:
            print("Failed to run benchmark " + benchmark)
            print(STDERR_HEADER)
            print(err)
            print(STDOUT_HEADER)
            print(out)
            if 'HTTP' in err:
                print("Ignoring HTTP error and terminating the running of the regression tests")
                exit(0)
            return 'Failed to run benchmark ' + benchmark, err
        if self.config.verbose:
            print(err)
        # the runner writes its timings to stderr as tab-separated rows, with the
        # run timing in the third column
        f = StringIO(err)
        csv_reader = csv.reader(f, delimiter='\t')
        header = True
        timings = []
        try:
            for row in csv_reader:
                if len(row) == 0:
                    continue
                if header:
                    header = False
                else:
                    # convert to float here so that median() compares numbers, not strings
                    timings.append(float(row[2]))
                    self.complete_timings.append(float(row[2]))
            return float(statistics.median(timings)), None
        except Exception:
            print("Failed to run benchmark " + benchmark)
            print(err)
            return 'Failed to run benchmark ' + benchmark, err
    def run_benchmarks(self, benchmark_list: List[str]):
        results = {}
        failures = {}
        for benchmark in benchmark_list:
            result, failure_message = self.run_benchmark(benchmark)
            results[benchmark] = result
            failures[benchmark] = failure_message if failure_message else None
        return results, failures
def main():
    config = BenchmarkRunnerConfig.from_args()
    runner = BenchmarkRunner(config)
    # run_benchmarks takes the benchmark list explicitly so callers can pass a subset
    runner.run_benchmarks(runner.benchmark_list)

if __name__ == "__main__":
    main()
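# Example invocation (paths are illustrative, not taken from this repository):
#   python benchmark.py --path build/release/benchmark/benchmark_runner \
#       --benchmarks .github/regression/micro.csv --threads 4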

@@ -0,0 +1,227 @@
import os
import math
import functools
import shutil
import argparse
from benchmark import BenchmarkRunner, BenchmarkRunnerConfig
from dataclasses import dataclass
from typing import Optional, List, Union

# flush every print immediately so output is not buffered when run under CI
print = functools.partial(print, flush=True)
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False
# Geometric mean of an array of numbers
def geomean(xs):
    if len(xs) == 0:
        return 'EMPTY'
    for entry in xs:
        if not is_number(entry):
            return entry
    return math.exp(math.fsum(math.log(float(x)) for x in xs) / len(xs))
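# e.g. geomean([2, 8]) == exp((log(2) + log(8)) / 2) == 4.0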
# Set up the argument parser
parser = argparse.ArgumentParser(description="Benchmark script with old and new runners.")
# Define the arguments
parser.add_argument("--old", type=str, help="Path to the old runner.", required=True)
parser.add_argument("--new", type=str, help="Path to the new runner.", required=True)
parser.add_argument("--benchmarks", type=str, help="Path to the benchmark file.", required=True)
parser.add_argument("--verbose", action="store_true", help="Enable verbose output.")
parser.add_argument("--threads", type=int, help="Number of threads to use.")
parser.add_argument("--memory_limit", type=str, help="Memory limit to use.")
parser.add_argument("--nofail", action="store_true", help="Do not fail on regression.")
parser.add_argument("--disable-timeout", action="store_true", help="Disable timeout.")
parser.add_argument("--max-timeout", type=int, default=3600, help="Set maximum timeout in seconds (default: 3600).")
parser.add_argument("--root-dir", type=str, default="", help="Root directory.")
parser.add_argument("--no-summary", type=str, default=False, help="No summary in the end.")
parser.add_argument(
"--regression-threshold-seconds",
type=float,
default=0.05,
help="REGRESSION_THRESHOLD_SECONDS value for large benchmarks.",
)
# Parse the arguments
args = parser.parse_args()
# Assign parsed arguments to variables
old_runner_path = args.old
new_runner_path = args.new
benchmark_file = args.benchmarks
verbose = args.verbose
threads = args.threads
memory_limit = args.memory_limit
no_regression_fail = args.nofail
disable_timeout = args.disable_timeout
max_timeout = args.max_timeout
root_dir = args.root_dir
no_summary = args.no_summary
regression_threshold_seconds = args.regression_threshold_seconds
# how many times we will run the experiment, to be sure of the regression
NUMBER_REPETITIONS = 5
# the threshold at which we consider something a regression (percentage)
REGRESSION_THRESHOLD_PERCENTAGE = 0.1
# minimal seconds diff for something to be a regression (for very fast benchmarks)
REGRESSION_THRESHOLD_SECONDS = regression_threshold_seconds
if not os.path.isfile(old_runner_path):
    print(f"Failed to find old runner {old_runner_path}")
    exit(1)
if not os.path.isfile(new_runner_path):
    print(f"Failed to find new runner {new_runner_path}")
    exit(1)
config_dict = vars(args)
old_runner = BenchmarkRunner(BenchmarkRunnerConfig.from_params(old_runner_path, benchmark_file, **config_dict))
new_runner = BenchmarkRunner(BenchmarkRunnerConfig.from_params(new_runner_path, benchmark_file, **config_dict))
benchmark_list = old_runner.benchmark_list
summary = []
@dataclass
class BenchmarkResult:
    benchmark: str
    old_result: Union[float, str]
    new_result: Union[float, str]
    old_failure: Optional[str] = None
    new_failure: Optional[str] = None
multiply_percentage = 1.0 + REGRESSION_THRESHOLD_PERCENTAGE
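# a benchmark must exceed both thresholds to count as regressed: with the defaults,
# an old timing of 1.00s is only flagged when the new timing exceeds
# (1.00 + 0.05) * 1.1 = 1.155s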
other_results: List[BenchmarkResult] = []
error_list: List[BenchmarkResult] = []
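# repeat the comparison NUMBER_REPETITIONS times, re-running only the benchmarks that
# still look regressed, so a flaky timing gets several chances to clear itself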
for i in range(NUMBER_REPETITIONS):
    regression_list: List[BenchmarkResult] = []
    if len(benchmark_list) == 0:
        break
    print(
        f'''====================================================
============== ITERATION {i} =============
============== REMAINING {len(benchmark_list)} =============
====================================================
'''
    )
    old_results, old_failures = old_runner.run_benchmarks(benchmark_list)
    new_results, new_failures = new_runner.run_benchmarks(benchmark_list)
    for benchmark in benchmark_list:
        old_res = old_results[benchmark]
        new_res = new_results[benchmark]
        old_fail = old_failures[benchmark]
        new_fail = new_failures[benchmark]
        if isinstance(old_res, str) or isinstance(new_res, str):
            # benchmark failed to run - always a regression
            error_list.append(BenchmarkResult(benchmark, old_res, new_res, old_fail, new_fail))
        elif not no_regression_fail and (old_res + REGRESSION_THRESHOLD_SECONDS) * multiply_percentage < new_res:
            regression_list.append(BenchmarkResult(benchmark, old_res, new_res))
        else:
            other_results.append(BenchmarkResult(benchmark, old_res, new_res))
    # only the benchmarks flagged this iteration are re-run in the next one
    benchmark_list = [res.benchmark for res in regression_list]
exit_code = 0
regression_list.extend(error_list)
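# benchmarks that failed to run are reported alongside the timing regressions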
summary = []
if len(regression_list) > 0:
    exit_code = 1
    print(
        '''====================================================
============== REGRESSIONS DETECTED =============
====================================================
'''
    )
    for regression in regression_list:
        print(f"{regression.benchmark}")
        print(f"Old timing: {regression.old_result}")
        print(f"New timing: {regression.new_result}")
        if regression.old_failure or regression.new_failure:
            new_data = {
                "benchmark": regression.benchmark,
                "old_failure": regression.old_failure,
                "new_failure": regression.new_failure,
            }
            summary.append(new_data)
        print("")
    print(
        '''====================================================
============== OTHER TIMINGS =============
====================================================
'''
    )
else:
    print(
        '''====================================================
============== NO REGRESSIONS DETECTED =============
====================================================
'''
    )
other_results.sort(key=lambda x: x.benchmark)
for res in other_results:
    print(f"{res.benchmark}")
    print(f"Old timing: {res.old_result}")
    print(f"New timing: {res.new_result}")
    print("")
time_a = geomean(old_runner.complete_timings)
time_b = geomean(new_runner.complete_timings)
print("")
if isinstance(time_a, str) or isinstance(time_b, str):
    print(f"Old: {time_a}")
    print(f"New: {time_b}")
elif time_a > time_b * 1.01:
    print(f"Old timing geometric mean: {time_a}")
    print(f"New timing geometric mean: {time_b}, roughly {int((time_a - time_b) * 100.0 / time_a)}% faster")
elif time_b > time_a * 1.01:
    print(f"Old timing geometric mean: {time_a}, roughly {int((time_b - time_a) * 100.0 / time_b)}% faster")
    print(f"New timing geometric mean: {time_b}")
else:
    print(f"Old timing geometric mean: {time_a}")
    print(f"New timing geometric mean: {time_b}")
# nuke cached benchmark data between runs
if os.path.isdir("duckdb_benchmark_data"):
    shutil.rmtree('duckdb_benchmark_data')
if summary and not no_summary:
    print(
        '''\n\n====================================================
================ FAILURES SUMMARY ================
====================================================
'''
    )
    # check the value is "true", otherwise the prefix would show up in local run outputs
    prefix = "::error::" if ('CI' in os.environ and os.getenv('CI') == 'true') else ""
    for i, failure_message in enumerate(summary, start=1):
        print(f"{prefix}{i}: ", failure_message["benchmark"])
        if failure_message["old_failure"] != failure_message["new_failure"]:
            print("Old:\n", failure_message["old_failure"])
            print("New:\n", failure_message["new_failure"])
        else:
            print(failure_message["old_failure"])
        print("-" * 52)
exit(exit_code)