should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions
--- a/external/duckdb/scripts/regression/test_runner.py
+++ b/external/duckdb/scripts/regression/test_runner.py
@@ -0,0 +1,227 @@
+import os
+import math
+import functools
+import shutil
+from benchmark import BenchmarkRunner, BenchmarkRunnerConfig
+from dataclasses import dataclass
+from typing import Optional, List, Union
+import subprocess
+
+print = functools.partial(print, flush=True)
+
+
+def is_number(s):
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+
+
+# Geometric mean of an array of numbers
+def geomean(xs):
+    if len(xs) == 0:
+        return 'EMPTY'
+    for entry in xs:
+        if not is_number(entry):
+            return entry
+    return math.exp(math.fsum(math.log(float(x)) for x in xs) / len(xs))
+
+
+import argparse
+
+# Set up the argument parser
+parser = argparse.ArgumentParser(description="Benchmark script with old and new runners.")
+
+# Define the arguments
+parser.add_argument("--old", type=str, help="Path to the old runner.", required=True)
+parser.add_argument("--new", type=str, help="Path to the new runner.", required=True)
+parser.add_argument("--benchmarks", type=str, help="Path to the benchmark file.", required=True)
+parser.add_argument("--verbose", action="store_true", help="Enable verbose output.")
+parser.add_argument("--threads", type=int, help="Number of threads to use.")
+parser.add_argument("--memory_limit", type=str, help="Memory limit to use.")
+parser.add_argument("--nofail", action="store_true", help="Do not fail on regression.")
+parser.add_argument("--disable-timeout", action="store_true", help="Disable timeout.")
+parser.add_argument("--max-timeout", type=int, default=3600, help="Set maximum timeout in seconds (default: 3600).")
+parser.add_argument("--root-dir", type=str, default="", help="Root directory.")
+parser.add_argument("--no-summary", type=str, default=False, help="No summary in the end.")
+parser.add_argument(
+    "--regression-threshold-seconds",
+    type=float,
+    default=0.05,
+    help="REGRESSION_THRESHOLD_SECONDS value for large benchmarks.",
+)
+
+# Parse the arguments
+args = parser.parse_args()
+
+# Assign parsed arguments to variables
+old_runner_path = args.old
+new_runner_path = args.new
+benchmark_file = args.benchmarks
+verbose = args.verbose
+threads = args.threads
+memory_limit = args.memory_limit
+no_regression_fail = args.nofail
+disable_timeout = args.disable_timeout
+max_timeout = args.max_timeout
+root_dir = args.root_dir
+no_summary = args.no_summary
+regression_threshold_seconds = args.regression_threshold_seconds
+
+
+# how many times we will run the experiment, to be sure of the regression
+NUMBER_REPETITIONS = 5
+# the threshold at which we consider something a regression (percentage)
+REGRESSION_THRESHOLD_PERCENTAGE = 0.1
+# minimal seconds diff for something to be a regression (for very fast benchmarks)
+REGRESSION_THRESHOLD_SECONDS = regression_threshold_seconds
+
+if not os.path.isfile(old_runner_path):
+    print(f"Failed to find old runner {old_runner_path}")
+    exit(1)
+
+if not os.path.isfile(new_runner_path):
+    print(f"Failed to find new runner {new_runner_path}")
+    exit(1)
+
+config_dict = vars(args)
+old_runner = BenchmarkRunner(BenchmarkRunnerConfig.from_params(old_runner_path, benchmark_file, **config_dict))
+new_runner = BenchmarkRunner(BenchmarkRunnerConfig.from_params(new_runner_path, benchmark_file, **config_dict))
+
+benchmark_list = old_runner.benchmark_list
+
+summary = []
+
+
+@dataclass
+class BenchmarkResult:
+    benchmark: str
+    old_result: Union[float, str]
+    new_result: Union[float, str]
+    old_failure: Optional[str] = None
+    new_failure: Optional[str] = None
+
+
+multiply_percentage = 1.0 + REGRESSION_THRESHOLD_PERCENTAGE
+other_results: List[BenchmarkResult] = []
+error_list: List[BenchmarkResult] = []
+for i in range(NUMBER_REPETITIONS):
+    regression_list: List[BenchmarkResult] = []
+    if len(benchmark_list) == 0:
+        break
+    print(
+        f'''====================================================
+==============      ITERATION {i}        =============
+==============      REMAINING {len(benchmark_list)}        =============
+====================================================
+'''
+    )
+
+    old_results, old_failures = old_runner.run_benchmarks(benchmark_list)
+    new_results, new_failures = new_runner.run_benchmarks(benchmark_list)
+
+    for benchmark in benchmark_list:
+        old_res = old_results[benchmark]
+        new_res = new_results[benchmark]
+
+        old_fail = old_failures[benchmark]
+        new_fail = new_failures[benchmark]
+
+        if isinstance(old_res, str) or isinstance(new_res, str):
+            # benchmark failed to run - always a regression
+            error_list.append(BenchmarkResult(benchmark, old_res, new_res, old_fail, new_fail))
+        elif (no_regression_fail == False) and (
+            (old_res + REGRESSION_THRESHOLD_SECONDS) * multiply_percentage < new_res
+        ):
+            regression_list.append(BenchmarkResult(benchmark, old_res, new_res))
+        else:
+            other_results.append(BenchmarkResult(benchmark, old_res, new_res))
+    benchmark_list = [res.benchmark for res in regression_list]
+
+exit_code = 0
+regression_list.extend(error_list)
+summary = []
+if len(regression_list) > 0:
+    exit_code = 1
+    print(
+        '''====================================================
+==============  REGRESSIONS DETECTED   =============
+====================================================
+'''
+    )
+    for regression in regression_list:
+        print(f"{regression.benchmark}")
+        print(f"Old timing: {regression.old_result}")
+        print(f"New timing: {regression.new_result}")
+        if regression.old_failure or regression.new_failure:
+            new_data = {
+                "benchmark": regression.benchmark,
+                "old_failure": regression.old_failure,
+                "new_failure": regression.new_failure,
+            }
+            summary.append(new_data)
+        print("")
+    print(
+        '''====================================================
+==============     OTHER TIMINGS       =============
+====================================================
+'''
+    )
+else:
+    print(
+        '''====================================================
+============== NO REGRESSIONS DETECTED  =============
+====================================================
+'''
+    )
+
+other_results.sort(key=lambda x: x.benchmark)
+for res in other_results:
+    print(f"{res.benchmark}")
+    print(f"Old timing: {res.old_result}")
+    print(f"New timing: {res.new_result}")
+    print("")
+
+time_a = geomean(old_runner.complete_timings)
+time_b = geomean(new_runner.complete_timings)
+
+
+print("")
+if isinstance(time_a, str) or isinstance(time_b, str):
+    print(f"Old: {time_a}")
+    print(f"New: {time_b}")
+elif time_a > time_b * 1.01:
+    print(f"Old timing geometric mean: {time_a}")
+    print(f"New timing geometric mean: {time_b}, roughly {int((time_a - time_b) * 100.0 / time_a)}% faster")
+elif time_b > time_a * 1.01:
+    print(f"Old timing geometric mean: {time_a}, roughly {int((time_b - time_a) * 100.0 / time_b)}% faster")
+    print(f"New timing geometric mean: {time_b}")
+else:
+    print(f"Old timing geometric mean: {time_a}")
+    print(f"New timing geometric mean: {time_b}")
+
+# nuke cached benchmark data between runs
+if os.path.isdir("duckdb_benchmark_data"):
+    shutil.rmtree('duckdb_benchmark_data')
+
+if summary and not no_summary:
+    print(
+        '''\n\n====================================================
+================  FAILURES SUMMARY  ================
+====================================================
+'''
+    )
+    # check the value is "true" otherwise you'll see the prefix in local run outputs
+    prefix = "::error::" if ('CI' in os.environ and os.getenv('CI') == 'true') else ""
+    for i, failure_message in enumerate(summary, start=1):
+        prefix_str = f"{prefix}{i}" if len(prefix) > 0 else f"{i}"
+        print(f"{prefix_str}: ", failure_message["benchmark"])
+        if failure_message["old_failure"] != failure_message["new_failure"]:
+            print("Old:\n", failure_message["old_failure"])
+            print("New:\n", failure_message["new_failure"])
+        else:
+            print(failure_message["old_failure"])
+        print("-", 52)
+
+exit(exit_code)