should be it
external/duckdb/scripts/generate_tpcds_results.py (vendored, new file, 137 lines)
@@ -0,0 +1,137 @@
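# Generate TPC-DS reference results: use DuckDB's dsdgen to produce the data,
# load it into a local Postgres instance, run each query there, and export the
# results as pipe-delimited CSV answer files. A typical invocation (a sketch;
# exact paths depend on your checkout and build directory) might be:
#   python3 scripts/generate_tpcds_results.py --sf 1 --nthreads 4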
import psycopg2
import argparse
import os
import sys
import subprocess
import multiprocessing.pool

parser = argparse.ArgumentParser(description='Generate TPC-DS reference results from Postgres.')
parser.add_argument(
    '--sf', dest='sf', action='store', help='The TPC-DS scale factor to generate reference results for', default=1
)
parser.add_argument(
    '--query-dir',
    dest='query_dir',
    action='store',
    help='The directory containing the queries to run',
    default='extension/tpcds/dsdgen/queries',
)
parser.add_argument(
    '--answer-dir',
    dest='answer_dir',
    action='store',
    help='The directory in which to store the answers',
    default='extension/tpcds/dsdgen/answers/sf${SF}',
)
parser.add_argument(
    '--duckdb-path',
    dest='duckdb_path',
    action='store',
    help='The path to the DuckDB executable',
    default='build/reldebug/duckdb',
)
parser.add_argument(
    '--skip-load',
    dest='skip_load',
    action='store_const',
    const=True,
    help='Whether to skip generating and loading the data',
    default=False,
)
parser.add_argument(
    '--query-list', dest='query_list', action='store', help='The list of queries to run (default = all)', default=''
)
parser.add_argument('--nthreads', dest='nthreads', action='store', type=int, help='The number of threads', default=0)

args = parser.parse_args()
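
# NOTE: this assumes a Postgres server running locally, reachable by the
# current user with default credentials (database 'postgres')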
con = psycopg2.connect(database='postgres')
c = con.cursor()
if not args.skip_load:
    tpcds_dir = f'tpcds_sf{args.sf}'

    # generate the TPC-DS data with DuckDB and export it as pipe-delimited CSV
    q = f"""
    CALL dsdgen(sf={args.sf});
    EXPORT DATABASE '{tpcds_dir}' (DELIMITER '|');
    """
    proc = subprocess.Popen([args.duckdb_path, "-c", q])
    proc.wait()
    if proc.returncode != 0:
        sys.exit(1)

    # drop the previous tables
    tables = [
        'name',
        'web_site',
        'web_sales',
        'web_returns',
        'web_page',
        'warehouse',
        'time_dim',
        'store_sales',
        'store_returns',
        'store',
        'ship_mode',
        'reason',
        'promotion',
        'item',
        'inventory',
        'income_band',
        'household_demographics',
        'date_dim',
        'customer_demographics',
        'customer_address',
        'customer',
        'catalog_sales',
        'catalog_returns',
        'catalog_page',
        'call_center',
    ]
    for table in tables:
        c.execute(f'DROP TABLE IF EXISTS {table};')
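
    # recreate the schema and load the exported data into Postgres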
    with open(os.path.join(tpcds_dir, 'schema.sql'), 'r') as f:
        schema = f.read()

    c.execute(schema)

    with open(os.path.join(tpcds_dir, 'load.sql'), 'r') as f:
        load = f.read()

    # the exported load.sql refers to the data files by relative path, but
    # Postgres COPY needs absolute paths, so prefix the working directory
    load = load.replace(f'{tpcds_dir}/', f'{os.getcwd()}/{tpcds_dir}/')

    c.execute(load)

    con.commit()

# get a list of all queries
queries = os.listdir(args.query_dir)
queries.sort()

answer_dir = args.answer_dir.replace('${SF}', str(args.sf))

# optionally restrict the run to a comma-separated list of query names
if len(args.query_list) > 0:
    passing_queries = [x + '.sql' for x in args.query_list.split(',')]
    queries = [x for x in queries if x in passing_queries]
    queries.sort()


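# run a single query file against Postgres and export its result to answer_dir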
def run_query(q):
    print(q)
    with open(os.path.join(args.query_dir, q), 'r') as f:
        sql_query = f.read()
    answer_path = os.path.join(os.getcwd(), answer_dir, q.replace('.sql', '.csv'))
    # use a dedicated connection per call: psycopg2 cursors must not be shared
    # between threads, and run_query may be called from the thread pool below
    qcon = psycopg2.connect(database='postgres')
    qc = qcon.cursor()
    qc.execute(f'DROP TABLE IF EXISTS "query_result{q}"')
    qc.execute(f'CREATE TABLE "query_result{q}" AS ' + sql_query)
    qc.execute(f"COPY \"query_result{q}\" TO '{answer_path}' (FORMAT CSV, DELIMITER '|', HEADER, NULL 'NULL')")
    qcon.close()


if args.nthreads == 0:
    # run the queries sequentially
    for q in queries:
        run_query(q)
else:
    # run the queries over a pool of worker threads
    pool = multiprocessing.pool.ThreadPool(processes=args.nthreads)

    pool.map(run_query, queries)