should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions
--- a/external/duckdb/data/parquet-testing/pyarrow-generate-parquet.py
+++ b/external/duckdb/data/parquet-testing/pyarrow-generate-parquet.py
@@ -0,0 +1,65 @@
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+import pyarrow.csv as csv
+from pathlib import Path
+
+
+def generate_parquet(data_dir: Path):
+    """Create every generated test file inside ``data_dir``."""
+    # Builders run in this fixed order: silly names, then byte-stream-split.
+    targets = (
+        (generate_silly_names, 'silly-names.parquet'),
+        (generate_byte_stream_split, 'byte_stream_split.parquet'),
+    )
+    for build, file_name in targets:
+        build(data_dir / file_name)
+
+
+def generate_silly_names(path: Path):
+    """Write a three-row Parquet file with unusual column names to ``path``.
+
+    The column names are an accented word, the empty string and an emoji.
+    """
+    columns = {
+        'önë': [1, 2, 3],
+        '': ['foo', 'bar', 'baz'],
+        '🦆': [True, False, True],
+    }
+    frame = pd.DataFrame(columns)
+    pq.write_table(pa.Table.from_pandas(frame), path)
+
+
+def generate_byte_stream_split(path: Path):
+    """Write float columns with ``use_byte_stream_split=True`` plus a CSV copy.
+
+    The table has 100 rows and three columns: ``floats`` (float32),
+    ``doubles`` (float64) and ``nullable_floats`` (float32, built with a
+    shuffled boolean mask). The same table is also written next to ``path``
+    with a ``.csv`` suffix, '|'-delimited, and its empty values are then
+    replaced with 'NULL'.
+    """
+    row_count = 100
+    gen = np.random.default_rng(0)
+
+    def draw():
+        # Each call consumes the seeded generator, so call order matters.
+        return gen.uniform(-100.0, 100.0, row_count)
+
+    float_col = pa.array(draw(), type=pa.float32())
+    double_col = pa.array(draw(), type=pa.float64())
+
+    # The first tenth of the flags is True; shuffling then scatters them.
+    # The array is handed to pyarrow as ``mask`` (True entries become nulls).
+    mask_flags = np.arange(row_count) < row_count // 10
+    gen.shuffle(mask_flags)
+    nullable_col = pa.array(draw(), type=pa.float32(), mask=mask_flags)
+
+    columns = [float_col, double_col, nullable_col]
+    column_names = ["floats", "doubles", "nullable_floats"]
+    table = pa.Table.from_arrays(columns, column_names)
+
+    writer_kwargs = {'use_dictionary': False, 'use_byte_stream_split': True}
+    with pq.ParquetWriter(path, table.schema, **writer_kwargs) as writer:
+        writer.write_table(table)
+
+    csv_path = path.with_suffix('.csv')
+    write_options = csv.WriteOptions(include_header=True, delimiter='|')
+    csv.write_csv(table, csv_path, write_options)
+    fix_csv_nulls(csv_path)
+
+
+def fix_csv_nulls(path: Path):
+    """ Replace empty values with 'NULL' in a '|'-delimited file, in place.
+
+    The file is read fully, then rewritten: every field that is exactly the
+    empty string becomes the literal text ``NULL``. Each output line ends
+    with a newline, including the last one.
+
+    The file is read and written as UTF-8 rather than the platform default
+    encoding, so non-ASCII content is handled the same on every platform.
+    Only the line terminator is removed before splitting, so fields that
+    consist of whitespace are kept as they are.
+    """
+    with open(path, 'r', encoding='utf-8') as f:
+        lines = f.readlines()
+
+    with open(path, 'w', encoding='utf-8') as f:
+        for line in lines:
+            # Drop only the newline; strip() would also eat field whitespace.
+            fields = line.rstrip("\n").split("|")
+            fixed = ["NULL" if val == "" else val for val in fields]
+            f.write("|".join(fixed) + "\n")
+
+
+if __name__ == '__main__':
+    # When run as a script, write the generated files next to this script.
+    data_dir = Path(__file__).parent
+    generate_parquet(data_dir)