should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

View File

@@ -0,0 +1,65 @@
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv as csv
from pathlib import Path
def generate_parquet(data_dir: Path):
    """Generate every Parquet fixture of this script inside ``data_dir``.

    Args:
        data_dir: Directory that receives ``silly-names.parquet`` and
            ``byte_stream_split.parquet``.
    """
    silly_names_path = data_dir / 'silly-names.parquet'
    byte_stream_split_path = data_dir / 'byte_stream_split.parquet'

    generate_silly_names(silly_names_path)
    generate_byte_stream_split(byte_stream_split_path)
def generate_silly_names(path: Path):
    """Write a three-row Parquet file whose column names are unusual.

    The columns are named with accented letters, the empty string and an
    emoji, and hold integers, strings and booleans respectively.

    Args:
        path: Destination of the Parquet file.
    """
    columns = {
        'önë': [1, 2, 3],
        '': ['foo', 'bar', 'baz'],
        '🦆': [True, False, True],
    }
    frame = pd.DataFrame(columns)
    pq.write_table(pa.Table.from_pandas(frame), path)
def generate_byte_stream_split(path: Path):
    """Write a byte-stream-split Parquet file plus a '|'-delimited CSV copy.

    The table has 100 rows and three columns of uniform random values in
    [-100, 100) drawn from a generator seeded with 0: ``floats`` (float32),
    ``doubles`` (float64) and ``nullable_floats`` (float32, built with a
    shuffled mask in which 10 of the 100 entries are set).

    The Parquet file is written with dictionary encoding disabled and
    ``use_byte_stream_split`` enabled. The same table is then written next
    to it with a ``.csv`` suffix, including a header row, and that CSV is
    passed through ``fix_csv_nulls``.

    Args:
        path: Destination of the Parquet file; the CSV path is derived from it.
    """
    num_rows = 100
    low, high = -100.0, 100.0
    rng = np.random.default_rng(0)

    # The order of the calls on ``rng`` determines the generated values, so
    # the two plain columns are drawn before the mask is shuffled.
    float_col = pa.array(rng.uniform(low, high, num_rows), type=pa.float32())
    double_col = pa.array(rng.uniform(low, high, num_rows), type=pa.float64())

    # Start with the first tenth of the entries set, then scatter them.
    is_masked = np.zeros(num_rows, dtype=np.bool_)
    is_masked[:num_rows // 10] = True
    rng.shuffle(is_masked)
    nullable_col = pa.array(
        rng.uniform(low, high, num_rows), type=pa.float32(), mask=is_masked)

    table = pa.Table.from_arrays(
        [float_col, double_col, nullable_col],
        ["floats", "doubles", "nullable_floats"])

    with pq.ParquetWriter(
            path,
            table.schema,
            use_dictionary=False,
            use_byte_stream_split=True) as parquet_out:
        parquet_out.write_table(table)

    csv_path = path.with_suffix('.csv')
    write_options = csv.WriteOptions(include_header=True, delimiter='|')
    csv.write_csv(table, csv_path, write_options)
    fix_csv_nulls(csv_path)
def fix_csv_nulls(path: Path):
    """Rewrite a '|'-delimited file in place, replacing empty fields with 'NULL'.

    Every line (including the first one) is split on '|'; each field that is
    exactly the empty string is written back as the text ``NULL``. All other
    fields are written back unchanged, and every output line is terminated
    with a newline.

    The file is read and written explicitly as UTF-8 so the result does not
    depend on the platform's default encoding.

    Args:
        path: The delimited text file to rewrite.
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    with open(path, 'w', encoding='utf-8') as f:
        for line in lines:
            # Drop only the line terminator: a full strip() would also eat
            # whitespace that belongs to the first or last field.
            fields = line.rstrip("\n").split("|")
            f.write("|".join("NULL" if field == "" else field for field in fields) + "\n")
if __name__ == "__main__":
    # When run as a script, write the fixtures into this script's directory.
    data_dir = Path(__file__).parent
    generate_parquet(data_dir=data_dir)