Files
email-tracker/external/duckdb/data/geoparquet/generate_test_data.py
2025-10-24 19:21:19 -05:00

188 lines
5.5 KiB
Python

# Taken from https://github.com/opengeospatial/geoparquet/tree/main/test_data
"""
Generates example data using pyarrow by running `python generate_test_data.py`.
You can print the metadata with:
.. code-block:: python
>>> import json, pprint, pyarrow.parquet as pq
>>> pprint.pprint(json.loads(pq.read_schema("data-point-encoding_wkb.parquet").metadata[b"geo"]))
"""
import json
import pathlib
import copy
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.csv import write_csv
from shapely import from_wkt, to_wkb
# Directory containing this script; every generated file is written next to it.
HERE = pathlib.Path(__file__).parent

# Baseline "geo" metadata shared by all generated Parquet files.
# write_encoding_files() deep-copies this and then fills in "geometry_types"
# (and, for the native files, overrides "encoding").
metadata_template = {
    "version": "1.1.0",
    "primary_column": "geometry",
    "columns": {
        "geometry": {"encoding": "WKB", "geometry_types": []},
    },
}
## Various geometry types with WKB and native (GeoArrow-based) encodings
def write_encoding_files(geometries_wkt, geometries_geoarrow, geometry_type):
    """Write the WKT, WKB and native-encoded sample files for one geometry type.

    Three files are created in ``HERE``, named after the lower-cased
    ``geometry_type``:

    * ``data-<type>-wkt.csv`` -- the WKT values exactly as given.
    * ``data-<type>-encoding_wkb.parquet`` -- the WKT values converted with
      ``to_wkb(from_wkt(...))``.
    * ``data-<type>-encoding_native.parquet`` -- ``geometries_geoarrow`` as given.

    Every table has a ``col`` column holding the row index and a ``geometry``
    column. Both Parquet files carry a ``geo`` schema-metadata entry built from
    a deep copy of ``metadata_template``.

    Args:
        geometries_wkt: Sequence of WKT strings (``None`` for null rows).
        geometries_geoarrow: The same rows as a pyarrow array in the native
            layout for ``geometry_type``.
        geometry_type: Geometry type name, e.g. ``"Point"``. Stored verbatim in
            ``geometry_types``; its lower-cased form is used in the file names
            and as the ``encoding`` of the native file.
    """
    type_slug = geometry_type.lower()
    row_ids = range(len(geometries_wkt))

    def _save_parquet(geometry_column, geo_metadata, destination):
        # Build the two-column table, attach the serialized "geo" metadata, write it.
        tbl = pa.table({"col": row_ids, "geometry": geometry_column})
        tbl = tbl.replace_schema_metadata({"geo": json.dumps(geo_metadata)})
        pq.write_table(tbl, destination)

    # Plain WKT, written as CSV without any metadata.
    wkt_table = pa.table({"col": row_ids, "geometry": geometries_wkt})
    write_csv(wkt_table, HERE / f"data-{type_slug}-wkt.csv")

    # WKB encoding: the template already declares "WKB", so only the
    # geometry type list needs filling in.
    geo_metadata = copy.deepcopy(metadata_template)
    geo_metadata["columns"]["geometry"]["geometry_types"] = [geometry_type]
    _save_parquet(
        to_wkb(from_wkt(geometries_wkt)),
        geo_metadata,
        HERE / f"data-{type_slug}-encoding_wkb.parquet",
    )

    # Native (geoarrow) encoding: reuse the same metadata dict, switching the
    # encoding to the lower-cased geometry type name.
    geo_metadata["columns"]["geometry"]["encoding"] = type_slug
    _save_parquet(
        geometries_geoarrow,
        geo_metadata,
        HERE / f"data-{type_slug}-encoding_native.parquet",
    )
# point
geometries_wkt = ["POINT (30 10)", "POINT EMPTY", None, "POINT (40 40)"]
# Native point layout: a struct of two non-nullable float64 fields, x and y.
point_type = pa.struct(
    [
        pa.field("x", pa.float64(), nullable=False),
        pa.field("y", pa.float64(), nullable=False),
    ]
)
geometries = pa.array(
    [
        (30, 10),
        (float("nan"), float("nan")),  # POINT EMPTY
        (float("nan"), float("nan")),  # placeholder for the null row (masked below)
        (40, 40),
    ],
    # True marks the null row, i.e. the position where the WKT list holds None.
    mask=np.array([wkt is None for wkt in geometries_wkt]),
    type=point_type,
)
write_encoding_files(geometries_wkt, geometries, geometry_type="Point")
# linestring
geometries_wkt = [
    "LINESTRING (30 10, 10 30, 40 40)",
    "LINESTRING EMPTY",
    None,
]
# Native linestring layout: a list of non-nullable "vertices" of point_type.
linestring_type = pa.list_(
    pa.field("vertices", point_type, nullable=False),
)
geometries = pa.array(
    [
        [(30, 10), (10, 30), (40, 40)],
        [],  # LINESTRING EMPTY
        [],  # placeholder for the null row (masked below)
    ],
    # True marks the null row, i.e. the position where the WKT list holds None.
    mask=np.array([wkt is None for wkt in geometries_wkt]),
    type=linestring_type,
)
write_encoding_files(geometries_wkt, geometries, geometry_type="LineString")
# polygon
geometries_wkt = [
    "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
    "POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))",
    "POLYGON EMPTY",
    None,
]
# Native polygon layout: a list of non-nullable "rings", each ring being a
# list of non-nullable "vertices" of point_type.
polygon_type = pa.list_(
    pa.field(
        "rings",
        pa.list_(pa.field("vertices", point_type, nullable=False)),
        nullable=False,
    )
)
geometries = pa.array(
    [
        # one ring
        [[(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)]],
        # two rings
        [
            [(35, 10), (45, 45), (15, 40), (10, 20), (35, 10)],
            [(20, 30), (35, 35), (30, 20), (20, 30)],
        ],
        [],  # POLYGON EMPTY
        [],  # placeholder for the null row (masked below)
    ],
    # True marks the null row, i.e. the position where the WKT list holds None.
    mask=np.array([wkt is None for wkt in geometries_wkt]),
    type=polygon_type,
)
write_encoding_files(geometries_wkt, geometries, geometry_type="Polygon")
# multipoint
geometries_wkt = [
    "MULTIPOINT ((30 10))",
    "MULTIPOINT ((10 40), (40 30), (20 20), (30 10))",
    "MULTIPOINT EMPTY",
    None,
]
# Native multipoint layout: a list of non-nullable "points" of point_type.
multipoint_type = pa.list_(
    pa.field("points", point_type, nullable=False),
)
geometries = pa.array(
    [
        [(30, 10)],
        [(10, 40), (40, 30), (20, 20), (30, 10)],
        [],  # MULTIPOINT EMPTY
        [],  # placeholder for the null row (masked below)
    ],
    # True marks the null row, i.e. the position where the WKT list holds None.
    mask=np.array([wkt is None for wkt in geometries_wkt]),
    type=multipoint_type,
)
write_encoding_files(geometries_wkt, geometries, geometry_type="MultiPoint")
# multilinestring
geometries_wkt = [
    "MULTILINESTRING ((30 10, 10 30, 40 40))",
    "MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))",
    "MULTILINESTRING EMPTY",
    None,
]
# Native multilinestring layout: a list of non-nullable "linestrings" of
# linestring_type.
multilinestring_type = pa.list_(
    pa.field("linestrings", linestring_type, nullable=False),
)
geometries = pa.array(
    [
        # one linestring
        [[(30, 10), (10, 30), (40, 40)]],
        # two linestrings
        [
            [(10, 10), (20, 20), (10, 40)],
            [(40, 40), (30, 30), (40, 20), (30, 10)],
        ],
        [],  # MULTILINESTRING EMPTY
        [],  # placeholder for the null row (masked below)
    ],
    # True marks the null row, i.e. the position where the WKT list holds None.
    mask=np.array([wkt is None for wkt in geometries_wkt]),
    type=multilinestring_type,
)
write_encoding_files(geometries_wkt, geometries, geometry_type="MultiLineString")
# multipolygon
geometries_wkt = [
    "MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10)))",
    "MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))",
    "MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))",
    "MULTIPOLYGON EMPTY",
    None,
]
# Native multipolygon layout: a list of non-nullable "polygons" of polygon_type.
multipolygon_type = pa.list_(
    pa.field("polygons", polygon_type, nullable=False),
)
geometries = pa.array(
    [
        # one polygon with one ring
        [
            [[(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)]],
        ],
        # two polygons with one ring each
        [
            [[(30, 20), (45, 40), (10, 40), (30, 20)]],
            [[(15, 5), (40, 10), (10, 20), (5, 10), (15, 5)]],
        ],
        # two polygons; the second has two rings
        [
            [[(40, 40), (20, 45), (45, 30), (40, 40)]],
            [
                [(20, 35), (10, 30), (10, 10), (30, 5), (45, 20), (20, 35)],
                [(30, 20), (20, 15), (20, 25), (30, 20)],
            ],
        ],
        [],  # MULTIPOLYGON EMPTY
        [],  # placeholder for the null row (masked below)
    ],
    # True marks the null row, i.e. the position where the WKT list holds None.
    mask=np.array([wkt is None for wkt in geometries_wkt]),
    type=multipolygon_type,
)
write_encoding_files(geometries_wkt, geometries, geometry_type="MultiPolygon")