# Taken from https://github.com/opengeospatial/geoparquet/tree/main/test_data
|
|
|
|
"""
Generates example data using pyarrow by running `python generate_test_data.py`.

You can print the metadata of a generated file with:

.. code-block:: python

    >>> import json, pprint, pyarrow.parquet as pq
    >>> pprint.pprint(json.loads(pq.read_schema("data-point-encoding_wkb.parquet").metadata[b"geo"]))
"""
|
|
|
|
import json
|
|
import pathlib
|
|
import copy
|
|
|
|
import numpy as np
|
|
import pyarrow as pa
|
|
import pyarrow.parquet as pq
|
|
from pyarrow.csv import write_csv
|
|
|
|
from shapely import from_wkt, to_wkb
|
|
|
|
|
|
# All generated files are written next to this script.
HERE = pathlib.Path(__file__).parent


# Baseline GeoParquet "geo" metadata. write_encoding_files() deep-copies it,
# fills in "geometry_types", and overrides "encoding" for the native files.
metadata_template = {
    "version": "1.1.0",
    "primary_column": "geometry",
    "columns": {
        "geometry": {
            "encoding": "WKB",
            "geometry_types": [],
        },
    },
}
|
|
|
|
|
|
## Various geometry types with WKB and native (GeoArrow-based) encodings
|
|
|
|
|
|
def write_encoding_files(geometries_wkt, geometries_geoarrow, geometry_type):
    """Write the same geometries as a WKT CSV, a WKB Parquet and a native Parquet file.

    Three files are created in ``HERE``, named after the lower-cased
    ``geometry_type``:

    * ``data-<type>-wkt.csv`` -- the raw WKT strings,
    * ``data-<type>-encoding_wkb.parquet`` -- WKB produced with shapely's
      ``from_wkt`` / ``to_wkb``,
    * ``data-<type>-encoding_native.parquet`` -- ``geometries_geoarrow`` as given.

    Every table has a running integer column ``col`` and a ``geometry`` column.
    Both Parquet files carry a ``geo`` schema-metadata entry built from
    ``metadata_template``.

    Parameters
    ----------
    geometries_wkt
        Sequence of WKT strings (the callers in this file also pass ``None``
        entries for null rows).
    geometries_geoarrow
        Geometry column for the native-encoded file; it must have one entry
        per element of ``geometries_wkt``.
    geometry_type
        Geometry type name such as ``"Point"``. It is stored verbatim in
        ``geometry_types``; its lower-cased form is used in the file names and
        as the ``encoding`` value of the native file.
    """
    type_slug = geometry_type.lower()
    row_ids = range(len(geometries_wkt))

    # Plain-text reference copy of the input.
    wkt_table = pa.table({"col": row_ids, "geometry": geometries_wkt})
    write_csv(wkt_table, HERE / f"data-{type_slug}-wkt.csv")

    # Work on a private copy so the module-level template is never modified.
    geo_metadata = copy.deepcopy(metadata_template)
    column_metadata = geo_metadata["columns"]["geometry"]
    column_metadata["geometry_types"] = [geometry_type]

    # WKB encoding (the template already declares encoding "WKB").
    wkb_table = pa.table({"col": row_ids, "geometry": to_wkb(from_wkt(geometries_wkt))})
    wkb_table = wkb_table.replace_schema_metadata({"geo": json.dumps(geo_metadata)})
    pq.write_table(wkb_table, HERE / f"data-{type_slug}-encoding_wkb.parquet")

    # Native (geoarrow) encoding: same metadata, only "encoding" differs.
    column_metadata["encoding"] = type_slug
    native_table = pa.table({"col": row_ids, "geometry": geometries_geoarrow})
    native_table = native_table.replace_schema_metadata({"geo": json.dumps(geo_metadata)})
    pq.write_table(native_table, HERE / f"data-{type_slug}-encoding_native.parquet")
|
|
|
|
|
|
# point

geometries_wkt = [
    "POINT (30 10)",
    "POINT EMPTY",
    None,
    "POINT (40 40)",
]

# A native point is a struct of non-nullable x/y doubles.
point_type = pa.struct(
    [
        pa.field("x", pa.float64(), nullable=False),
        pa.field("y", pa.float64(), nullable=False),
    ]
)
# Rows line up with geometries_wkt: "POINT EMPTY" is written as (NaN, NaN).
# The None row is nulled through the mask (its NaN pair is only a placeholder);
# the mask is derived from the WKT list so the two cannot drift apart.
geometries = pa.array(
    [(30, 10), (float("nan"), float("nan")), (float("nan"), float("nan")), (40, 40)],
    mask=np.array([wkt is None for wkt in geometries_wkt]),
    type=point_type,
)

write_encoding_files(geometries_wkt, geometries, geometry_type="Point")
|
|
|
|
# linestring

geometries_wkt = ["LINESTRING (30 10, 10 30, 40 40)", "LINESTRING EMPTY", None]

# A native linestring is a list of non-nullable point structs.
linestring_type = pa.list_(pa.field("vertices", point_type, nullable=False))
# Rows line up with geometries_wkt: the EMPTY geometry is an empty list.
# The None row is nulled through the mask (its [] is only a placeholder);
# the mask is derived from the WKT list so the two cannot drift apart.
geometries = pa.array(
    [[(30, 10), (10, 30), (40, 40)], [], []],
    mask=np.array([wkt is None for wkt in geometries_wkt]),
    type=linestring_type,
)

write_encoding_files(geometries_wkt, geometries, geometry_type="LineString")
|
|
|
|
# polygon

geometries_wkt = [
    "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
    "POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))",
    "POLYGON EMPTY",
    None,
]

# A native polygon is a list of rings, each ring a list of point structs.
polygon_type = pa.list_(
    pa.field(
        "rings",
        pa.list_(pa.field("vertices", point_type, nullable=False)),
        nullable=False,
    )
)
# Rows line up with geometries_wkt: the EMPTY geometry is an empty list.
# The None row is nulled through the mask (its [] is only a placeholder);
# the mask is derived from the WKT list so the two cannot drift apart.
geometries = pa.array(
    [
        [[(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)]],
        [[(35, 10), (45, 45), (15, 40), (10, 20), (35, 10)], [(20, 30), (35, 35), (30, 20), (20, 30)]],
        [],
        [],
    ],
    mask=np.array([wkt is None for wkt in geometries_wkt]),
    type=polygon_type,
)

write_encoding_files(geometries_wkt, geometries, geometry_type="Polygon")
|
|
|
|
# multipoint

geometries_wkt = [
    "MULTIPOINT ((30 10))",
    "MULTIPOINT ((10 40), (40 30), (20 20), (30 10))",
    "MULTIPOINT EMPTY",
    None,
]

# A native multipoint is a list of non-nullable point structs.
multipoint_type = pa.list_(pa.field("points", point_type, nullable=False))
# Rows line up with geometries_wkt: the EMPTY geometry is an empty list.
# The None row is nulled through the mask (its [] is only a placeholder);
# the mask is derived from the WKT list so the two cannot drift apart.
geometries = pa.array(
    [
        [(30, 10)],
        [(10, 40), (40, 30), (20, 20), (30, 10)],
        [],
        [],
    ],
    mask=np.array([wkt is None for wkt in geometries_wkt]),
    type=multipoint_type,
)

write_encoding_files(geometries_wkt, geometries, geometry_type="MultiPoint")
|
|
|
|
# multilinestring

geometries_wkt = [
    "MULTILINESTRING ((30 10, 10 30, 40 40))",
    "MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))",
    "MULTILINESTRING EMPTY",
    None,
]

# A native multilinestring is a list of non-nullable linestrings.
multilinestring_type = pa.list_(pa.field("linestrings", linestring_type, nullable=False))
# Rows line up with geometries_wkt: the EMPTY geometry is an empty list.
# The None row is nulled through the mask (its [] is only a placeholder);
# the mask is derived from the WKT list so the two cannot drift apart.
geometries = pa.array(
    [
        [[(30, 10), (10, 30), (40, 40)]],
        [[(10, 10), (20, 20), (10, 40)], [(40, 40), (30, 30), (40, 20), (30, 10)]],
        [],
        [],
    ],
    mask=np.array([wkt is None for wkt in geometries_wkt]),
    type=multilinestring_type,
)

write_encoding_files(geometries_wkt, geometries, geometry_type="MultiLineString")
|
|
|
|
# multipolygon

geometries_wkt = [
    "MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10)))",
    "MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))",
    "MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))",
    "MULTIPOLYGON EMPTY",
    None,
]

# A native multipolygon is a list of non-nullable polygons.
multipolygon_type = pa.list_(pa.field("polygons", polygon_type, nullable=False))
# Rows line up with geometries_wkt: the EMPTY geometry is an empty list.
# The None row is nulled through the mask (its [] is only a placeholder);
# the mask is derived from the WKT list so the two cannot drift apart.
geometries = pa.array(
    [
        [[[(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)]]],
        [[[(30, 20), (45, 40), (10, 40), (30, 20)]], [[(15, 5), (40, 10), (10, 20), (5, 10), (15, 5)]]],
        [
            [[(40, 40), (20, 45), (45, 30), (40, 40)]],
            [[(20, 35), (10, 30), (10, 10), (30, 5), (45, 20), (20, 35)], [(30, 20), (20, 15), (20, 25), (30, 20)]],
        ],
        [],
        [],
    ],
    mask=np.array([wkt is None for wkt in geometries_wkt]),
    type=multipolygon_type,
)

write_encoding_files(geometries_wkt, geometries, geometry_type="MultiPolygon")
|