# name: test/sql/json/table/read_json.test
# description: Read json files straight to columnar data
# group: [table]

require json

statement ok
pragma enable_verification

statement error
SELECT * FROM read_json('data/json/example_n.ndjson', auto_detect=false)
----
Binder Error

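# read_json needs either auto-detection or an explicit column list; a working
# call (mirroring the tests below) supplies columns explicitly, e.g.:
#   SELECT * FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='newline_delimited')
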
# can't read ndjson with array
statement error
SELECT * FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='array')
----
Invalid Input Error: Expected top-level JSON array

# read_ndjson works
query II
SELECT * FROM read_ndjson('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'})
----
1	O Brother, Where Art Thou?
2	Home for the Holidays
3	The Firm
4	Broadcast News
5	Raising Arizona

# We can also read only one of the columns
query I
SELECT * FROM read_ndjson('data/json/example_n.ndjson', columns={id: 'INTEGER'})
----
1
2
3
4
5

query I
SELECT * FROM read_ndjson('data/json/example_n.ndjson', columns={name: 'VARCHAR'})
----
O Brother, Where Art Thou?
Home for the Holidays
The Firm
Broadcast News
Raising Arizona

# what about a broken JSON file
query II
SELECT * FROM read_ndjson('data/json/unterminated_quotes.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, ignore_errors=true)
----
1	O Brother, Where Art Thou?
2	Home for the Holidays
NULL	NULL
4	Broadcast News
5	Raising Arizona

# some of these values don't have "name"
query II
SELECT * FROM read_ndjson('data/json/different_schemas.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'})
----
1	O Brother, Where Art Thou?
2	NULL
3	The Firm
4	NULL
5	Raising Arizona

# test projection pushdown (unstructured json)
query I
SELECT id FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='unstructured')
----
1
2
3
4
5

query I
SELECT name FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='unstructured')
----
O Brother, Where Art Thou?
Home for the Holidays
The Firm
Broadcast News
Raising Arizona

# test projection pushdown (newline-delimited json)
query I
SELECT id FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='newline_delimited')
----
1
2
3
4
5

query I
SELECT name FROM read_ndjson('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='nd')
----
O Brother, Where Art Thou?
Home for the Holidays
The Firm
Broadcast News
Raising Arizona

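# note: format='nd' above is accepted as shorthand for format='newline_delimited'
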
# auto-detect
query II
SELECT * FROM read_json_auto('data/json/example_n.ndjson')
----
1	O Brother, Where Art Thou?
2	Home for the Holidays
3	The Firm
4	Broadcast News
5	Raising Arizona

query II
SELECT * FROM 'data/json/example_n.ndjson'
----
1	O Brother, Where Art Thou?
2	Home for the Holidays
3	The Firm
4	Broadcast News
5	Raising Arizona

# we can detect at varying levels, level 0 is just JSON
query I
SELECT * FROM read_json_auto('data/json/with_list.json', maximum_depth=0)
----
{"id":1,"name":["O","Brother,","Where","Art","Thou?"]}
{"id":2,"name":["Home","for","the","Holidays"]}
{"id":3,"name":["The","Firm"]}
{"id":4,"name":["Broadcast","News"]}
{"id":5,"name":["Raising","Arizona"]}

# at level 1 we get two JSON columns
query II
SELECT * FROM read_json_auto('data/json/with_list.json', maximum_depth=1)
----
1	["O","Brother,","Where","Art","Thou?"]
2	["Home","for","the","Holidays"]
3	["The","Firm"]
4	["Broadcast","News"]
5	["Raising","Arizona"]

# at level 2 we get BIGINT and JSON[]
query II
SELECT * FROM read_json_auto('data/json/with_list.json', maximum_depth=2)
----
1	["O", "Brother,", "Where", "Art", "Thou?"]
2	["Home", "for", "the", "Holidays"]
3	["The", "Firm"]
4	["Broadcast", "News"]
5	["Raising", "Arizona"]

# at level 3 it's fully detected, and we get BIGINT and VARCHAR[]
query II
SELECT * FROM read_json_auto('data/json/with_list.json', maximum_depth=3)
----
1	[O, 'Brother,', Where, Art, Thou?]
2	[Home, for, the, Holidays]
3	[The, Firm]
4	[Broadcast, News]
5	[Raising, Arizona]

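# added recap of the depth ladder above (not part of the original test):
# at maximum_depth=2, "id" is already resolved to its concrete type
query I
SELECT typeof(id) FROM read_json_auto('data/json/with_list.json', maximum_depth=2) LIMIT 1
----
BIGINT
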
# we can detect lists too
query III
SELECT id, typeof(name), unnest(name) FROM 'data/json/with_list.json'
----
1	VARCHAR[]	O
1	VARCHAR[]	Brother,
1	VARCHAR[]	Where
1	VARCHAR[]	Art
1	VARCHAR[]	Thou?
2	VARCHAR[]	Home
2	VARCHAR[]	for
2	VARCHAR[]	the
2	VARCHAR[]	Holidays
3	VARCHAR[]	The
3	VARCHAR[]	Firm
4	VARCHAR[]	Broadcast
4	VARCHAR[]	News
5	VARCHAR[]	Raising
5	VARCHAR[]	Arizona

# with depth 2 we don't bother detecting inside of the list - defaults to JSON
query III
SELECT id, typeof(name), unnest(name) FROM read_json_auto('data/json/with_list.json', maximum_depth=2)
----
1	JSON[]	"O"
1	JSON[]	"Brother,"
1	JSON[]	"Where"
1	JSON[]	"Art"
1	JSON[]	"Thou?"
2	JSON[]	"Home"
2	JSON[]	"for"
2	JSON[]	"the"
2	JSON[]	"Holidays"
3	JSON[]	"The"
3	JSON[]	"Firm"
4	JSON[]	"Broadcast"
4	JSON[]	"News"
5	JSON[]	"Raising"
5	JSON[]	"Arizona"

# with depth 1 we don't detect within the columns, so everything defaults to JSON (even the "id" column in this case)
query II
SELECT typeof(id), typeof(name) FROM read_json_auto('data/json/with_list.json', maximum_depth=1)
----
JSON	JSON
JSON	JSON
JSON	JSON
JSON	JSON
JSON	JSON

# we can detect UUIDs
query II
SELECT id, typeof(id) FROM 'data/json/with_uuid.json'
----
bbd05ae7-76e5-4f1a-a31f-247408251fc9	UUID
d5c52052-5f8e-473f-bc8d-176342643ef5	UUID
3b6a6de3-0732-4591-93ed-8df6091eb00d	UUID
ae24e69e-e0bf-4e85-9848-27d35df85b8b	UUID
63928b16-1814-436f-8b30-b3c40cc31d51	UUID

# top-level array of values
query I
select * from read_json('data/json/top_level_array.json', columns={conclusion: 'VARCHAR'})
----
cancelled
cancelled

query I
select * from read_json('data/json/top_level_array.json', auto_detect=true)
----
cancelled
cancelled

# if we try to read it as 'unstructured' records, we get a transform error
statement error
select * from read_json('data/json/top_level_array.json', columns={conclusion: 'VARCHAR'}, format='unstructured', records=true)
----
Invalid Input Error: JSON transform error in file "data/json/top_level_array.json", in record/value 1: Expected OBJECT, but got ARRAY

# if we try to read an ndjson file as if it is an array of values, we get an error
statement error
select * from read_json_auto('data/json/example_n.ndjson', format='array')
----
Invalid Input Error: Expected top-level JSON array

# test that we can read a list longer than STANDARD_VECTOR_SIZE properly
statement ok
copy (select 42 duck from range(10000)) to '__TEST_DIR__/my_file.json' (array true)

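# the COPY above should emit one top-level JSON array of records, roughly
# [{"duck":42},{"duck":42},...] (shape assumed from the ARRAY true option)
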
query T
select count(*) from read_json('__TEST_DIR__/my_file.json', columns={duck: 'INTEGER'}, format='array')
----
10000

query T
select sum(duck) = 42*10000 from read_json('__TEST_DIR__/my_file.json', columns={duck: 'INTEGER'}, format='array')
----
true

# read_json_auto also understands ARRAY format
query T
select count(*) from '__TEST_DIR__/my_file.json'
----
10000

query T
select sum(duck) = 42*10000 from '__TEST_DIR__/my_file.json'
----
true

# what if we do an array of non-records?
statement ok
copy (select list(range) from range(10)) to '__TEST_DIR__/my_file.json' (format csv, quote '', HEADER 0)

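# list(range) aggregates 0..9 into [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; written via
# CSV with quoting disabled, the file contains exactly that text, which is a
# valid top-level JSON array of non-record values
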
query T
select * from '__TEST_DIR__/my_file.json'
----
0
1
2
3
4
5
6
7
8
9

# fails because it's not records
statement error
select * from read_json('__TEST_DIR__/my_file.json', format='array', columns={range: 'INTEGER'}, records=true)
----
Invalid Input Error: JSON transform error

# fails because it's not records
statement error
select * from read_json_auto('__TEST_DIR__/my_file.json', format='array', records=true)
----
Binder Error: json_read expected records

query T
select * from read_json('__TEST_DIR__/my_file.json', format='auto', records=false, auto_detect=true)
----
0
1
2
3
4
5
6
7
8
9

# need to supply columns
statement error
select * from read_json('__TEST_DIR__/my_file.json', format='auto', records='false', auto_detect=false)
----
Binder Error

# read as unstructured values, so we just get the array
query T
select * from read_json('__TEST_DIR__/my_file.json', format='unstructured', records='false', auto_detect=true)
----
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# array of non-records
query T
select * from read_json('__TEST_DIR__/my_file.json', format='array', records='false', auto_detect=true)
----
0
1
2
3
4
5
6
7
8
9

# records='auto' also works
query T
select * from read_json('__TEST_DIR__/my_file.json', format='array', records='auto', auto_detect=true)
----
0
1
2
3
4
5
6
7
8
9

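# to summarize the combinations above: records=true on non-records is an error,
# records='false' with format='unstructured' yields the whole array as a single
# value, and records='false' or records='auto' with format='array' yield one
# row per array element
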
# lower the thread count so the next tests don't OOM on many-core machines
statement ok
SET threads=2

# issue 6646, this is not an array, but we try to read it as one
statement error
select json_structure(json ->> '$.metadata') as structure,
from read_json('data/json/issue.json', format='array', columns={'json': 'JSON'}, maximum_object_size=104857600)
limit 1;
----
Invalid Input Error: Expected top-level JSON array

# let's try a variation
statement error
select json_structure(json ->> '$.metadata') as structure,
from read_json('data/json/issue.json', format='array', records='false', columns={'json': 'JSON'}, maximum_object_size=104857600)
limit 1;
----
Invalid Input Error: Expected top-level JSON array

# we can parse it as unstructured values, and give it a different column name
query I
select json_structure(my_json ->> '$.metadata') as structure,
from read_json('data/json/issue.json', format='unstructured', records='false', columns={'my_json': 'JSON'}, maximum_object_size=104857600)
limit 1;
----
{"argv":["VARCHAR"],"dag":{"dag_size":"VARCHAR","tasks":{"load_oscar":{"status":"VARCHAR","type":"VARCHAR","upstream":"VARCHAR","products":{"nb":"VARCHAR"}},"load_weather":{"status":"VARCHAR","type":"VARCHAR","upstream":"VARCHAR","products":{"nb":"VARCHAR"}},"compress":{"status":"VARCHAR","type":"VARCHAR","upstream":{"load_oscar":"VARCHAR"},"products":{"nb":"VARCHAR"}}}}}

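# for reference, a minimal standalone json_structure example (added sketch; the
# expected output assumes DuckDB maps non-negative JSON integers to UBIGINT)
query I
select json_structure('{"a": 42, "b": "duck"}')
----
{"a":"UBIGINT","b":"VARCHAR"}
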
statement ok
pragma disable_verification

# test that we can read a JSON list that spans more than one buffer size
# each record is ~55 bytes, and the minimum buffer size is 32MB
# let's do 50k to be safe
statement ok
copy (select 42 this_is_a_very_long_field_name_yes_very_much_so from range(50000)) to '__TEST_DIR__/my_file.json' (array true)

query T
select sum(this_is_a_very_long_field_name_yes_very_much_so) = 42 * 50000 from '__TEST_DIR__/my_file.json'
----
true

require httpfs

query II
select * from read_json_auto('https://github.com/duckdb/duckdb-data/releases/download/v1.0/example_rn.ndjson');
----
1	O Brother, Where Art Thou?
2	Home for the Holidays
3	The Firm
4	Broadcast News
5	Raising Arizona