should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion


@@ -0,0 +1,14 @@
# name: test/sql/json/table/auto_glob_directory.test
# description: Test auto globbing a directory
# group: [table]
require json
statement ok
COPY (SELECT i%2 AS grp, i FROM range(1000) t(i)) TO '__TEST_DIR__/glob_dir_json' (FORMAT json, PER_THREAD_OUTPUT);
query II
SELECT grp, COUNT(*) FROM read_json('__TEST_DIR__/glob_dir_json') GROUP BY ALL ORDER BY ALL
----
0 500
1 500
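# the directory auto-glob above should be equivalent to an explicit glob of the files inside it
# (illustrative sketch; assumes the PER_THREAD_OUTPUT files can be matched with '*', output not compared)
statement ok
SELECT grp, COUNT(*) FROM read_json('__TEST_DIR__/glob_dir_json/*') GROUP BY ALL ORDER BY ALL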


@@ -0,0 +1,49 @@
# name: test/sql/json/table/json_empty_array.test
# description: Read json files with empty arrays
# group: [table]
require json
statement ok
pragma enable_verification
# empty file
query I
select * from 'data/json/empty.ndjson'
----
query I
select * from 'data/json/whitespace_only.json'
----
# empty array
query I
SELECT * FROM read_json_auto('data/json/empty_array.json')
----
query I
SELECT * FROM read_json_auto('data/json/empty_no_newline.json')
----
# malformed files
statement error
SELECT * FROM read_json_auto('data/json/malformed/empty_array_malformed.json')
----
Missing closing brace
statement error
SELECT * FROM read_json_auto('data/json/malformed/empty_array_trailing.json', format='array')
----
Empty array with trailing data when parsing JSON array
statement error
SELECT * FROM read_json_auto('data/json/malformed/array_comma_malformed.json', format='array')
----
Malformed JSON
query I
SELECT * FROM read_json_auto('data/json/array_of_empty_arrays.json', format='array')
----
[]
[]
[]


@@ -0,0 +1,164 @@
# name: test/sql/json/table/json_multi_file_reader.test
# description: Test MultiFileReader integration in JSON reader
# group: [table]
require json
statement ok
create table test as SELECT i as i, to_json([i%4]) as j FROM range(0,20) as tbl(i)
# FIXME: we can't do partitioned JSON writes yet, because the column we partition by is packed into a to_json call
# (we just push an expression and then use the CSV writer), so this test uses the CSV writer for now
statement ok
COPY test TO '__TEST_DIR__/json_part' (FORMAT csv, quote '', PARTITION_BY (j), HEADER 0);
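# once partitioned JSON writes are supported, the direct form would presumably look like this
# (hypothetical sketch, not executed here):
#   COPY test TO '__TEST_DIR__/json_part' (FORMAT json, PARTITION_BY (j));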
# some tests for read_json first
query III
select * exclude (filename), replace(filename, '\', '/') as filename from read_json_auto('data/json/example_*.ndjson', filename=true) order by all
----
1 O Brother, Where Art Thou? data/json/example_n.ndjson
1 O Brother, Where Art Thou? data/json/example_r.ndjson
1 O Brother, Where Art Thou? data/json/example_rn.ndjson
2 Home for the Holidays data/json/example_n.ndjson
2 Home for the Holidays data/json/example_r.ndjson
2 Home for the Holidays data/json/example_rn.ndjson
3 The Firm data/json/example_n.ndjson
3 The Firm data/json/example_r.ndjson
3 The Firm data/json/example_rn.ndjson
4 Broadcast News data/json/example_n.ndjson
4 Broadcast News data/json/example_r.ndjson
4 Broadcast News data/json/example_rn.ndjson
5 Raising Arizona data/json/example_n.ndjson
5 Raising Arizona data/json/example_r.ndjson
5 Raising Arizona data/json/example_rn.ndjson
# the filename is also available as a virtual column (without filename=true)
query III
select *, replace(filename, '\', '/') from read_json_auto('data/json/example_*.ndjson') order by all
----
1 O Brother, Where Art Thou? data/json/example_n.ndjson
1 O Brother, Where Art Thou? data/json/example_r.ndjson
1 O Brother, Where Art Thou? data/json/example_rn.ndjson
2 Home for the Holidays data/json/example_n.ndjson
2 Home for the Holidays data/json/example_r.ndjson
2 Home for the Holidays data/json/example_rn.ndjson
3 The Firm data/json/example_n.ndjson
3 The Firm data/json/example_r.ndjson
3 The Firm data/json/example_rn.ndjson
4 Broadcast News data/json/example_n.ndjson
4 Broadcast News data/json/example_r.ndjson
4 Broadcast News data/json/example_rn.ndjson
5 Raising Arizona data/json/example_n.ndjson
5 Raising Arizona data/json/example_r.ndjson
5 Raising Arizona data/json/example_rn.ndjson
query III
select * from read_json_auto(['data/json/example_n.ndjson', 'data/json/top_level_array.json'], union_by_name=true) order by all
----
1 O Brother, Where Art Thou? NULL
2 Home for the Holidays NULL
3 The Firm NULL
4 Broadcast News NULL
5 Raising Arizona NULL
NULL NULL cancelled
NULL NULL cancelled
# despite not being able to do partitioned writes, we can do partitioned json reads already!
query II
SELECT j, count(*) FROM read_json_auto('__TEST_DIR__/json_part/j=*/*.csv', HIVE_PARTITIONING=1) group by j order by j;
----
[0] 5
[1] 5
[2] 5
[3] 5
# also test read_json_objects
query II
select * exclude (filename), replace(filename, '\', '/') as filename from read_json_objects_auto('data/json/example_*.ndjson', filename=true) order by all
----
{"id":1,"name":"O Brother, Where Art Thou?"} data/json/example_n.ndjson
{"id":1,"name":"O Brother, Where Art Thou?"} data/json/example_r.ndjson
{"id":1,"name":"O Brother, Where Art Thou?"} data/json/example_rn.ndjson
{"id":2,"name":"Home for the Holidays"} data/json/example_n.ndjson
{"id":2,"name":"Home for the Holidays"} data/json/example_r.ndjson
{"id":2,"name":"Home for the Holidays"} data/json/example_rn.ndjson
{"id":3,"name":"The Firm"} data/json/example_n.ndjson
{"id":3,"name":"The Firm"} data/json/example_r.ndjson
{"id":3,"name":"The Firm"} data/json/example_rn.ndjson
{"id":4,"name":"Broadcast News"} data/json/example_n.ndjson
{"id":4,"name":"Broadcast News"} data/json/example_r.ndjson
{"id":4,"name":"Broadcast News"} data/json/example_rn.ndjson
{"id":5,"name":"Raising Arizona"} data/json/example_n.ndjson
{"id":5,"name":"Raising Arizona"} data/json/example_r.ndjson
{"id":5,"name":"Raising Arizona"} data/json/example_rn.ndjson
query I
select * from read_json_objects_auto(['data/json/example_n.ndjson', 'data/json/top_level_array.json'], union_by_name=true) order by all
----
{"conclusion":"cancelled"}
{"conclusion":"cancelled"}
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
query II
select j, count(*) from read_json_objects_auto('__TEST_DIR__/json_part/j=*/*.csv', HIVE_PARTITIONING=1) group by j order by j
----
[0] 5
[1] 5
[2] 5
[3] 5
# also test filter pushdown on the hive partition column
query II
SELECT j, count(*)
FROM read_json_auto('__TEST_DIR__/json_part/j=*/*.csv', HIVE_PARTITIONING=1)
where j='[2]'
group by j
order by j;
----
[2] 5
query II
SELECT j, count(*)
FROM read_json_auto('__TEST_DIR__/json_part/j=*/*.csv', HIVE_PARTITIONING=1)
where j>'[2]'
group by j
order by j;
----
[3] 5
query II
SELECT j, count(*)
FROM read_json_auto('__TEST_DIR__/json_part/j=*/*.csv', HIVE_PARTITIONING=1)
where sqrt(j[2]::int) > 1.5
group by j
order by j;
----
[3] 5
# the JSON multi-file reader is a bit different, because we sample a total of sample_size rows
# even across multiple files when union_by_name=false
# these two files have different schemas, but we can read them together nonetheless
statement ok
SELECT * FROM read_json_auto(['data/json/with_uuid.json', 'data/json/example_n.ndjson'])
# both have 5 rows, so if we set sample_size=1, and maximum_sample_files=1, we cannot read them together anymore
statement error
SELECT * FROM read_json_auto(['data/json/with_uuid.json', 'data/json/example_n.ndjson'], sample_size=1, maximum_sample_files=1)
----
Invalid Input Error
# if we increase maximum_sample_files, or set union_by_name=true, then we can read them again
statement ok
SELECT * FROM read_json_auto(['data/json/with_uuid.json', 'data/json/example_n.ndjson'], sample_size=1, maximum_sample_files=99)
# if we set union_by_name=true, then we sample sample_size rows per file, so we can read them again
statement ok
SELECT * FROM read_json_auto(['data/json/with_uuid.json', 'data/json/example_n.ndjson'], sample_size=1, union_by_name=true)
# with sample_size=6 we sample the first file's 5 rows plus 1 line from the second file, so of course we can read them again
statement ok
SELECT * FROM read_json_auto(['data/json/with_uuid.json', 'data/json/example_n.ndjson'], sample_size=6)


@@ -0,0 +1,23 @@
# name: test/sql/json/table/multi_file_hang.test
# description: Test that we do not hang when reading multiple JSON files while only sampling one
# group: [table]
require json
# needs more threads than the number of files for this to happen
statement ok
set threads=8
# only happened with these parameters
statement error
from read_json('data/json/multi_file_hang/*.json', sample_size=1, maximum_sample_files=1)
----
Invalid Input Error: JSON transform error
# the fuzzer also detected a single file hang, because we tried not to error here
# we cannot ignore errors of this kind when the data is not newline-delimited
# because we wouldn't know how to continue
statement error
SELECT * FROM read_json('data/json/fuzzer_hang.json', ignore_errors=true);
----
Invalid Input Error


@@ -0,0 +1,414 @@
# name: test/sql/json/table/read_json.test
# description: Read json files straight to columnar data
# group: [table]
require json
statement ok
pragma enable_verification
statement error
SELECT * FROM read_json('data/json/example_n.ndjson', auto_detect=false)
----
Binder Error
# can't read ndjson with format='array'
statement error
SELECT * FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='array')
----
Invalid Input Error: Expected top-level JSON array
# read_ndjson works
query II
SELECT * FROM read_ndjson('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'})
----
1 O Brother, Where Art Thou?
2 Home for the Holidays
3 The Firm
4 Broadcast News
5 Raising Arizona
# We can also read only one of the columns
query I
SELECT * FROM read_ndjson('data/json/example_n.ndjson', columns={id: 'INTEGER'})
----
1
2
3
4
5
query I
SELECT * FROM read_ndjson('data/json/example_n.ndjson', columns={name: 'VARCHAR'})
----
O Brother, Where Art Thou?
Home for the Holidays
The Firm
Broadcast News
Raising Arizona
# what about a broken JSON file
query II
SELECT * FROM read_ndjson('data/json/unterminated_quotes.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, ignore_errors=true)
----
1 O Brother, Where Art Thou?
2 Home for the Holidays
NULL NULL
4 Broadcast News
5 Raising Arizona
# some of these values don't have "name"
query II
SELECT * FROM read_ndjson('data/json/different_schemas.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'})
----
1 O Brother, Where Art Thou?
2 NULL
3 The Firm
4 NULL
5 Raising Arizona
# test projection pushdown (unstructured json)
query I
SELECT id FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='unstructured')
----
1
2
3
4
5
query I
SELECT name FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='unstructured')
----
O Brother, Where Art Thou?
Home for the Holidays
The Firm
Broadcast News
Raising Arizona
# test projection pushdown (newline-delimited json)
query I
SELECT id FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='newline_delimited')
----
1
2
3
4
5
query I
SELECT name FROM read_ndjson('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='nd')
----
O Brother, Where Art Thou?
Home for the Holidays
The Firm
Broadcast News
Raising Arizona
# auto-detect
query II
SELECT * FROM read_json_auto('data/json/example_n.ndjson')
----
1 O Brother, Where Art Thou?
2 Home for the Holidays
3 The Firm
4 Broadcast News
5 Raising Arizona
query II
SELECT * FROM 'data/json/example_n.ndjson'
----
1 O Brother, Where Art Thou?
2 Home for the Holidays
3 The Firm
4 Broadcast News
5 Raising Arizona
# we can detect at varying levels, level 0 is just JSON
query I
SELECT * FROM read_json_auto('data/json/with_list.json', maximum_depth=0)
----
{"id":1,"name":["O","Brother,","Where","Art","Thou?"]}
{"id":2,"name":["Home","for","the","Holidays"]}
{"id":3,"name":["The","Firm"]}
{"id":4,"name":["Broadcast","News"]}
{"id":5,"name":["Raising","Arizona"]}
# at level one we get JSON and JSON
query II
SELECT * FROM read_json_auto('data/json/with_list.json', maximum_depth=1)
----
1 ["O","Brother,","Where","Art","Thou?"]
2 ["Home","for","the","Holidays"]
3 ["The","Firm"]
4 ["Broadcast","News"]
5 ["Raising","Arizona"]
# at level 2 we get BIGINT and JSON[]
query II
SELECT * FROM read_json_auto('data/json/with_list.json', maximum_depth=2)
----
1 ["O", "Brother,", "Where", "Art", "Thou?"]
2 ["Home", "for", "the", "Holidays"]
3 ["The", "Firm"]
4 ["Broadcast", "News"]
5 ["Raising", "Arizona"]
# at level 3 it's fully detected, and we get BIGINT and VARCHAR[]
query II
SELECT * FROM read_json_auto('data/json/with_list.json', maximum_depth=3)
----
1 [O, 'Brother,', Where, Art, Thou?]
2 [Home, for, the, Holidays]
3 [The, Firm]
4 [Broadcast, News]
5 [Raising, Arizona]
# we can detect lists too
query III
SELECT id, typeof(name), unnest(name) FROM 'data/json/with_list.json'
----
1 VARCHAR[] O
1 VARCHAR[] Brother,
1 VARCHAR[] Where
1 VARCHAR[] Art
1 VARCHAR[] Thou?
2 VARCHAR[] Home
2 VARCHAR[] for
2 VARCHAR[] the
2 VARCHAR[] Holidays
3 VARCHAR[] The
3 VARCHAR[] Firm
4 VARCHAR[] Broadcast
4 VARCHAR[] News
5 VARCHAR[] Raising
5 VARCHAR[] Arizona
# with depth 2 we don't bother detecting inside of the list - defaults to JSON
query III
SELECT id, typeof(name), unnest(name) FROM read_json_auto('data/json/with_list.json', maximum_depth=2)
----
1 JSON[] "O"
1 JSON[] "Brother,"
1 JSON[] "Where"
1 JSON[] "Art"
1 JSON[] "Thou?"
2 JSON[] "Home"
2 JSON[] "for"
2 JSON[] "the"
2 JSON[] "Holidays"
3 JSON[] "The"
3 JSON[] "Firm"
4 JSON[] "Broadcast"
4 JSON[] "News"
5 JSON[] "Raising"
5 JSON[] "Arizona"
# with depth 1 we don't bother detecting inside the record, so every field defaults to JSON (even the "id" column in this case)
query II
SELECT typeof(id), typeof(name) FROM read_json_auto('data/json/with_list.json', maximum_depth=1)
----
JSON JSON
JSON JSON
JSON JSON
JSON JSON
JSON JSON
# we can detect UUIDs
query II
SELECT id, typeof(id) FROM 'data/json/with_uuid.json'
----
bbd05ae7-76e5-4f1a-a31f-247408251fc9 UUID
d5c52052-5f8e-473f-bc8d-176342643ef5 UUID
3b6a6de3-0732-4591-93ed-8df6091eb00d UUID
ae24e69e-e0bf-4e85-9848-27d35df85b8b UUID
63928b16-1814-436f-8b30-b3c40cc31d51 UUID
# top-level array of values
query I
select * from read_json('data/json/top_level_array.json', columns={conclusion: 'VARCHAR'})
----
cancelled
cancelled
query I
select * from read_json('data/json/top_level_array.json', auto_detect=true)
----
cancelled
cancelled
# if we try to read it as 'unstructured' records
statement error
select * from read_json('data/json/top_level_array.json', columns={conclusion: 'VARCHAR'}, format='unstructured', records=true)
----
Invalid Input Error: JSON transform error in file "data/json/top_level_array.json", in record/value 1: Expected OBJECT, but got ARRAY
# if we try to read an ndjson file as if it is an array of values, we get an error
statement error
select * from read_json_auto('data/json/example_n.ndjson', format='array')
----
Invalid Input Error: Expected top-level JSON array
# test that we can read a list longer than STANDARD_VECTOR_SIZE properly
statement ok
copy (select 42 duck from range(10000)) to '__TEST_DIR__/my_file.json' (array true)
query T
select count(*) from read_json('__TEST_DIR__/my_file.json', columns={duck: 'INTEGER'}, format='array')
----
10000
query T
select sum(duck) = 42*10000 from read_json('__TEST_DIR__/my_file.json', columns={duck: 'INTEGER'}, format='array')
----
true
# read_json_auto also understands ARRAY format
query T
select count(*) from '__TEST_DIR__/my_file.json'
----
10000
query T
select sum(duck) = 42*10000 from '__TEST_DIR__/my_file.json'
----
true
# what if we do an array of non-records?
statement ok
copy (select list(range) from range(10)) to '__TEST_DIR__/my_file.json' (format csv, quote '', HEADER 0)
query T
select * from '__TEST_DIR__/my_file.json'
----
0
1
2
3
4
5
6
7
8
9
# fails because it's not records
statement error
select * from read_json('__TEST_DIR__/my_file.json', format='array', columns={range: 'INTEGER'}, records=true)
----
Invalid Input Error: JSON transform error
# fails because it's not records
statement error
select * from read_json_auto('__TEST_DIR__/my_file.json', format='array', records=true)
----
Binder Error: json_read expected records
query T
select * from read_json('__TEST_DIR__/my_file.json', format='auto', records=false, auto_detect=true)
----
0
1
2
3
4
5
6
7
8
9
# need to supply columns
statement error
select * from read_json('__TEST_DIR__/my_file.json', format='auto', records='false', auto_detect=false)
----
Binder Error
# read as unstructured values, so we just get the array
query T
select * from read_json('__TEST_DIR__/my_file.json', format='unstructured', records='false', auto_detect=true)
----
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# array of non-records
query T
select * from read_json('__TEST_DIR__/my_file.json', format='array', records='false', auto_detect=true)
----
0
1
2
3
4
5
6
7
8
9
# also works with auto
query T
select * from read_json('__TEST_DIR__/my_file.json', format='array', records='auto', auto_detect=true)
----
0
1
2
3
4
5
6
7
8
9
# lower thread count so the next tests don't OOM on many-core machines
statement ok
SET threads=2
# issue 6646, this is not an array, but we try to read it as one
statement error
select json_structure(json ->> '$.metadata') as structure,
from read_json('data/json/issue.json', format='array', columns={'json': 'JSON'}, maximum_object_size=104857600)
limit 1;
----
Invalid Input Error: Expected top-level JSON array
# let's try a variation
statement error
select json_structure(json ->> '$.metadata') as structure,
from read_json('data/json/issue.json', format='array', records='false', columns={'json': 'JSON'}, maximum_object_size=104857600)
limit 1;
----
Invalid Input Error: Expected top-level JSON array
# we can parse it as unstructured values, and give it a different column name
query I
select json_structure(my_json ->> '$.metadata') as structure,
from read_json('data/json/issue.json', format='unstructured', records='false', columns={'my_json': 'JSON'}, maximum_object_size=104857600)
limit 1;
----
{"argv":["VARCHAR"],"dag":{"dag_size":"VARCHAR","tasks":{"load_oscar":{"status":"VARCHAR","type":"VARCHAR","upstream":"VARCHAR","products":{"nb":"VARCHAR"}},"load_weather":{"status":"VARCHAR","type":"VARCHAR","upstream":"VARCHAR","products":{"nb":"VARCHAR"}},"compress":{"status":"VARCHAR","type":"VARCHAR","upstream":{"load_oscar":"VARCHAR"},"products":{"nb":"VARCHAR"}}}}}
statement ok
pragma disable_verification
# test that we can read a JSON list that spans more than one buffer size
# the JSON is 55 bytes, and the minimum buffer size is 32MB
# let's do 50k to be safe
statement ok
copy (select 42 this_is_a_very_long_field_name_yes_very_much_so from range(50000)) to '__TEST_DIR__/my_file.json' (array true)
query T
select sum(this_is_a_very_long_field_name_yes_very_much_so) = 42 * 50000 from '__TEST_DIR__/my_file.json'
----
true
require httpfs
query II
select * from read_json_auto('https://github.com/duckdb/duckdb-data/releases/download/v1.0/example_rn.ndjson');
----
1 O Brother, Where Art Thou?
2 Home for the Holidays
3 The Firm
4 Broadcast News
5 Raising Arizona


@@ -0,0 +1,354 @@
# name: test/sql/json/table/read_json_auto.test_slow
# description: Read json files - schema detection
# group: [table]
require json
statement ok
pragma enable_verification
# some Arrow tests (python/pyarrow/tests/test_json.py) from their GitHub repository
# these are very similar to the pandas tests, so let's not copy those
# instead of adding all of these files to data/test, we just create them on the fly here
# whenever we add a '' at the end, it's just to check that we skip the trailing newline that's sometimes there
statement ok
copy (select * from (values ('{"a": 1, "b": 2}'), (''))) to '__TEST_DIR__/my_file.json' (format csv, quote '', header 0);
query II
select * from '__TEST_DIR__/my_file.json'
----
1 2
statement ok
copy (select * from (values ('{"a": 1}'), ('{"a": 2}'), ('{"a": 3}'))) to '__TEST_DIR__/my_file.json' (format csv, quote '', header 0)
query I
select * from '__TEST_DIR__/my_file.json'
----
1
2
3
query I
select count(*) from '__TEST_DIR__/my_file.json'
----
3
statement ok
copy (select * from (values ('{"a": 1,"b": 2, "c": 3}'), ('{"a": 4,"b": 5, "c": 6}'))) to '__TEST_DIR__/my_file.json' (format csv, quote '', header 0)
query III
select * from '__TEST_DIR__/my_file.json'
----
1 2 3
4 5 6
statement ok
copy (select * from (values ('{"a": 1,"b": 2, "c": "3", "d": false}'), ('{"a": 4.0, "b": -5, "c": "foo", "d": true}'), (''))) to '__TEST_DIR__/my_file.json' (format csv, quote '', header 0)
query IIII
select * from '__TEST_DIR__/my_file.json'
----
1.0 2 3 false
4.0 -5 foo true
# mixed types that cannot be resolved default to JSON (column 3)
statement ok
copy (select * from (values ('{"a": 1, "b": 2, "c": null, "d": null, "e": null}'), ('{"a": null, "b": -5, "c": "foo", "d": null, "e": true}'), ('{"a": 4.5, "b": null, "c": "nan", "d": null,"e": false}'), (''))) to '__TEST_DIR__/my_file.json' (format csv, quote '', header 0)
query IIIII
select * from '__TEST_DIR__/my_file.json'
----
1.0 2 NULL NULL NULL
NULL -5 foo NULL true
4.5 NULL nan NULL false
# mixed types are resolved to DOUBLE here
statement ok
copy (select * from (values ('{"a": 1}'), ('{"a": 1.45}'), ('{"a": -23.456}'), ('{}'), (''))) to '__TEST_DIR__/my_file.json' (format csv, quote '', header 0)
query II
select typeof(a), a from '__TEST_DIR__/my_file.json'
----
DOUBLE 1.0
DOUBLE 1.45
DOUBLE -23.456
DOUBLE NULL
statement ok
copy (select * from (values ('{"foo": "bar", "num": 0}'), ('{"foo": "baz", "num": 1}'), (''))) to '__TEST_DIR__/my_file.json' (format csv, quote '', header 0)
query II
select * from '__TEST_DIR__/my_file.json'
----
bar 0
baz 1
# we can read values from a top-level list
query I
select * from 'data/json/top_level_array.json'
----
cancelled
cancelled
query I
select count(*) from 'data/json/top_level_array.json'
----
2
# for maximum_depth=0 this is two records of JSON
query I
select * from read_json_auto('data/json/top_level_array.json', maximum_depth=0)
----
{"conclusion":"cancelled"}
{"conclusion":"cancelled"}
# for 1 it's 1 column of JSON
query I
select * from read_json_auto('data/json/top_level_array.json', maximum_depth=1)
----
"cancelled"
"cancelled"
# if we read this with records='false', we get the struct instead of the unpacked columns
query I
select typeof(json) from read_json_auto('data/json/top_level_array.json', records='false')
----
STRUCT(conclusion VARCHAR)
STRUCT(conclusion VARCHAR)
# however, if there are multiple top-level arrays, we default to reading them as lists
query I
select * from 'data/json/top_level_two_arrays.json'
----
[{'conclusion': cancelled}, {'conclusion': cancelled}]
[{'conclusion': cancelled}, {'conclusion': cancelled}]
# if we read a top-level array as if it is a record, then we get an error
statement error
select * from read_json_auto('data/json/top_level_array.json', format='unstructured', records='true')
----
Binder Error: json_read expected records
# issue Mark found when analyzing a JSON dump of our CI - projection pushdown wasn't working properly
statement ok
select * from 'data/json/projection_pushdown_example.json' WHERE status <> 'completed'
# different schemas - this one should work regardless of sampling 1 or all lines
query II
select * from read_json_auto('data/json/different_schemas.ndjson', sample_size=1)
----
1 O Brother, Where Art Thou?
2 NULL
3 The Firm
4 NULL
5 Raising Arizona
query II
select * from read_json_auto('data/json/different_schemas.ndjson', sample_size=-1)
----
1 O Brother, Where Art Thou?
2 NULL
3 The Firm
4 NULL
5 Raising Arizona
# if we require fields to appear in all objects by setting field_appearance_threshold=1, we default to MAP
query I
select typeof(COLUMNS(*)) from read_json_auto('data/json/different_schemas.ndjson', field_appearance_threshold=1) limit 1
----
MAP(VARCHAR, JSON)
query I
select * from read_json_auto('data/json/different_schemas.ndjson', field_appearance_threshold=1)
----
{id=1, name='"O Brother, Where Art Thou?"'}
{id=2}
{id=3, name='"The Firm"'}
{id=4}
{id=5, name='"Raising Arizona"'}
# if we set it to 0.5 it should work already since "name" appears in 3/5 objects, which is greater than 0.5
query II
select * from read_json_auto('data/json/different_schemas.ndjson', field_appearance_threshold=0.5)
----
1 O Brother, Where Art Thou?
2 NULL
3 The Firm
4 NULL
5 Raising Arizona
# can't set it to less than 0 or more than 1
statement error
select * from read_json_auto('data/json/different_schemas.ndjson', field_appearance_threshold=-1)
----
Binder Error: read_json_auto "field_appearance_threshold" parameter must be between 0 and 1
statement error
select * from read_json_auto('data/json/different_schemas.ndjson', field_appearance_threshold=2)
----
Binder Error: read_json_auto "field_appearance_threshold" parameter must be between 0 and 1
# inconsistent schemas - if we only sample 1 row, we get an error, because we only see a NULL value for the 2nd column
statement error
select * from read_json_auto('data/json/inconsistent_schemas.ndjson', sample_size=1, convert_strings_to_integers=true)
----
Invalid Input Error: JSON transform error in file "data/json/inconsistent_schemas.ndjson", in line 3
# if we increase the sample size to 2, we can read it just fine
query II
select * from read_json_auto('data/json/inconsistent_schemas.ndjson', sample_size=2)
----
"1" NULL
2 Home for the Holidays
[3] The Firm
4 Broadcast News
5 Raising Arizona
# we can also detect BIGINTs in strings (happens a lot in JSON for some reason ...)
statement ok
copy (select * from (values ('{"id": "26941143801"}'), ('{"id": "26941143807"}'))) to '__TEST_DIR__/my_file.json' (format csv, quote '', header 0)
# but only if we set the parameter to true
query T
select typeof(id) from read_json('__TEST_DIR__/my_file.json', convert_strings_to_integers=true)
----
BIGINT
BIGINT
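# as a counterpart sketch: without convert_strings_to_integers the ids presumably stay VARCHAR
# (illustrative; only checking that the statement runs, not its output)
statement ok
select typeof(id) from read_json('__TEST_DIR__/my_file.json')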
# an empty array file combined with the example file works
query II
select * from read_json_auto(['data/json/empty_array.json', 'data/json/example_n.ndjson']);
----
1 O Brother, Where Art Thou?
2 Home for the Holidays
3 The Firm
4 Broadcast News
5 Raising Arizona
# Simple map inference with default threshold
query T
select distinct typeof(a) from read_json_auto('data/json/simple_map.jsonl')
----
MAP(VARCHAR, BIGINT)
# Test setting map_inference_threshold high
query T
select distinct typeof(a) from read_json_auto('data/json/simple_map.jsonl', map_inference_threshold=1000)
----
MAP(VARCHAR, BIGINT)
# Map inference can be disabled
query T
select distinct typeof(a) from read_json_auto('data/json/simple_map.jsonl', map_inference_threshold=-1, field_appearance_threshold=0)
----
STRUCT("1" JSON, "2" BIGINT, "3" BIGINT, "4" BIGINT, "5" BIGINT, "6" BIGINT, "7" BIGINT, "8" BIGINT, "9" BIGINT, "10" BIGINT, "11" BIGINT, "12" BIGINT, "13" BIGINT, "14" BIGINT, "15" BIGINT, "16" JSON, "17" BIGINT, "18" BIGINT, "19" BIGINT, "20" BIGINT, "21" BIGINT, "22" BIGINT, "23" BIGINT, "24" BIGINT, "25" BIGINT, "26" BIGINT, "27" BIGINT, "28" BIGINT, "29" BIGINT, "30" BIGINT, "31" BIGINT, "32" BIGINT, "33" BIGINT, "34" BIGINT, "35" BIGINT, "36" BIGINT, "37" BIGINT, "38" BIGINT, "39" BIGINT, "40" BIGINT, "41" BIGINT, "42" BIGINT, "43" BIGINT, "44" BIGINT, "45" BIGINT, "46" BIGINT, "47" BIGINT, "48" BIGINT, "49" BIGINT, "50" BIGINT, "51" BIGINT, "52" BIGINT, "53" BIGINT, "54" BIGINT, "55" BIGINT, "56" BIGINT, "57" BIGINT, "58" BIGINT, "59" BIGINT, "60" BIGINT, "61" BIGINT, "62" BIGINT, "63" BIGINT, "64" BIGINT, "65" BIGINT, "66" BIGINT, "67" BIGINT, "68" BIGINT, "69" BIGINT, "70" BIGINT, "71" BIGINT, "72" BIGINT, "73" BIGINT, "74" BIGINT, "75" BIGINT, "76" BIGINT, "77" BIGINT, "78" BIGINT, "79" BIGINT, "80" BIGINT, "81" BIGINT, "82" BIGINT, "83" BIGINT, "84" BIGINT, "85" BIGINT, "86" BIGINT, "87" BIGINT, "88" BIGINT, "89" BIGINT, "90" BIGINT, "91" BIGINT, "92" BIGINT, "93" BIGINT, "94" BIGINT, "95" BIGINT, "96" BIGINT, "97" BIGINT, "98" BIGINT, "99" BIGINT, "100" BIGINT)
# Map inference with max_depth works as expected
query T
select distinct typeof(a) from read_json_auto('data/json/simple_map.jsonl', maximum_depth=2)
----
MAP(VARCHAR, JSON)
query T
select distinct typeof(a) from read_json_auto('data/json/simple_map.jsonl', maximum_depth=1)
----
JSON
# Map where all values are null
query T
select distinct typeof(a) from read_json_auto('data/json/map_of_nulls.jsonl')
----
MAP(VARCHAR, JSON)
# Map type can be inferred at the top level
query T
select distinct typeof(json) from read_json_auto('data/json/top_level_map.jsonl')
----
MAP(VARCHAR, BIGINT)
# Map type can be inferred for struct value type
query T
select distinct typeof(a) from read_json_auto('data/json/map_of_structs.jsonl')
----
MAP(VARCHAR, STRUCT(b BIGINT))
# Map 80% similarity check works
query T
select distinct typeof(a) from read_json_auto('data/json/map_50_50.jsonl', map_inference_threshold=10)
----
STRUCT(s1 STRUCT(f1 BIGINT[]), s2 STRUCT(f2 BIGINT[]), s3 STRUCT(f1 BIGINT[]), s4 STRUCT(f2 BIGINT[]), s5 STRUCT(f1 BIGINT[]), s6 STRUCT(f2 BIGINT[]), s7 STRUCT(f1 BIGINT[]), s8 STRUCT(f2 BIGINT[]), s9 STRUCT(f1 BIGINT[]), s10 STRUCT(f2 BIGINT[]))
# Map of maps
query T
select distinct typeof(a) from read_json_auto('data/json/map_of_map.jsonl', map_inference_threshold=10)
----
MAP(VARCHAR, MAP(VARCHAR, BIGINT))
# All NULL types get converted to JSON if we do map inference
query T
select distinct typeof(a) from read_json_auto('data/json/map_of_struct_with_nulls.jsonl', map_inference_threshold=10)
----
MAP(VARCHAR, STRUCT(a JSON[]))
# Candidate types are properly handled for map inference
query I
SELECT distinct typeof(a) FROM read_json_auto('data/json/map_of_dates.jsonl', map_inference_threshold=25)
----
MAP(VARCHAR, DATE)
# Mixed candidate types are also handled
query I
SELECT distinct typeof(a) FROM read_json_auto('data/json/map_of_mixed_date_timestamps.jsonl', map_inference_threshold=25)
----
MAP(VARCHAR, VARCHAR)
# Incompatible types are handled correctly
query T
select distinct typeof(a) from read_json_auto('data/json/map_incompatible.jsonl', map_inference_threshold=10)
----
STRUCT(s1 STRUCT("1" JSON), s2 STRUCT("1" MAP(VARCHAR, JSON)), s3 STRUCT("1" VARCHAR), s4 STRUCT("1" BIGINT[]), s5 STRUCT("1" BIGINT), s6 STRUCT("1" VARCHAR), s7 STRUCT("1" BIGINT[]), s8 STRUCT("1" BIGINT), s9 STRUCT("1" VARCHAR), s10 STRUCT("1" BIGINT[]))
# Can't set map_inference_threshold to a negative value (except -1)
statement error
select * from read_json_auto('data/json/simple_map.jsonl', map_inference_threshold=-10)
----
Binder Error: read_json_auto "map_inference_threshold" parameter must be 0 or positive, or -1 to disable map inference for consistent objects.
# if we only sample the first file, we default to a single JSON column
query I
select * from read_json_auto(['data/json/empty_array.json', 'data/json/example_n.ndjson'], maximum_sample_files=1);
----
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
# -1 is unlimited
query II
select * from read_json_auto(['data/json/empty_array.json', 'data/json/example_n.ndjson'], maximum_sample_files=-1);
----
1 O Brother, Where Art Thou?
2 Home for the Holidays
3 The Firm
4 Broadcast News
5 Raising Arizona
# can't be -2 or lower
statement error
select * from read_json_auto(['data/json/empty_array.json', 'data/json/example_n.ndjson'], maximum_sample_files=-2);
----
Binder Error
# can't be 0
statement error
select * from read_json_auto(['data/json/empty_array.json', 'data/json/example_n.ndjson'], maximum_sample_files=0);
----
Binder Error
# cannot be NULL either
statement error
select * from read_json_auto(['data/json/empty_array.json', 'data/json/example_n.ndjson'], maximum_sample_files=NULL);
----
Binder Error


@@ -0,0 +1,130 @@
# name: test/sql/json/table/read_json_dates.test
# description: Read json files - date detection
# group: [table]
require json
statement ok
pragma enable_verification
# issue #6774
query I
select * from read_json_auto('data/json/simple_timestamp.json', columns={"ts": "TIMESTAMP[]"});
----
['2022-06-01 06:41:58', '2021-08-21 08:26:55.5', '2009-11-15 21:58:54.636']
# create date and timestamp tables
statement ok
create table date_test as select '1996/03/27'::DATE d
statement ok
create table timestamp_test as select '1996-03-27 07:42:33'::TIMESTAMP t
# cannot be empty
statement error
copy (select d from date_test) to '__TEST_DIR__/my_file.json' (dateformat)
----
Binder Error
statement error
copy (select d from date_test) to '__TEST_DIR__/my_file.json' (timestampformat)
----
Binder Error
statement error
copy date_test from 'data/json/simple_timestamp.json' (dateformat)
----
Binder Error
statement error
copy date_test from 'data/json/simple_timestamp.json' (timestampformat)
----
Binder Error
# test all supported date formats
foreach date_format '%m-%d-%Y' '%m-%d-%y' '%d-%m-%Y' '%d-%m-%y' '%Y-%m-%d' '%y-%m-%d'
statement ok
copy (select d from date_test) to '__TEST_DIR__/my_file.json' (dateformat ${date_format})
# auto-detect
query II
select typeof(d), d from '__TEST_DIR__/my_file.json'
----
DATE 1996-03-27
# forced format read_ndjson
query II
select typeof(d), d from read_ndjson('__TEST_DIR__/my_file.json', columns={d: 'DATE'}, dateformat=${date_format})
----
DATE 1996-03-27
# wrong format read_ndjson
statement error
select typeof(d), d from read_ndjson('__TEST_DIR__/my_file.json', columns={d: 'DATE'}, dateformat='%d-%Y-%m')
----
Invalid Input Error
# forced format COPY
statement ok
drop table if exists date_copy_test
statement ok
create table date_copy_test (d date)
statement ok
copy date_copy_test from '__TEST_DIR__/my_file.json' (dateformat ${date_format})
query II
select typeof(d), d from date_copy_test
----
DATE 1996-03-27
endloop
# test all supported timestamp formats (hacky way to do foreach parameters that need spaces in them)
foreach a,b,c '%Y-%m-%d,%H:%M:%S.%f,' '%m-%d-%Y,%I:%M:%S,%p' '%m-%d-%y,%I:%M:%S,%p' '%d-%m-%Y,%H:%M:%S,' '%d-%m-%y,%H:%M:%S,' '%Y-%m-%d,%H:%M:%S,' '%y-%m-%d,%H:%M:%S,'
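# for example, the second entry splits into a='%m-%d-%Y, b=%I:%M:%S, c=%p' (the quotes end up in a and c),
# so "timestampformat ${a} ${b} ${c}" below expands to: timestampformat '%m-%d-%Y %I:%M:%S %p'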
statement ok
copy (select t from timestamp_test) to '__TEST_DIR__/my_file.json' (format json, timestampformat ${a} ${b} ${c})
# auto-detect
query II
select typeof(t), t from '__TEST_DIR__/my_file.json'
----
TIMESTAMP 1996-03-27 07:42:33
# forced format read_ndjson
query II
select typeof(t), t from read_ndjson('__TEST_DIR__/my_file.json', columns={t: 'TIMESTAMP'}, timestamp_format=${a} ${b} ${c})
----
TIMESTAMP 1996-03-27 07:42:33
# wrong format read_ndjson
statement error
select typeof(t), t from read_ndjson('__TEST_DIR__/my_file.json', columns={t: 'TIMESTAMP'}, timestamp_format='%H:%M:%S%y-%m-%d')
----
Invalid Input Error
# forced format COPY
statement ok
drop table if exists timestamp_copy_test
statement ok
create table timestamp_copy_test (t timestamp)
statement ok
copy timestamp_copy_test from '__TEST_DIR__/my_file.json' (format json, timestampformat ${a} ${b} ${c})
query II
select typeof(t), t from timestamp_copy_test
----
TIMESTAMP 1996-03-27 07:42:33
endloop
# test this format too
query II
select typeof(createdAt), createdAt from 'data/json/timestamp_example.json'
----
TIMESTAMP 2023-02-07 19:12:28


@@ -0,0 +1,33 @@
# name: test/sql/json/table/read_json_many_files.test_slow
# description: Read > 1000 json files (issue #6249)
# group: [table]
require json
statement ok
create table input as select range as a from range(1, 4);
loop i 0 2000
statement ok
copy input to '__TEST_DIR__/input${i}.json';
endloop
query T
select count(*) from read_json_auto('__TEST_DIR__/input*.json');
----
6000
# also test gzipped (issue #6588)
loop i 0 2000
statement ok
copy input to '__TEST_DIR__/input${i}.json.gz' (COMPRESSION GZIP);
endloop
query T
select count(*) from read_json_auto('__TEST_DIR__/input*.json.gz');
----
6000


@@ -0,0 +1,231 @@
# name: test/sql/json/table/read_json_objects.test
# description: Read ndjson files
# group: [table]
require json
# we cannot check the error output for the specific byte, because on Windows \n is replaced with \r\n,
# so the byte count is different; therefore, we cut off the error message here
statement error
select * from read_json_objects('data/json/unterminated_quotes.ndjson')
----
Invalid Input Error: Malformed JSON
# now it should work!
query I
SELECT * FROM read_csv('data/json/example_n.ndjson', columns={'json': 'JSON'}, delim=NULL, header=0, quote=NULL, escape=NULL, auto_detect = false)
----
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
# example_n is with regular \n newlines
query I
SELECT * FROM read_ndjson_objects('data/json/example_n.ndjson')
----
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
# this one does not have the 'records' param
statement error
SELECT * FROM read_ndjson_objects('data/json/example_n.ndjson', records='false')
----
Binder Error: Invalid named parameter
query I
SELECT * FROM read_ndjson_objects('data/json/example_n.ndjson')
----
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
# we can auto-detect that it's newline-delimited
query I
SELECT * FROM read_json_objects('data/json/example_n.ndjson', format='auto')
----
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
# example_r is with \r newlines - works with unstructured
query I
SELECT * FROM read_json_objects('data/json/example_r.ndjson', format='unstructured')
----
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
# we can detect that it's not newline-delimited
query I
SELECT * FROM read_json_objects('data/json/example_r.ndjson', format='auto')
----
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
# \r newlines are NOT valid according to the ndjson spec - this does not work, the whole file is treated as a single line
statement error
SELECT * FROM read_ndjson_objects('data/json/example_r.ndjson')
----
Invalid Input Error: Malformed JSON in file "data/json/example_r.ndjson"
# example_rn is with \r\n newlines
query I
SELECT * FROM read_ndjson_objects('data/json/example_rn.ndjson')
----
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
query I
SELECT * FROM read_ndjson_objects('data/json/example_rn.ndjson')
----
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
# same but gzipped
query I
SELECT * FROM read_ndjson_objects('data/json/example_rn.ndjson.gz')
----
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
query I
SELECT * FROM read_json_objects('data/json/example_rn.ndjson.gz', format='nd')
----
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
# multi-file scan
query I
SELECT count(*) from read_json_objects(['data/json/example_n.ndjson', 'data/json/example_r.ndjson', 'data/json/example_rn.ndjson'], format='auto')
----
15
query I
SELECT count(*) from read_ndjson_objects(['data/json/example_n.ndjson', 'data/json/example_rn.ndjson'])
----
10
# globbing
query I
SELECT count(*) from read_json_objects('data/json/example_*.ndjson', format='auto')
----
15
query I
SELECT count(*) from read_ndjson_objects('data/json/example_*n.ndjson')
----
10
# empty file
query I
select * from read_json_objects('data/json/empty.ndjson')
----
query I
select * from read_ndjson_objects('data/json/empty.ndjson')
----
# invalid json stuff
statement error
select * from read_json_objects('data/json/unterminated_quotes.ndjson', format='nd')
----
Invalid Input Error: Malformed JSON in file "data/json/unterminated_quotes.ndjson"
statement error
select * from read_ndjson_objects('data/json/unterminated_quotes.ndjson')
----
Invalid Input Error: Malformed JSON in file "data/json/unterminated_quotes.ndjson"
# we can auto-detect and ignore the error (becomes NULL)
query I
select * from read_json_objects('data/json/unterminated_quotes.ndjson', format='auto', ignore_errors=true)
----
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
NULL
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
# multiple values per line (works for read_json_objects)
query I
select * from read_json_objects('data/json/multiple_objects_per_line.ndjson', format='unstructured')
----
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
# does not work for read_ndjson_objects
statement error
select * from read_ndjson_objects('data/json/multiple_objects_per_line.ndjson')
----
Invalid Input Error: Malformed JSON in file "data/json/multiple_objects_per_line.ndjson"
# what if we try to read a CSV?
statement error
select * from read_json_objects('data/csv/tpcds_14.csv')
----
Invalid Input Error: Malformed JSON
statement error
select * from read_ndjson_objects('data/csv/tpcds_14.csv')
----
Invalid Input Error: Malformed JSON in file "data/csv/tpcds_14.csv"
# how about parquet?
statement error
select * from read_json_objects('data/parquet-testing/blob.parquet')
----
Invalid Input Error: Malformed JSON
statement error
select * from read_ndjson_objects('data/parquet-testing/blob.parquet')
----
Invalid Input Error: Malformed JSON in file "data/parquet-testing/blob.parquet"
# we can also read the objects from a JSON array (not newline-delimited)
query I
select * from read_json_objects('data/json/top_level_array.json')
----
{"conclusion":"cancelled"}
{"conclusion":"cancelled"}
# and auto-detect it
query I
select * from read_json_objects('data/json/top_level_array.json', format='auto')
----
{"conclusion":"cancelled"}
{"conclusion":"cancelled"}
# the file only has one line, so if we read this as ndjson, we just get the array
query I
select * from read_json_objects('data/json/top_level_array.json', format='nd')
----
[{"conclusion":"cancelled"}, {"conclusion":"cancelled"}]

View File

@@ -0,0 +1,35 @@
# name: test/sql/json/table/read_json_union.test
# description: Read json files with unions straight to columnar data
# group: [table]
require json
statement ok
pragma enable_verification
query I
SELECT data FROM read_ndjson('data/json/union.ndjson', columns={data: 'UNION(name VARCHAR, age INT, veteran BOOL)'})
----
Frank
5
false
statement error
SELECT * FROM read_ndjson('data/json/malformed/union/bad_key.ndjson', columns={data: 'UNION(name VARCHAR, age INT, veteran BOOL)'})
----
Found object containing unknown key, instead of union
statement error
SELECT * FROM read_ndjson('data/json/malformed/union/empty_object.ndjson', columns={data: 'UNION(name VARCHAR, age INT, veteran BOOL)'})
----
Found empty object, instead of union
statement error
SELECT * FROM read_ndjson('data/json/malformed/union/non_object.ndjson', columns={data: 'UNION(name VARCHAR, age INT, veteran BOOL)'})
----
Expected an object representing a union, got uint
statement error
SELECT * FROM read_ndjson('data/json/malformed/union/too_many_keys.ndjson', columns={data: 'UNION(name VARCHAR, age INT, veteran BOOL)'})
----
Found object containing more than one key, instead of union


@@ -0,0 +1,51 @@
# name: test/sql/json/table/test_json_nested_struct_projection_pushdown.test
# description: Test JSON struct projection pushdown on deeply nested data
# group: [table]
require json
statement ok
pragma enable_verification
statement ok
COPY (SELECT {goose: 42, pigeon: 4.2, nested_duck: {nested_nested_duck: {goose: 42, pigeon: 4.2, nested_nested_nested_duck: {goose: 42, pigeon: 4.2}}, goose: 42, pigeon: 4.2}} AS duck) TO '__TEST_DIR__/nested.json'
query I
SELECT duck.goose FROM '__TEST_DIR__/nested.json'
----
42
query I
SELECT json.duck.goose FROM read_json('__TEST_DIR__/nested.json', records=false)
----
42
query I
SELECT duck.nested_duck.goose FROM '__TEST_DIR__/nested.json'
----
42
query I
SELECT json.duck.nested_duck.goose FROM read_json('__TEST_DIR__/nested.json', records=false)
----
42
query I
SELECT duck.nested_duck.nested_nested_duck.goose FROM '__TEST_DIR__/nested.json'
----
42
query I
SELECT json.duck.nested_duck.nested_nested_duck.goose FROM read_json('__TEST_DIR__/nested.json', records=false)
----
42
query I
SELECT duck.nested_duck.nested_nested_duck.nested_nested_nested_duck.goose FROM '__TEST_DIR__/nested.json'
----
42
query I
SELECT json.duck.nested_duck.nested_nested_duck.nested_nested_nested_duck.goose FROM read_json('__TEST_DIR__/nested.json', records=false)
----
42


@@ -0,0 +1,748 @@
# name: test/sql/json/table/test_json_table_in_out.test_slow
# description: Test JSON table in/out functions (json_each/json_tree)
# group: [table]
require json
statement ok
pragma enable_verification
# some larger-than-vector-size tests
query I
select count(*) from json_each(range(3000));
----
3000
# this should be equal to the 3000th triangular number
query I
select count(*) = 3000*(3000+1)//2 from range(1, 3001), json_each(range(range));
----
true
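# (for each r in 1..3000, json_each(range(r)) yields r rows, so the total is 1 + 2 + ... + 3000 = 3000*3001/2 = 4501500)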
##### SQLITE TESTS #####
### JSON101-5 ###
statement ok
CREATE OR REPLACE TABLE j2(id INTEGER PRIMARY KEY, json JSON, src VARCHAR);
statement ok
INSERT INTO j2(id,json,src)
VALUES(1,'{
"firstName": "John",
"lastName": "Smith",
"isAlive": true,
"age": 25,
"address": {
"streetAddress": "21 2nd Street",
"city": "New York",
"state": "NY",
"postalCode": "10021-3100"
},
"phoneNumbers": [
{
"type": "home",
"number": "212 555-1234"
},
{
"type": "office",
"number": "646 555-4567"
}
],
"children": [],
"spouse": null
}','https://en.wikipedia.org/wiki/JSON');
statement ok
INSERT INTO j2(id,json,src)
VALUES(2, '{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil''s Food" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}','https://adobe.github.io/Spry/samples/data_region/JSONDataSetSample.html');
statement ok
INSERT INTO j2(id,json,src)
VALUES(3,'[
{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil''s Food" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
},
{
"id": "0002",
"type": "donut",
"name": "Raised",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
},
{
"id": "0003",
"type": "donut",
"name": "Old Fashioned",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}
]','https://adobe.github.io/Spry/samples/data_region/JSONDataSetSample.html');
query I
select count(*) from j2, json_tree(j2.json);
----
154
query IIIII
SELECT j2.rowid, jx.rowid, fullkey, path, key
FROM j2, json_tree(j2.json) AS jx
WHERE fullkey!=(path || CASE WHEN TRY_CAST(key AS UBIGINT) IS NOT NULL THEN '['||key||']'
ELSE '.'||key END);
----
query IIIII
SELECT j2.rowid, jx.rowid, fullkey, path, key
FROM j2, json_each(j2.json) AS jx
WHERE fullkey!=(path || CASE WHEN TRY_CAST(key AS UBIGINT) IS NOT NULL THEN '['||key||']'
ELSE '.'||key END);
----
query IIIII
SELECT j2.rowid, jx.rowid, fullkey, path, key
FROM j2, json_each(j2.json) AS jx
WHERE jx.json<>j2.json;
----
query IIIII
SELECT j2.rowid, jx.rowid, fullkey, path, key
FROM j2, json_tree(j2.json) AS jx
WHERE jx.json<>j2.json;
----
query IIIII
SELECT j2.rowid, jx.rowid, fullkey, path, key
FROM j2, json_each(j2.json) AS jx
WHERE jx.value<>jx.atom AND type NOT IN ('ARRAY','OBJECT');
----
query IIIII
SELECT j2.rowid, jx.rowid, fullkey, path, key
FROM j2, json_tree(j2.json) AS jx
WHERE jx.value<>jx.atom AND type NOT IN ('ARRAY','OBJECT');
----
### JSON101-13 ###
statement ok
DROP TABLE IF EXISTS t1;
statement ok
DROP TABLE IF EXISTS t2;
statement ok
CREATE OR REPLACE TABLE t1(id INTEGER, json JSON);
statement ok
INSERT INTO t1(id,json) VALUES(1,'{"items":[3,5]}');
statement ok
CREATE OR REPLACE TABLE t2(id INTEGER, json JSON);
statement ok
INSERT INTO t2(id,json) VALUES(2,'{"value":2}');
statement ok
INSERT INTO t2(id,json) VALUES(3,'{"value":3}');
statement ok
INSERT INTO t2(id,json) VALUES(4,'{"value":4}');
statement ok
INSERT INTO t2(id,json) VALUES(5,'{"value":5}');
statement ok
INSERT INTO t2(id,json) VALUES(6,'{"value":6}');
query I
select count(*) from t2, json_each(t2.json) je;
----
5
query I
select je.rowid from t2, json_each(t2.json) je;
----
0
0
0
0
0
# our result here differs from SQLite because our correlated subquery behavior is different
query IIII rowsort
SELECT * FROM t1 CROSS JOIN t2
WHERE EXISTS(SELECT 1 FROM json_each(t1.json,'$.items') AS Z
WHERE Z.value==t2.id);
----
1 {"items":[3,5]} 3 {"value":3}
1 {"items":[3,5]} 5 {"value":5}
query IIII rowsort
SELECT * FROM t2 CROSS JOIN t1
WHERE EXISTS(SELECT 1 FROM json_each(t1.json,'$.items') AS Z
WHERE Z.value==t2.id);
----
3 {"value":3} 1 {"items":[3,5]}
5 {"value":5} 1 {"items":[3,5]}
### JSON101-14 ###
query I
SELECT fullkey FROM json_each('123');
----
$
query I
SELECT fullkey FROM json_each('123.56');
----
$
query I
SELECT fullkey FROM json_each('"hello"');
----
$
query I
SELECT fullkey FROM json_each('null');
----
$
query I
SELECT fullkey FROM json_tree('123');
----
$
query I
SELECT fullkey FROM json_tree('123.56');
----
$
query I
SELECT fullkey FROM json_tree('"hello"');
----
$
query I
SELECT fullkey FROM json_tree('null');
----
$
### JSON101-15 ###
query IIIIIIII
SELECT * FROM JSON_EACH('{"a":1, "b":2}');
----
a 1 UBIGINT 1 2 NULL $.a $
b 2 UBIGINT 2 4 NULL $.b $
query IIIIIIII
SELECT xyz.* FROM JSON_EACH('{"a":1, "b":2}') AS xyz;
----
a 1 UBIGINT 1 2 NULL $.a $
b 2 UBIGINT 2 4 NULL $.b $
query IIIIIIII
SELECT * FROM (FROM JSON_EACH('{"a":1, "b":2}'));
----
a 1 UBIGINT 1 2 NULL $.a $
b 2 UBIGINT 2 4 NULL $.b $
query IIIIIIII
SELECT xyz.* FROM (FROM JSON_EACH('{"a":1, "b":2}')) AS xyz;
----
a 1 UBIGINT 1 2 NULL $.a $
b 2 UBIGINT 2 4 NULL $.b $
### JSON101-17 ###
query I
SELECT count(*) FROM json_each(NULL);
----
0
query I
SELECT count(*) FROM json_tree(NULL);
----
0
### JSON102-1000 ###
statement ok
CREATE OR REPLACE TABLE user(name VARCHAR,phone JSON);
statement ok
INSERT INTO user(name,phone) VALUES
('Alice','["919-555-2345","804-555-3621"]'),
('Bob','["201-555-8872"]'),
('Cindy','["704-555-9983"]'),
('Dave','["336-555-8421","704-555-4321","803-911-4421"]');
query I rowsort
SELECT DISTINCT user.name
FROM user, json_each(user.phone)
WHERE json_each.value LIKE '"704-%'
ORDER BY 1;
----
Cindy
Dave
statement ok
UPDATE user
SET phone=json_extract(phone,'$[0]')
WHERE json_array_length(phone)<2;
query II rowsort
SELECT name, substr(phone,1,5) FROM user ORDER BY name;
----
Alice ["919
Bob "201-
Cindy "704-
Dave ["336
query I rowsort
SELECT name FROM user WHERE phone LIKE '"704-%'
UNION
SELECT user.name
FROM user, json_each(user.phone)
WHERE json_valid(user.phone)
AND json_each.value LIKE '"704-%';
----
Cindy
Dave
### JSON102-1010 ###
statement ok
CREATE OR REPLACE TABLE big(json JSON);
statement ok
INSERT INTO big(json) VALUES('{
"id":123,
"stuff":[1,2,3,4],
"partlist":[
{"uuid":"bb108722-572e-11e5-9320-7f3b63a4ca74"},
{"uuid":"c690dc14-572e-11e5-95f9-dfc8861fd535"},
{"subassembly":[
{"uuid":"6fa5181e-5721-11e5-a04e-57f3d7b32808"}
]}
]
}');
statement ok
INSERT INTO big(json) VALUES('{
"id":456,
"stuff":["hello","world","xyzzy"],
"partlist":[
{"uuid":false},
{"uuid":"c690dc14-572e-11e5-95f9-dfc8861fd535"}
]
}');
query III nosort q0
SELECT big.rowid, fullkey, value
FROM big, json_tree(big.json)
WHERE json_tree.type NOT IN ('OBJECT','ARRAY')
ORDER BY +big.rowid, +json_tree.id;
----
0 $.id 123
0 $stuff[0] 1
0 $stuff[1] 2
0 $stuff[2] 3
0 $stuff[3] 4
0 $partlist.uuid "bb108722-572e-11e5-9320-7f3b63a4ca74"
0 $partlist.uuid "c690dc14-572e-11e5-95f9-dfc8861fd535"
0 $partlistsubassembly.uuid "6fa5181e-5721-11e5-a04e-57f3d7b32808"
1 $.id 456
1 $stuff[0] "hello"
1 $stuff[1] "world"
1 $stuff[2] "xyzzy"
1 $partlist.uuid false
1 $partlist.uuid "c690dc14-572e-11e5-95f9-dfc8861fd535"
query III nosort q0
SELECT big.rowid, fullkey, atom
FROM big, json_tree(big.json)
WHERE atom IS NOT NULL
ORDER BY +big.rowid, +json_tree.id
----
query I
SELECT DISTINCT json_extract(big.json,'$.id')
FROM big, json_tree(big.json,'$.partlist')
WHERE json_tree.key='uuid'
AND json_tree.value='"6fa5181e-5721-11e5-a04e-57f3d7b32808"';
----
123
query I
SELECT DISTINCT json_extract(big.json,'$.id')
FROM big, json_tree(big.json,'$')
WHERE json_tree.key='uuid'
AND json_tree.value='"6fa5181e-5721-11e5-a04e-57f3d7b32808"';
----
123
query I
SELECT DISTINCT json_extract(big.json,'$.id')
FROM big, json_tree(big.json)
WHERE json_tree.key='uuid'
AND json_tree.value='"6fa5181e-5721-11e5-a04e-57f3d7b32808"';
----
123
### JSON107 ###
query II
SELECT key, value FROM json_tree('{"a":123,"b":456}')
WHERE atom;
----
a 123
b 456
### JSON502 ###
statement ok
CREATE OR REPLACE TABLE t1(x JSON);
statement ok
INSERT INTO t1(x) VALUES('{"a":{"b":{"c":"hello",},},}');
query I
SELECT fullkey FROM t1, json_tree(x) order by json_tree.rowid;
----
$
$.a
$.a.b
$.a.b.c
### JOIN-23 ###
statement ok
CREATE OR REPLACE TABLE a(value TEXT);
statement ok
INSERT INTO a(value) SELECT value FROM json_each('["a", "b", null]');
statement ok
CREATE OR REPLACE TABLE b(value TEXT);
statement ok
INSERT INTO b(value) SELECT value FROM json_each('["a", "c", null]');
query II rowsort q1
SELECT a.value, b.value FROM a RIGHT JOIN b ON a.value = b.value;
----
"a" "a"
null null
NULL "c"
query II rowsort q1
SELECT a.value, b.value FROM b LEFT JOIN a ON a.value = b.value;
----
query II rowsort q1
SELECT a.value, b.value
FROM json_each('["a", "c", null]') AS b
LEFT JOIN
json_each('["a", "b", null]') AS a ON a.value = b.value;
----
query II rowsort q1
SELECT a.value, b.value
FROM json_each('["a", "b", null]') AS a
RIGHT JOIN
json_each('["a", "c", null]') AS b ON a.value = b.value;
----
query II rowsort q1
SELECT a.value, b.value
FROM json_each('["a", "b", null]') AS a
RIGHT JOIN
b ON a.value = b.value;
----
query II rowsort q1
SELECT a.value, b.value
FROM a
RIGHT JOIN
json_each('["a", "c", null]') AS b ON a.value = b.value;
----
### JOIN8-6000 ###
statement ok
CREATE OR REPLACE TABLE t1(a INTEGER PRIMARY KEY, b TEXT, c TEXT, d REAL);
statement ok
INSERT INTO t1 VALUES(0,'A','aa',2.5);
query IIII
SELECT * FROM t1 AS t2 NATURAL RIGHT JOIN t1 AS t3
WHERE (a,b) IN (SELECT rowid, b FROM t1);
----
0 A aa 2.5
statement ok
DROP TABLE IF EXISTS t1;
statement ok
CREATE OR REPLACE TABLE t1(a INT PRIMARY KEY, b TEXT, c TEXT, d INT);
statement ok
INSERT INTO t1 VALUES(15,'xray','baker',42);
query IIIII
SELECT value, t1.* FROM json_each('7') RIGHT JOIN t1 USING (rowid)
WHERE (a,b) IN (SELECT a, b FROM t1);
----
7 15 xray baker 42
statement ok
DROP TABLE IF EXISTS t1;
statement ok
CREATE OR REPLACE TABLE t1(a INTEGER PRIMARY KEY,b INTEGER);
statement ok
INSERT INTO t1 VALUES(0,NULL),(1,2);
query III
SELECT value, t1.* FROM json_each('null') RIGHT JOIN t1 USING (rowid)
WHERE (a,b) IN (SELECT rowid, b FROM t1);
----
NULL 1 2
statement ok
CREATE OR REPLACE TABLE a(key TEXT);
statement ok
INSERT INTO a(key) VALUES('a'),('b');
query II
SELECT to_json(a.key), b.value
FROM a RIGHT JOIN json_each('["a","c"]') AS b ON to_json(a.key)=b.value;
----
"a" "a"
NULL "c"
### WindowB-11 ###
query I
SELECT value FROM json_each('[1,2,3,4,5]');
----
1
2
3
4
5
query II
SELECT key, value FROM json_each('[1,2,3,4,5]');
----
0 1
1 2
2 3
3 4
4 5
query II
SELECT rowid, value FROM json_each('[1,2,3,4,5]');
----
0 1
1 2
2 3
3 4
4 5
query I
SELECT sum(value::int) OVER (ORDER BY rowid) FROM json_each('[1,2,3,4,5]')
----
1
3
6
10
15
query I
SELECT sum(value::int) OVER (
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) FROM json_each('[1,2,3,4,5]')
----
1
3
6
10
15
query I
SELECT sum(value::int) OVER (ORDER BY rowid DESC) FROM json_each('[1,2,3,4,5]')
----
5
9
12
14
15
query I
SELECT sum(value::int) OVER (ORDER BY value ASC) FROM json_each('[2,1,4,3,5]')
----
1
3
6
10
15
### WhereF-6 ###
statement ok
CREATE OR REPLACE TABLE t6(x JSON);
query I
SELECT * FROM t6 WHERE 1 IN (SELECT value FROM json_each(x));
----
statement ok
DROP TABLE t6;
statement ok
CREATE OR REPLACE TABLE t6(a int,b int,c json);
statement ok
INSERT INTO t6 VALUES
(0,null,'{"a":0,"b":[3,4,5],"c":{"x":4.5,"y":7.8}}'),
(1,null,'{"a":1,"b":[3,4,5],"c":{"x":4.5,"y":7.8}}'),
(2,null,'{"a":9,"b":[3,4,5],"c":{"x":4.5,"y":7.8}}');
query III
SELECT * FROM t6
WHERE (EXISTS (SELECT 1 FROM json_each(t6.c) AS x WHERE x.type = 'UBIGINT' AND x.value=1));
----
1 NULL {"a":1,"b":[3,4,5],"c":{"x":4.5,"y":7.8}}
# Another test case derived from a posting by Wout Mertens on the
# sqlite-users mailing list on 2017-10-04.
statement ok
DROP TABLE IF EXISTS t;
statement ok
CREATE OR REPLACE TABLE t(json JSON);
query I
SELECT * FROM t
WHERE(EXISTS(SELECT 1 FROM json_each(t.json,'$.foo') j
WHERE j.value = 'meep'));
----
statement ok
INSERT INTO t VALUES('{"xyzzy":null}');
statement ok
INSERT INTO t VALUES('{"foo":"meep","other":12345}');
statement ok
INSERT INTO t VALUES('{"foo":"bingo","alt":5.25}');
query I
SELECT * FROM t
WHERE(EXISTS(SELECT 1 FROM json_each(t.json,'$.foo') j
WHERE j.value = '"meep"'));
----
{"foo":"meep","other":12345}
# internal issue 5080
statement ok
create table json_table as
select '{"my_array":[{"my_key":42},{"my_key":9001}]}' as my_json;
query II
select fullkey, path from json_table, json_tree(json_table.my_json) order by json_tree.rowid;
----
$ $
$.my_array $
$.my_array[0] $.my_array
$.my_array[0].my_key $.my_array[0]
$.my_array[1] $.my_array
$.my_array[1].my_key $.my_array[1]
# internal issues 5772 and 5776
statement ok
create table all_types as select * exclude(small_enum, medium_enum, large_enum) from test_all_types() limit 0;
statement ok
SELECT NULL FROM json_each(6051, NULL)