# name: test/sql/json/table/read_json.test
# description: Read json files straight to columnar data
# group: [table]

require json

statement ok
pragma enable_verification

statement error
SELECT * FROM read_json('data/json/example_n.ndjson', auto_detect=false)
----
Binder Error

# can't read an ndjson file with format='array'
statement error
SELECT * FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='array')
----
Invalid Input Error: Expected top-level JSON array

# read_ndjson works
query II
SELECT * FROM read_ndjson('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'})
----
1	O Brother, Where Art Thou?
2	Home for the Holidays
3	The Firm
4	Broadcast News
5	Raising Arizona

# we can also read just one of the columns
query I
SELECT * FROM read_ndjson('data/json/example_n.ndjson', columns={id: 'INTEGER'})
----
1
2
3
4
5

query I
SELECT * FROM read_ndjson('data/json/example_n.ndjson', columns={name: 'VARCHAR'})
----
O Brother, Where Art Thou?
Home for the Holidays
The Firm
Broadcast News
Raising Arizona

# what about a broken JSON file?
query II
SELECT * FROM read_ndjson('data/json/unterminated_quotes.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, ignore_errors=true)
----
1	O Brother, Where Art Thou?
2	Home for the Holidays
NULL	NULL
4	Broadcast News
5	Raising Arizona

# some of these values don't have "name"
query II
SELECT * FROM read_ndjson('data/json/different_schemas.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'})
----
1	O Brother, Where Art Thou?
2	NULL
3	The Firm
4	NULL
5	Raising Arizona

# test projection pushdown (unstructured JSON)
query I
SELECT id FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='unstructured')
----
1
2
3
4
5

query I
SELECT name FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='unstructured')
----
O Brother, Where Art Thou?
Home for the Holidays
The Firm
Broadcast News
Raising Arizona

# test projection pushdown (newline-delimited JSON)
query I
SELECT id FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='newline_delimited')
----
1
2
3
4
5

query I
SELECT name FROM read_ndjson('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='nd')
----
O Brother, Where Art Thou?
Home for the Holidays
The Firm
Broadcast News
Raising Arizona

# auto-detect
query II
SELECT * FROM read_json_auto('data/json/example_n.ndjson')
----
1	O Brother, Where Art Thou?
2	Home for the Holidays
3	The Firm
4	Broadcast News
5	Raising Arizona

query II
SELECT * FROM 'data/json/example_n.ndjson'
----
1	O Brother, Where Art Thou?
2	Home for the Holidays
3	The Firm
4	Broadcast News
5	Raising Arizona
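
# Note: read_ndjson is shorthand for read_json with format='newline_delimited',
# and 'nd' is accepted as an alias for 'newline_delimited', so the invocations
# above are interchangeable for this file. An equivalent spelling (illustrative
# only, not an additional test case):
# SELECT * FROM read_json('data/json/example_n.ndjson', columns={id: 'INTEGER', name: 'VARCHAR'}, format='nd')
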
# we can detect at varying levels; at level 0 everything is just JSON
query I
SELECT * FROM read_json_auto('data/json/with_list.json', maximum_depth=0)
----
{"id":1,"name":["O","Brother,","Where","Art","Thou?"]}
{"id":2,"name":["Home","for","the","Holidays"]}
{"id":3,"name":["The","Firm"]}
{"id":4,"name":["Broadcast","News"]}
{"id":5,"name":["Raising","Arizona"]}

# at level 1 we get JSON and JSON
query II
SELECT * FROM read_json_auto('data/json/with_list.json', maximum_depth=1)
----
1	["O","Brother,","Where","Art","Thou?"]
2	["Home","for","the","Holidays"]
3	["The","Firm"]
4	["Broadcast","News"]
5	["Raising","Arizona"]

# at level 2 we get BIGINT and JSON[]
query II
SELECT * FROM read_json_auto('data/json/with_list.json', maximum_depth=2)
----
1	["O", "Brother,", "Where", "Art", "Thou?"]
2	["Home", "for", "the", "Holidays"]
3	["The", "Firm"]
4	["Broadcast", "News"]
5	["Raising", "Arizona"]

# at level 3 it's fully detected, and we get BIGINT and VARCHAR[]
query II
SELECT * FROM read_json_auto('data/json/with_list.json', maximum_depth=3)
----
1	[O, 'Brother,', Where, Art, Thou?]
2	[Home, for, the, Holidays]
3	[The, Firm]
4	[Broadcast, News]
5	[Raising, Arizona]

# we can detect lists too
query III
SELECT id, typeof(name), unnest(name) FROM 'data/json/with_list.json'
----
1	VARCHAR[]	O
1	VARCHAR[]	Brother,
1	VARCHAR[]	Where
1	VARCHAR[]	Art
1	VARCHAR[]	Thou?
2	VARCHAR[]	Home
2	VARCHAR[]	for
2	VARCHAR[]	the
2	VARCHAR[]	Holidays
3	VARCHAR[]	The
3	VARCHAR[]	Firm
4	VARCHAR[]	Broadcast
4	VARCHAR[]	News
5	VARCHAR[]	Raising
5	VARCHAR[]	Arizona

# with depth 2 we don't bother detecting inside the list - it defaults to JSON
query III
SELECT id, typeof(name), unnest(name) FROM read_json_auto('data/json/with_list.json', maximum_depth=2)
----
1	JSON[]	"O"
1	JSON[]	"Brother,"
1	JSON[]	"Where"
1	JSON[]	"Art"
1	JSON[]	"Thou?"
2	JSON[]	"Home"
2	JSON[]	"for"
2	JSON[]	"the"
2	JSON[]	"Holidays"
3	JSON[]	"The"
3	JSON[]	"Firm"
4	JSON[]	"Broadcast"
4	JSON[]	"News"
5	JSON[]	"Raising"
5	JSON[]	"Arizona"

# with depth 1 we don't bother detecting anything, and everything defaults to JSON (even the "id" column in this case)
query II
SELECT typeof(id), typeof(name) FROM read_json_auto('data/json/with_list.json', maximum_depth=1)
----
JSON	JSON
JSON	JSON
JSON	JSON
JSON	JSON
JSON	JSON
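
# To recap the maximum_depth behavior exercised above: depth 0 keeps each record
# as a single JSON value, depth 1 detects the columns but leaves both typed as
# JSON, depth 2 also resolves "id" to BIGINT while the list elements stay JSON[],
# and depth 3 fully resolves "name" to VARCHAR[].
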
# we can detect UUIDs
query II
SELECT id, typeof(id) FROM 'data/json/with_uuid.json'
----
bbd05ae7-76e5-4f1a-a31f-247408251fc9	UUID
d5c52052-5f8e-473f-bc8d-176342643ef5	UUID
3b6a6de3-0732-4591-93ed-8df6091eb00d	UUID
ae24e69e-e0bf-4e85-9848-27d35df85b8b	UUID
63928b16-1814-436f-8b30-b3c40cc31d51	UUID

# top-level array of values
query I
select * from read_json('data/json/top_level_array.json', columns={conclusion: 'VARCHAR'})
----
cancelled
cancelled

query I
select * from read_json('data/json/top_level_array.json', auto_detect=true)
----
cancelled
cancelled

# if we try to read it as 'unstructured' records, we get an error
statement error
select * from read_json('data/json/top_level_array.json', columns={conclusion: 'VARCHAR'}, format='unstructured', records=true)
----
Invalid Input Error: JSON transform error in file "data/json/top_level_array.json", in record/value 1: Expected OBJECT, but got ARRAY

# if we try to read an ndjson file as if it is an array of values, we also get an error
statement error
select * from read_json_auto('data/json/example_n.ndjson', format='array')
----
Invalid Input Error: Expected top-level JSON array

# test that we can read a list longer than STANDARD_VECTOR_SIZE properly
statement ok
copy (select 42 duck from range(10000)) to '__TEST_DIR__/my_file.json' (array true)

query T
select count(*) from read_json('__TEST_DIR__/my_file.json', columns={duck: 'INTEGER'}, format='array')
----
10000

query T
select sum(duck) = 42*10000 from read_json('__TEST_DIR__/my_file.json', columns={duck: 'INTEGER'}, format='array')
----
true

# read_json_auto also understands the 'array' format
query T
select count(*) from '__TEST_DIR__/my_file.json'
----
10000

query T
select sum(duck) = 42*10000 from '__TEST_DIR__/my_file.json'
----
true

# what if we do an array of non-records?
statement ok
copy (select list(range) from range(10)) to '__TEST_DIR__/my_file.json' (format csv, quote '', HEADER 0)

query T
select * from '__TEST_DIR__/my_file.json'
----
0
1
2
3
4
5
6
7
8
9

# fails because it's not records
statement error
select * from read_json('__TEST_DIR__/my_file.json', format='array', columns={range: 'INTEGER'}, records=true)
----
Invalid Input Error: JSON transform error

# fails because it's not records
statement error
select * from read_json_auto('__TEST_DIR__/my_file.json', format='array', records=true)
----
Binder Error: json_read expected records

query T
select * from read_json('__TEST_DIR__/my_file.json', format='auto', records=false, auto_detect=true)
----
0
1
2
3
4
5
6
7
8
9

# if auto-detection is disabled, we need to supply columns
statement error
select * from read_json('__TEST_DIR__/my_file.json', format='auto', records='false', auto_detect=false)
----
Binder Error

# read as unstructured values, so we just get the array
query T
select * from read_json('__TEST_DIR__/my_file.json', format='unstructured', records='false', auto_detect=true)
----
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# array of non-records
query T
select * from read_json('__TEST_DIR__/my_file.json', format='array', records='false', auto_detect=true)
----
0
1
2
3
4
5
6
7
8
9

# also works with records='auto'
query T
select * from read_json('__TEST_DIR__/my_file.json', format='array', records='auto', auto_detect=true)
----
0
1
2
3
4
5
6
7
8
9
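
# To recap the records parameter: records=true requires each top-level value to be
# an OBJECT, records='false' reads each value into a single column, and records='auto'
# infers which of the two applies. Presumably the auto_detect=false failure above could
# be fixed by supplying a single column definition, e.g. (illustrative sketch only,
# 'val' is a made-up column name, not part of the original suite):
# select * from read_json('__TEST_DIR__/my_file.json', format='auto', records='false', auto_detect=false, columns={val: 'INTEGER'})
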
# lower the thread count so the next tests don't OOM on many-core machines
statement ok
SET threads=2

# issue 6646: this is not an array, but we try to read it as one
statement error
select json_structure(json ->> '$.metadata') as structure, from read_json('data/json/issue.json', format='array', columns={'json': 'JSON'}, maximum_object_size=104857600) limit 1;
----
Invalid Input Error: Expected top-level JSON array

# let's try a variation
statement error
select json_structure(json ->> '$.metadata') as structure, from read_json('data/json/issue.json', format='array', records='false', columns={'json': 'JSON'}, maximum_object_size=104857600) limit 1;
----
Invalid Input Error: Expected top-level JSON array

# we can parse it as unstructured values, and give it a different column name
query I
select json_structure(my_json ->> '$.metadata') as structure, from read_json('data/json/issue.json', format='unstructured', records='false', columns={'my_json': 'JSON'}, maximum_object_size=104857600) limit 1;
----
{"argv":["VARCHAR"],"dag":{"dag_size":"VARCHAR","tasks":{"load_oscar":{"status":"VARCHAR","type":"VARCHAR","upstream":"VARCHAR","products":{"nb":"VARCHAR"}},"load_weather":{"status":"VARCHAR","type":"VARCHAR","upstream":"VARCHAR","products":{"nb":"VARCHAR"}},"compress":{"status":"VARCHAR","type":"VARCHAR","upstream":{"load_oscar":"VARCHAR"},"products":{"nb":"VARCHAR"}}}}}

statement ok
pragma disable_verification

# test that we can read a JSON list that spans more than one buffer size
# the JSON is 55 bytes, and the minimum buffer size is 32MB
# let's do 50k to be safe
statement ok
copy (select 42 this_is_a_very_long_field_name_yes_very_much_so from range(50000)) to '__TEST_DIR__/my_file.json' (array true)

query T
select sum(this_is_a_very_long_field_name_yes_very_much_so) = 42 * 50000 from '__TEST_DIR__/my_file.json'
----
true

require httpfs

query II
select * from read_json_auto('https://github.com/duckdb/duckdb-data/releases/download/v1.0/example_rn.ndjson');
----
1	O Brother, Where Art Thou?
2	Home for the Holidays
3	The Firm
4	Broadcast News
5	Raising Arizona
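
# Note: the remote file above mirrors data/json/example_n.ndjson, so this verifies
# that auto-detection over httpfs matches the local results; it only runs when the
# httpfs extension is available.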