should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

View File

@@ -0,0 +1,33 @@
# name: test/sql/copy/csv/glob/copy_csv_glob.test
# description: Test globbing CSVs
# group: [glob]
# Checks that COPY ... FROM accepts a glob pattern as its source path.
# turn on enable_verification for the statements in this file
statement ok
PRAGMA enable_verification
# target table for the COPY below: a single DATE column
statement ok
CREATE TABLE dates(d DATE);
# simple globbing
# the source path uses the '?' and '*' wildcards in both the directory and the file name
statement ok
COPY dates FROM 'data/csv/glob/a?/*.csv' (AUTO_DETECT 1);
# the table is expected to hold nine dates after the COPY
query I
SELECT * FROM dates ORDER BY 1
----
2019-06-05
2019-06-15
2019-06-25
2019-07-05
2019-07-15
2019-07-25
2019-08-05
2019-08-15
2019-08-25
# COPY with a read_csv(...) call as the source is expected to fail:
# the asserted error is a syntax error at the path literal, not a "no files found" error
# NOTE(review): presumably COPY ... FROM does not take a table function call - confirm
statement error
COPY dates FROM read_csv('data/csv/glob/*/a*a.csv', auto_detect=1)
----
syntax error at or near "'data/csv/glob/*/a*a.csv'"

View File

@@ -0,0 +1,287 @@
# name: test/sql/copy/csv/glob/read_csv_glob.test
# description: Test globbing CSVs
# group: [glob]
# turn on enable_verification for the statements in this file
statement ok
PRAGMA enable_verification
# column types reported for the first five columns when the files are selected with a glob
query IIIII
select typeof(#1),typeof(#2),typeof(#3),typeof(#4),typeof(#5) FROM read_csv('data/csv/per_thread/*.csv') limit 1
----
VARCHAR BOOLEAN DOUBLE DOUBLE VARCHAR
# same expected types when two files are passed as an explicit list
query IIIII
select typeof(#1),typeof(#2),typeof(#3),typeof(#4),typeof(#5) FROM read_csv(['data/csv/per_thread/c1.csv', 'data/csv/per_thread/c2.csv']) limit 1
----
VARCHAR BOOLEAN DOUBLE DOUBLE VARCHAR
# same expected types when three files are listed with c2 ahead of c1
query IIIII
select typeof(#1),typeof(#2),typeof(#3),typeof(#4),typeof(#5) FROM read_csv(['data/csv/per_thread/c2.csv', 'data/csv/per_thread/c1.csv', 'data/csv/per_thread/c3.csv']) limit 1
----
VARCHAR BOOLEAN DOUBLE DOUBLE VARCHAR
# simple globbing
# 'a?/*.csv' is expected to yield nine dates (June, July and August)
query I
SELECT * FROM read_csv('data/csv/glob/a?/*.csv') ORDER BY 1
----
2019-06-05
2019-06-15
2019-06-25
2019-07-05
2019-07-15
2019-07-25
2019-08-05
2019-08-15
2019-08-25
# restricting the file name to 'a*.csv' is expected to drop the three August dates
query I
SELECT * FROM read_csv('data/csv/glob/a?/a*.csv') ORDER BY 1
----
2019-06-05
2019-06-15
2019-06-25
2019-07-05
2019-07-15
2019-07-25
# list parameter
# two explicit file paths passed as a list to read_csv
query I
SELECT * FROM read_csv(['data/csv/glob/a1/a1.csv', 'data/csv/glob/a2/a2.csv']) ORDER BY 1
----
2019-06-05
2019-06-15
2019-06-25
2019-07-05
2019-07-15
2019-07-25
# the same list passed to read_csv_auto, with the same expected rows
query I
SELECT * FROM read_csv_auto(['data/csv/glob/a1/a1.csv', 'data/csv/glob/a2/a2.csv']) ORDER BY 1
----
2019-06-05
2019-06-15
2019-06-25
2019-07-05
2019-07-15
2019-07-25
# multiple globs
# the same pattern appears twice in the list, and every date is expected twice in the result
query I
SELECT * FROM read_csv(['data/csv/glob/a?/a*.csv', 'data/csv/glob/a?/a*.csv']) ORDER BY 1
----
2019-06-05
2019-06-05
2019-06-15
2019-06-15
2019-06-25
2019-06-25
2019-07-05
2019-07-05
2019-07-15
2019-07-15
2019-07-25
2019-07-25
# more asterisks for directories
# a '*' replaces the 'glob' path component; the expected rows are the six June/July dates
query I
SELECT * FROM read_csv('data/csv/*/a?/a*.csv') ORDER BY 1
----
2019-06-05
2019-06-15
2019-06-25
2019-07-05
2019-07-15
2019-07-25
# filename=1 adds a second column (aliased b) that is matched against '%a1.csv%':
# the flag is expected to be 1 for the June dates and 0 for the July dates
query II
SELECT a, b LIKE '%a1.csv%' FROM read_csv('data/csv/*/a?/a*.csv', filename=1) t1(a,b) ORDER BY 1
----
2019-06-05 1
2019-06-15 1
2019-06-25 1
2019-07-05 0
2019-07-15 0
2019-07-25 0
# read-csv auto fails here because of a type mismatch: most files contain dates, but one file contains integers
statement error
SELECT * FROM read_csv('data/csv/glob/*/*.csv') ORDER BY 1
----
Schema mismatch between globbed files.
# forcing string parsing works
# the expected order is string order: '1' and '2' come before the dates and '3' comes after them
query I
SELECT * FROM read_csv('data/csv/glob/*/*.csv', columns=STRUCT_PACK(d := 'STRING')) ORDER BY 1
----
1
2
2019-06-05
2019-06-15
2019-06-25
2019-07-05
2019-07-15
2019-07-25
2019-08-05
2019-08-15
2019-08-25
3
# same read with filename=1; the second column is matched against '%a_.csv'
# the flag is expected to be 1 only for the June and July dates
query II
SELECT a, b LIKE '%a_.csv' FROM read_csv('data/csv/glob/*/*.csv', columns=STRUCT_PACK(d := 'STRING'), filename=1) t(a,b) ORDER BY 1
----
1 0
2 0
2019-06-05 1
2019-06-15 1
2019-06-25 1
2019-07-05 1
2019-07-15 1
2019-07-25 1
2019-08-05 0
2019-08-15 0
2019-08-25 0
3 0
# test glob parsing
# the glob() table function is expected to return five entries for this pattern
query I
SELECT COUNT(*) FROM glob('data/csv/glob/*/*.csv')
----
5
# a one-element list gives the same count as the plain string
query I
SELECT COUNT(*) FROM glob(['data/csv/glob/*/*.csv'])
----
5
# listing the same pattern twice doubles the count (5 + 5)
query I
SELECT COUNT(*) FROM glob(['data/csv/glob/*/*.csv', 'data/csv/glob/*/*.csv'])
----
10
# we can also use windows file slashes
query I
SELECT COUNT(*) FROM glob('data\csv\glob\*\*.csv')
----
5
# consecutive slashes are ignored
query I
SELECT COUNT(*) FROM glob('data//csv///glob///*//////*.csv')
----
5
# nothing matches the glob
# read_csv is expected to raise an error that names the pattern
statement error
SELECT * FROM read_csv('data/csv/glob/*/a*a.csv') ORDER BY 1
----
No files found that match the pattern "data/csv/glob/*/a*a.csv"
# same expected error when the pattern is wrapped in a list
statement error
SELECT * FROM read_csv(['data/csv/glob/*/a*a.csv']) ORDER BY 1
----
No files found that match the pattern "data/csv/glob/*/a*a.csv"
# same expected error from read_csv_auto
statement error
SELECT * FROM read_csv_auto(['data/csv/glob/*/a*a.csv']) ORDER BY 1
----
No files found that match the pattern "data/csv/glob/*/a*a.csv"
# glob() with the same pattern does not error; it is expected to return zero rows
query I
SELECT COUNT(*) FROM glob('data/csv/glob/*/a*a.csv')
----
0
# a pattern starting with '/' is expected to return zero rows as well
query I
select count(*) from glob('/rewoiarwiouw3rajkawrasdf790273489*.csv') limit 10;
----
0
# a pattern starting with '~/' is expected to return zero rows as well
query I
select count(*) from glob('~/rewoiarwiouw3rajkawrasdf790273489*.py') limit 10;
----
0
# NOTE(review): presumably the remainder is skipped when the runner reloads the database - confirm
require skip_reload
# file_search_path with one path
statement ok
set file_search_path='data/csv/glob';
# the relative pattern is expected to match the same five files as 'data/csv/glob/*/*.csv' did earlier
query I
SELECT COUNT(*) FROM glob('*/*.csv');
----
5
# file_search_path with multiple paths
# the two paths are given as one comma-separated string
statement ok
set file_search_path='data/csv/glob/a1,data/csv/glob/a2';
query I
SELECT COUNT(*) FROM glob('*.csv');
----
2
# file_search_path with a non-existent path
# the count is still expected to be five with 'garbage' in the list
statement ok
set file_search_path='data/csv/glob,garbage';
query I
SELECT COUNT(*) FROM glob('*/*.csv');
----
5
# Only file_search_path is searched
# the full relative path used earlier in this file is now expected to match nothing
query I
SELECT COUNT(*) FROM glob('data/csv/glob/*/*.csv');
----
0
# file_search_path can be cleared
statement ok
set file_search_path='';
# with the setting cleared, the full relative path is expected to match five files again
query I
SELECT COUNT(*) FROM glob('data/csv/glob/*/*.csv');
----
5
# empty list
# an untyped empty list is expected to fail with "No function matches"
statement error
SELECT * FROM read_csv_auto([]) ORDER BY 1
----
No function matches
# an empty list cast to VARCHAR[] is expected to fail with a message containing "at least one file"
statement error
SELECT * FROM read_csv_auto([]::VARCHAR[]) ORDER BY 1
----
at least one file
# null list
# each NULL variant below (bare, inside a list, typed scalar, typed list) is expected
# to fail with an error message containing "NULL"
statement error
SELECT * FROM read_csv_auto(NULL) ORDER BY 1
----
NULL
statement error
SELECT * FROM read_csv_auto([NULL]) ORDER BY 1
----
NULL
statement error
SELECT * FROM read_csv_auto(NULL::VARCHAR) ORDER BY 1
----
NULL
statement error
SELECT * FROM read_csv_auto(NULL::VARCHAR[]) ORDER BY 1
----
NULL
# repeat the schema-mismatch read with threads set to 1; the same error is expected
statement ok
SET threads=1;
statement error
FROM read_csv('data/csv/glob/*/*.csv');
----
Schema mismatch between globbed files.

View File

@@ -0,0 +1,17 @@
# name: test/sql/copy/csv/glob/read_csv_glob_crawl.test_slow
# description: Test glob **
# group: [glob]
# turn on enable_verification for the statements in this file
statement ok
PRAGMA enable_verification
# tests without previous directories
# a bare '**' is only asserted to return at least one entry
query I
select count(*) > 0 from glob('**');
----
true
# '**' as the leading component, followed by a fixed directory name; eight entries are expected
query I
select count(*) from glob('**/samename/*');
----
8

View File

@@ -0,0 +1,319 @@
# name: test/sql/copy/csv/glob/read_csv_glob_crawl_partitioned.test_slow
# description: Test glob **
# group: [glob]
# files from: 'data/csv/glob/crawl/'
# turn on enable_verification for the statements in this file
statement ok
PRAGMA enable_verification
# simple crawling
# example from: https://stackoverflow.com/a/66744400
# six two-column rows are expected, ordered by the second column
query II
SELECT * FROM read_csv('data/csv/glob/crawl/stackoverflow/**/*.csv', auto_detect=1) ORDER BY 2;
----
0 0
1 1
2 2
3 3
2 4
3 5
# test with ** as the last entry
# the expected rows are the same as for '**/*.csv' above
query II
SELECT * FROM read_csv('data/csv/glob/crawl/stackoverflow/**', auto_detect=1) ORDER BY 2;
----
0 0
1 1
2 2
3 3
2 4
3 5
# test with nested same name dirs
# eight rows, all 42, are expected
query I
SELECT * FROM read_csv('data/csv/glob/crawl/samename/**/*.csv', auto_detect=1);
----
42
42
42
42
42
42
42
42
# test with nested same name dirs, but with ** as last entry
# the expected sum equals 8 * 42
query I
SELECT sum(column0) FROM read_csv('data/csv/glob/crawl/samename/**', auto_detect=1);
----
336
# test with structure:
# mkdir -p d/{d00,d01,d02}/{d10,d11,d12}/{d20,d21,d22}/mid/{d40,d41,d42}
# touch file.csv {d00,d01,d02}/file.csv && touch {d00,d01,d02}/{d10,d11,d12}/file.csv && touch {d00,d01,d02}/{d10,d11,d12}/{d20,d21,d22}/file.csv && touch {d00,d01,d02}/{d10,d11,d12}/{d20,d21,d22}/mid/file.csv && touch {d00,d01,d02}/{d10,d11,d12}/{d20,d21,d22}/mid/{d40,d41,d42}/file.csv
# read_csv over '**/*.csv' is expected to return 148 rows
query I
SELECT count(*) FROM read_csv('data/csv/glob/crawl/d/**/*.csv', auto_detect=1);
----
148
# glob() with a trailing '**' is expected to return 148 entries
query I
SELECT count(*) FROM glob('data/csv/glob/crawl/d/**');
----
148
# the expected sum equals 148 * 42
query I
SELECT sum(column0) FROM read_csv('data/csv/glob/crawl/d/**', auto_detect=1);
----
6216
# a trailing '/' after '**' does not change the glob() count
query I
SELECT count(*) FROM glob('data/csv/glob/crawl/d/**/');
----
148
# '**' followed by a fixed directory name: 27 matches (3 * 3 * 3 'mid' directories in the structure above)
query I
SELECT count(*) FROM glob('data/csv/glob/crawl/d/**/mid/*.csv');
----
27
# the same pattern used directly as the FROM source gives the same count
query I
SELECT count(*) FROM 'data/csv/glob/crawl/d/**/mid/*.csv';
----
27
# one directory level below 'mid': 81 matches (27 * 3)
query I
SELECT count(*) FROM 'data/csv/glob/crawl/d/**/mid/*/*.csv';
----
81
# glob() with a trailing '/' after the last '*' is also expected to return 81
query I
SELECT count(*) FROM glob('data/csv/glob/crawl/d/**/mid/*/');
----
81
# more than one '**' in a pattern is expected to be rejected
statement error
SELECT count(*) FROM 'data/csv/glob/crawl/d/**/mid/**/*.csv';
----
Cannot use multiple '**' in one path
# '**' combined with a three-character '???' directory component
query I
SELECT count(*) FROM 'data/csv/glob/crawl/d/**/???/*/*.csv';
----
144
# '???' placed before the '**' instead of after it
query I
SELECT count(*) FROM 'data/csv/glob/crawl/d/*/???/**/*.csv';
----
144
# no 'mid' directory sits two levels below d/, so this pattern is expected to match nothing and error
statement error
SELECT count(*) FROM 'data/csv/glob/crawl/d/*/mid/**/*.csv';
----
No files found that match the pattern
# 'mid' reached through three single-level '*' components, then '**' below it
query I
SELECT count(*) FROM 'data/csv/glob/crawl/d/*/*/*/mid/**/*.csv';
----
108
# with '**' in front of '???' 147 matches are expected ...
query I
SELECT count(*) FROM 'data/csv/glob/crawl/d/**/???/*.csv';
----
147
# ... and with a single-level '*' in its place only 9
query I
SELECT count(*) FROM 'data/csv/glob/crawl/d/*/???/*.csv';
----
9
# two '**' separated by a '*' component are rejected as well
statement error
SELECT count(*) FROM 'data/csv/glob/crawl/d/**/*/**/*.csv';
----
Cannot use multiple '**' in one path
# 'd2?' component after the '**'
query I
SELECT count(*) FROM 'data/csv/glob/crawl/d/**/d2?/*/*.csv';
----
27
# 'd2?' at a fixed depth with '**' below it
query I
SELECT count(*) FROM 'data/csv/glob/crawl/d/*/*/d2?/**/*.csv';
----
135
# the expected sum equals 135 * 42
query I
SELECT sum(column0) FROM read_csv('data/csv/glob/crawl/d/*/*/d2?/**', auto_detect=1);
----
5670
# 'd?0' component directly after the '**'
query I
SELECT count(*) FROM 'data/csv/glob/crawl/d/**/d?0/*.csv';
----
40
# requiring one extra directory level before the '**' is expected to give one match fewer
query I
SELECT count(*) FROM 'data/csv/glob/crawl/d/*/**/d?0/*.csv';
----
39
# consecutive '**' components are rejected with the same error
statement error
SELECT count(*) FROM 'data/csv/glob/crawl/d/**/**/**/**/*.csv';
----
Cannot use multiple '**' in one path
# Test with hidden files. By default python does not return hidden files, but duckdb does.
# three two-column rows of 42 are expected from the 'hidden' directory
query II
FROM read_csv_auto('data/csv/glob/crawl/hidden/**');
----
42 42
42 42
42 42
# additional tests
# ten-row source table with columns c_2 = i%2, c_3 = i%3 and c_pow = i*i
statement ok
CREATE TABLE t0 AS SELECT (i%2) AS c_2, (i%3) AS c_3, (i*i) AS c_pow FROM RANGE(0,10) tbl(i);
# write t0 partitioned by c_2 and c_3 into the test directory
statement ok
COPY t0 TO '__TEST_DIR__/partitioned0' (PARTITION_BY(c_2,c_3));
# no rows are expected for a single '*' directly under partitioned0
query I
from glob('__TEST_DIR__/partitioned0/*');
----
query I
select count(*) from glob('__TEST_DIR__/partitioned0/*');
----
0
# six entries are expected three levels down
query I
select count(*) from glob('__TEST_DIR__/partitioned0/*/*/*');
----
6
# a trailing '**' after two '*' levels is expected to find the same six entries
query I
select count(*) from glob('__TEST_DIR__/partitioned0/*/*/**')
----
6
# nothing is expected one level deeper
query I
select count(*) from glob('__TEST_DIR__/partitioned0/*/*/*/**')
----
0
# '**' placed last, in the middle and after one '*' level: six entries expected each time
query I
select count(*) from glob('__TEST_DIR__/partitioned0/**');
----
6
query I
select count(*) from glob('__TEST_DIR__/partitioned0/**/*');
----
6
query I
select count(*) from glob('__TEST_DIR__/partitioned0/*/**');
----
6
# '**' followed by the literal file name data_0.csv
query I
select count(*) from glob('__TEST_DIR__/partitioned0/**/data_0.csv');
----
6
query I
select count(*) from glob('__TEST_DIR__/partitioned0/**/*/data_0.csv')
----
6
# restricting the last directory to c_3=0 is expected to leave two entries
query I
select count(*) from glob('__TEST_DIR__/partitioned0/**/c_3=0/data_0.csv')
----
2
# restricting the first directory to c_2=0 is expected to leave three entries
query I
select count(*) from glob('__TEST_DIR__/partitioned0/c_2=0/**/data_0.csv')
----
3
# put a file with a different name in the partitioned0 directory
statement ok
COPY t0 TO '__TEST_DIR__/partitioned0/data_1.csv';
# a single '*' directly under partitioned0 is now expected to return one entry
query I
select count(*) from glob('__TEST_DIR__/partitioned0/*');
----
1
# NOTE(review): this record repeats the previous one verbatim; the section before data_1.csv
# was added pairs a plain "from glob(...)" with the count instead - confirm whether that was intended here
query I
select count(*) from glob('__TEST_DIR__/partitioned0/*');
----
1
# the three-level patterns are expected to be unaffected by the new top-level file
query I
select count(*) from glob('__TEST_DIR__/partitioned0/*/*/*');
----
6
query I
select count(*) from glob('__TEST_DIR__/partitioned0/*/*/**')
----
6
query I
select count(*) from glob('__TEST_DIR__/partitioned0/*/*/*/**')
----
0
# patterns whose '**' starts directly under partitioned0 are expected to return seven entries (6 + 1)
query I
select count(*) from glob('__TEST_DIR__/partitioned0/**');
----
7
query I
select count(*) from glob('__TEST_DIR__/partitioned0/**/*');
----
7
# with a '*' directory level before the '**' the count is expected to stay at six
query I
select count(*) from glob('__TEST_DIR__/partitioned0/*/**');
----
6
# patterns that name data_0.csv keep the counts they had before data_1.csv was added
query I
select count(*) from glob('__TEST_DIR__/partitioned0/**/data_0.csv');
----
6
query I
select count(*) from glob('__TEST_DIR__/partitioned0/**/*/data_0.csv')
----
6
query I
select count(*) from glob('__TEST_DIR__/partitioned0/**/c_3=0/data_0.csv')
----
2
query I
select count(*) from glob('__TEST_DIR__/partitioned0/c_2=0/**/data_0.csv')
----
3
# NOTE(review): presumably the records below are skipped on Windows - confirm
require notwindows
# symlink test: symlinks will not be searched
# a single row with the value -42 is expected
query I
FROM read_csv_auto('data/csv/glob/crawl/.symbolic_link/**');
----
-42
# read_csv with a pattern ending in '**/' is expected to fail with a read error
statement error
SELECT sum(column0) FROM read_csv('data/csv/glob/crawl/d/**/', auto_detect=1);
----
Could not read from file

View File

@@ -0,0 +1,36 @@
# name: test/sql/copy/csv/glob/test_unmatch_globs.test
# description: Test globbing CSVs
# group: [glob]
# turn on enable_verification for the statements in this file
statement ok
PRAGMA enable_verification
# glob pattern used directly as the FROM source; two three-column rows are expected
query III
FROM 'data/csv/glob_dif_dialect/14166/__200*.csv';
----
2000-01-01 10 80.9189441112103
2000-01-02 5 109.16581782022259
# explicit list that includes empty.csv; the expected rows are the same two
query III
FROM read_csv(['data/csv/glob_dif_dialect/14166/__2000.csv', 'data/csv/glob_dif_dialect/14166/__2001.csv', 'data/csv/glob_dif_dialect/14166/empty.csv']);
----
2000-01-01 10 80.9189441112103
2000-01-02 5 109.16581782022259
# explicit list that pairs __2000.csv with matching_types.csv; the expected rows are the same two
query III
FROM read_csv(['data/csv/glob_dif_dialect/14166/__2000.csv','data/csv/glob_dif_dialect/14166/matching_types.csv']);
----
2000-01-01 10 80.9189441112103
2000-01-02 5 109.16581782022259
# Globbing with different dialects
# the result is sorted on all columns; it is expected to contain a NULL and the non-numeric id '5r'
query III
FROM 'data/csv/glob_dif_dialect/f_*.csv' order by all
----
1 alice alice@email.com
1 alice alice@email.com
2 eve eve@email.com
3 bob bob@email.com
3 bob NULL
4 pedro pedro@email.com
5r tim tim@email.com