should be it
This commit is contained in:
24
external/duckdb/test/sql/copy/csv/unquoted_escape/32k_rows.test_slow
vendored
Normal file
24
external/duckdb/test/sql/copy/csv/unquoted_escape/32k_rows.test_slow
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
# name: test/sql/copy/csv/unquoted_escape/32k_rows.test_slow
|
||||
# description: Test large number of rows with unquoted escape characters
|
||||
# group: [unquoted_escape]
|
||||
|
||||
statement ok
|
||||
PRAGMA enable_verification
|
||||
|
||||
# Build 32 << 10 (32768) rows. Each row gets a list `s` of three strings picked
# from 16 candidates (every one- and two-character combination of backslash,
# tab and newline, plus '', 'a', 'b' and 'c') at 1-based positions derived from
# hashes of i.
statement ok
|
||||
CREATE TABLE pseudorandom_fields AS SELECT i, list_select(
|
||||
[E'\\', E'\t', E'\n', E'\\\\', E'\\\t', E'\\\n', E'\t\\', E'\t\t', E'\t\n', E'\n\\', E'\n\t', E'\n\n', '', 'a', 'b', 'c'],
|
||||
list_transform([hash(i), hash(i * 7 + 251), hash(i * i + 4093)], lambda x: (x % 16 + 1)::int)
|
||||
) AS s FROM range(32 << 10) t(i);
|
||||
|
||||
# Export the table as a hand-escaped TSV: in s1..s3 every backslash is doubled
# and every tab/newline is prefixed with a backslash, then the fields are joined
# with tabs in the order (s1, i, s2, s3). COPY is given HEADER false and empty
# QUOTE and ESCAPE options.
statement ok
|
||||
COPY (SELECT concat_ws(E'\t', s1, i, s2, s3) FROM (
|
||||
SELECT i, replace(replace(replace(columns(* exclude (i)), '\', '\\'), E'\t', E'\\\t'), E'\n', E'\\\n')
|
||||
FROM (SELECT i, s[1] s1, s[2] s2, s[3] s3 FROM pseudorandom_fields)
|
||||
)) TO '__TEST_DIR__/pseudorandom_fields.tsv' (HEADER false, QUOTE '', ESCAPE '');
|
||||
|
||||
# Read the TSV back with backslash as the escape character and join it to the
# source table on i = j. Expects log2 of the joined row count to cast to 15
# (32768 = 2^15) and every s[k] to equal the parsed field f<k>.
# NOTE(review): log2(count(*))::int would presumably also yield 15 for nearby
# row counts; comparing count(*) itself would be stricter -- confirm.
query II
|
||||
SELECT log2(count(*))::int, bool_and(s[1] = f1 AND s[2] = f2 AND s[3] = f3)::int
|
||||
FROM pseudorandom_fields JOIN read_csv('__TEST_DIR__/pseudorandom_fields.tsv', quote = '', sep = '\t', escape = '\', header = false, strict_mode = false) t(f1, j, f2, f3) ON i = j;
|
||||
----
|
||||
15 1
|
||||
43
external/duckdb/test/sql/copy/csv/unquoted_escape/basic.test
vendored
Normal file
43
external/duckdb/test/sql/copy/csv/unquoted_escape/basic.test
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
# name: test/sql/copy/csv/unquoted_escape/basic.test
|
||||
# description: Test the parsing of unquoted escape characters
|
||||
# group: [unquoted_escape]
|
||||
|
||||
statement ok
|
||||
PRAGMA enable_verification
|
||||
|
||||
# Parse plain.csv with backslash as the escape character, ',' as the separator
# and '\N' as the NULL string; no quote option is passed. The expected output
# has ten rows (ids 0-9): row 7 prints as NULL, while row 8 is the text \N.
query IT
|
||||
SELECT * FROM read_csv('data/csv/unquoted_escape/plain.csv', escape = '\', sep = ',', strict_mode = false, nullstr = '\N');
|
||||
----
|
||||
0 \
|
||||
1 ,
|
||||
2 "
|
||||
3 \,
|
||||
4 \"
|
||||
5 ,"
|
||||
6 \,"
|
||||
7 NULL
|
||||
8 \N
|
||||
9 Na
|
||||
|
||||
# Reference table: integer key `a` and the string `b` that the matching row of
# basic.tsv is expected to parse to (compared in the loop further down).
statement ok
|
||||
CREATE TABLE special_char(a INT, b STRING);
|
||||
|
||||
# 19 reference strings (keys 0-18) built from backslash, tab and newline:
# alone (0-2), between two letters (3-5), before a letter (6-8 and 12-14),
# after a letter (9-11), doubled (15-17), and all three in sequence (18).
statement ok
|
||||
INSERT INTO special_char VALUES
|
||||
(0, E'\\'), (1, E'\t'), (2, E'\n'),
|
||||
(3, E'a\\a'), (4, E'b\tb'), (5, E'c\nc'),
|
||||
(6, E'\\d'), (7, E'\te'), (8, E'\nf'),
|
||||
(9, E'g\\'), (10, E'h\t'), (11, E'i\n'),
|
||||
(12, E'\\j'), (13, E'\tk'), (14, E'\nl'),
|
||||
(15, E'\\\\'), (16, E'\t\t'), (17, E'\n\n'),
|
||||
(18, E'\\\t\n');
|
||||
|
||||
# Repeat the check for each value of the loop variable buffer_size (bounds 10 and 25).
# NOTE(review): the query below never references ${buffer_size}, so every
# iteration appears to run the identical statement -- confirm whether
# `buffer_size = ${buffer_size}` was meant to be passed to read_csv.
loop buffer_size 10 25
|
||||
|
||||
# Parse basic.tsv as columns (i, s, j), join it to special_char on i = a and
# expect every parsed string s to equal the reference string b.
# replace CRLF with LF to pass the test on Windows
|
||||
query I
|
||||
SELECT bool_and(b = replace(s, E'\r\n', E'\n'))::int FROM special_char JOIN read_csv('data/csv/unquoted_escape/basic.tsv', quote = '', escape = '\', sep = '\t', strict_mode = false) t (i, s, j) ON i = a;
|
||||
----
|
||||
1
|
||||
|
||||
endloop
|
||||
86
external/duckdb/test/sql/copy/csv/unquoted_escape/human_eval.test
vendored
Normal file
86
external/duckdb/test/sql/copy/csv/unquoted_escape/human_eval.test
vendored
Normal file
@@ -0,0 +1,86 @@
|
||||
# name: test/sql/copy/csv/unquoted_escape/human_eval.test
|
||||
# description: Test the parsing of unquoted escape characters in HumanEval data exported from MySQL as CSV and TSV
|
||||
# group: [unquoted_escape]
|
||||
|
||||
#
|
||||
# The data file is generated by the following workflow:
|
||||
#
|
||||
# duckdb -c "COPY (SELECT REPLACE(COLUMNS(*), ' ', E'\t') FROM read_ndjson_auto('https://raw.githubusercontent.com/openai/human-eval/refs/heads/master/data/HumanEval.jsonl.gz')) to 'HumanEval.csv'"
|
||||
#
|
||||
# docker run --rm -d --name tmp-gen-csv \
|
||||
# -e MYSQL_ROOT_PASSWORD=root \
|
||||
# -p 13316:3306 \
|
||||
# mysql:latest \
|
||||
# mysqld --secure-file-priv=/tmp
|
||||
#
|
||||
# mysql -h127.0.0.1 -uroot -proot -P13316 --local-infile <<EOF
|
||||
# CREATE DATABASE human_eval;
|
||||
# USE human_eval;
|
||||
# CREATE TABLE t (task_id TEXT, prompt TEXT, entry_point TEXT, canonical_solution TEXT, test TEXT);
|
||||
# SET GLOBAL local_infile = 1;
|
||||
# LOAD DATA LOCAL INFILE './HumanEval.csv' INTO TABLE t FIELDS TERMINATED BY ',' ENCLOSED BY '"' ESCAPED BY '"' IGNORE 1 LINES;
|
||||
# TABLE t LIMIT 10 INTO OUTFILE '/tmp/human_eval.tsv';
|
||||
# TABLE t LIMIT 10 INTO OUTFILE '/tmp/human_eval.csv' FIELDS TERMINATED BY ',';
|
||||
# EOF
|
||||
#
|
||||
# docker cp tmp-gen-csv:/tmp/human_eval.tsv .
|
||||
# docker cp tmp-gen-csv:/tmp/human_eval.csv .
|
||||
#
|
||||
# docker kill tmp-gen-csv
|
||||
|
||||
require httpfs
|
||||
|
||||
require json
|
||||
|
||||
statement ok
|
||||
PRAGMA enable_verification
|
||||
|
||||
# Load the reference copy straight from the upstream HumanEval JSONL file,
# applying the same REPLACE(COLUMNS(*), ..., E'\t') that the data-generation
# workflow described in the header comment uses.
# NOTE(review): the search string is shown here as a single space; whitespace
# may have been collapsed in this listing -- confirm it matches the string
# used when the data files were generated.
statement ok
|
||||
CREATE TABLE human_eval_jsonl AS
|
||||
SELECT REPLACE(COLUMNS(*), ' ', E'\t') FROM read_ndjson_auto(
|
||||
'https://raw.githubusercontent.com/openai/human-eval/refs/heads/master/data/HumanEval.jsonl.gz');
|
||||
|
||||
# keep the first 10 rows: delete every task whose numeric id (the part of
# task_id after the '/') is 10 or higher
|
||||
statement ok
|
||||
DELETE FROM human_eval_jsonl WHERE split_part(task_id, '/', 2)::int >= 10;
|
||||
|
||||
# Empty target tables for the CSV and TSV copies; both have the same five TEXT
# columns and are refilled inside the loop below.
statement ok
|
||||
CREATE TABLE human_eval_csv(task_id TEXT, prompt TEXT, entry_point TEXT, canonical_solution TEXT, test TEXT);
|
||||
|
||||
statement ok
|
||||
CREATE TABLE human_eval_tsv(task_id TEXT, prompt TEXT, entry_point TEXT, canonical_solution TEXT, test TEXT);
|
||||
|
||||
# Each iteration empties both target tables, reloads them from human_eval.csv
# and human_eval.tsv, and compares them with the JSONL reference copy.
# NOTE(review): neither read_csv call below references ${buffer_size}, so every
# iteration appears to run identical statements -- confirm whether
# `buffer_size = ${buffer_size}` was meant to be passed to read_csv.
loop buffer_size 10 25
|
||||
|
||||
statement ok
|
||||
TRUNCATE human_eval_csv;
|
||||
|
||||
statement ok
|
||||
TRUNCATE human_eval_tsv;
|
||||
|
||||
# replace the CRLF with LF to pass the test on Windows
|
||||
statement ok
|
||||
INSERT INTO human_eval_csv
|
||||
SELECT replace(COLUMNS(*), E'\r\n', E'\n')
|
||||
FROM read_csv('data/csv/unquoted_escape/human_eval.csv', quote = '', escape = '\', sep = ',', header = false, strict_mode = false);
|
||||
|
||||
statement ok
|
||||
INSERT INTO human_eval_tsv
|
||||
SELECT replace(COLUMNS(*), E'\r\n', E'\n')
|
||||
FROM read_csv('data/csv/unquoted_escape/human_eval.tsv', quote = '', escape = '\', sep = '\t', header = false, strict_mode = false);
|
||||
|
||||
# Verify that the three copies are the same: match the JSONL, CSV and TSV
# tables on task_id and expect 10 matched rows in which all five columns agree
|
||||
query II
|
||||
SELECT count(*), bool_and(
|
||||
j.task_id = c.task_id AND j.task_id = t.task_id AND
|
||||
j.prompt = c.prompt AND j.prompt = t.prompt AND
|
||||
j.entry_point = c.entry_point AND j.entry_point = t.entry_point AND
|
||||
j.canonical_solution = c.canonical_solution AND j.canonical_solution = t.canonical_solution AND
|
||||
j.test = c.test AND j.test = t.test
|
||||
)::int
|
||||
FROM human_eval_jsonl j, human_eval_csv c, human_eval_tsv t
|
||||
WHERE j.task_id = c.task_id AND j.task_id = t.task_id
|
||||
----
|
||||
10 1
|
||||
|
||||
endloop
|
||||
15
external/duckdb/test/sql/copy/csv/unquoted_escape/identical.test
vendored
Normal file
15
external/duckdb/test/sql/copy/csv/unquoted_escape/identical.test
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
# name: test/sql/copy/csv/unquoted_escape/identical.test
|
||||
# description: Ensure that the identical quote & escape case works as before and is not affected by the handling of unquoted escaped values.
|
||||
# group: [unquoted_escape]
|
||||
|
||||
statement ok
|
||||
PRAGMA enable_verification
|
||||
|
||||
# Read identical.csv with '"' as both the quote and the escape character and
# wrap every column value in '#' (presumably so that empty values remain
# visible as ## in the expected output).
query TT
|
||||
SELECT concat('#', columns(*), '#') FROM read_csv('data/csv/unquoted_escape/identical.csv', quote = '"', escape = '"', sep = ',', strict_mode = false);
|
||||
----
|
||||
## #a""b#
|
||||
#c""d# ##
|
||||
#ef,"gh"# ##
|
||||
## #"ij",kl#
|
||||
#mn"# #op""#
|
||||
16
external/duckdb/test/sql/copy/csv/unquoted_escape/mixed.test
vendored
Normal file
16
external/duckdb/test/sql/copy/csv/unquoted_escape/mixed.test
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
# name: test/sql/copy/csv/unquoted_escape/mixed.test
|
||||
# description: Test the parsing of escaped values inside and outside quotes in the same file
|
||||
# group: [unquoted_escape]
|
||||
|
||||
statement ok
|
||||
PRAGMA enable_verification
|
||||
|
||||
# Three checks on mixed.csv (quote '"', escape '\'), one per result column:
#   1. column w, joined with '|' in y order, has Hamming distance 0 from the
#      string made of backslash, comma, double quote and newline joined by '|'
#   2. column z, joined the same way, has Hamming distance 0 from '"|"a"|"b|c"'
#   3. in every row, x equals w followed by a double quote followed by w again
# replace CRLF with LF to pass the test on Windows
|
||||
query III
|
||||
SELECT
|
||||
hamming(replace(string_agg(w, '|' ORDER BY y), E'\r\n', E'\n'), E'\\|,|"|\n'),
|
||||
hamming(string_agg(z, '|' ORDER BY y), '"|"a"|"b|c"'),
|
||||
bool_and(x = concat(w, '"', w))::int
|
||||
FROM read_csv('data/csv/unquoted_escape/mixed.csv', quote = '"', escape = '\', sep = ',', strict_mode = false);
|
||||
----
|
||||
0 0 1
|
||||
Reference in New Issue
Block a user