should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion

@@ -0,0 +1,24 @@
# name: test/sql/copy/csv/unquoted_escape/32k_rows.test_slow
# description: Test a large number of rows with unquoted escape characters
# group: [unquoted_escape]

statement ok
PRAGMA enable_verification

statement ok
CREATE TABLE pseudorandom_fields AS SELECT i, list_select(
[E'\\', E'\t', E'\n', E'\\\\', E'\\\t', E'\\\n', E'\t\\', E'\t\t', E'\t\n', E'\n\\', E'\n\t', E'\n\n', '', 'a', 'b', 'c'],
list_transform([hash(i), hash(i * 7 + 251), hash(i * i + 4093)], lambda x: (x % 16 + 1)::int)
) AS s FROM range(32 << 10) t(i);

statement ok
COPY (SELECT concat_ws(E'\t', s1, i, s2, s3) FROM (
SELECT i, replace(replace(replace(columns(* exclude (i)), '\', '\\'), E'\t', E'\\\t'), E'\n', E'\\\n')
FROM (SELECT i, s[1] s1, s[2] s2, s[3] s3 FROM pseudorandom_fields)
)) TO '__TEST_DIR__/pseudorandom_fields.tsv' (HEADER false, QUOTE '', ESCAPE '');

query II
SELECT log2(count(*))::int, bool_and(s[1] = f1 AND s[2] = f2 AND s[3] = f3)::int
FROM pseudorandom_fields JOIN read_csv('__TEST_DIR__/pseudorandom_fields.tsv', quote = '', sep = '\t', escape = '\', header = false, strict_mode = false) t(f1, j, f2, f3) ON i = j;
----
15 1
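
The test above round-trips values containing backslashes, tabs, and newlines through an unquoted TSV: the writer disables quoting (QUOTE '' ESCAPE '') and escapes the three special characters by hand, and the reader undoes the escaping with quote = '' and escape = '\'. A minimal sketch of the same round trip on a single value (the demo table name and the /tmp output path are illustrative, not part of the test):

-- One value containing a tab, a newline, and a backslash.
CREATE OR REPLACE TABLE demo AS SELECT 42 AS i, E'a\tb\nc\\d' AS s;

-- Escape the backslash first, then the tab and newline, and write without quoting.
COPY (
SELECT concat_ws(E'\t', i, replace(replace(replace(s, '\', '\\'), E'\t', E'\\\t'), E'\n', E'\\\n'))
FROM demo
) TO '/tmp/demo.tsv' (HEADER false, QUOTE '', ESCAPE '');

-- Reading back with quote = '' and escape = '\' restores the original value.
SELECT t.s = demo.s AS round_trip_ok
FROM demo
JOIN read_csv('/tmp/demo.tsv', quote = '', escape = '\', sep = '\t', header = false, strict_mode = false) t(j, s)
ON demo.i = t.j;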

@@ -0,0 +1,43 @@
# name: test/sql/copy/csv/unquoted_escape/basic.test
# description: Test the parsing of unquoted escape characters
# group: [unquoted_escape]

statement ok
PRAGMA enable_verification

query IT
SELECT * FROM read_csv('data/csv/unquoted_escape/plain.csv', escape = '\', sep = ',', strict_mode = false, nullstr = '\N');
----
0 \
1 ,
2 "
3 \,
4 \"
5 ,"
6 \,"
7 NULL
8 \N
9 Na

statement ok
CREATE TABLE special_char(a INT, b STRING);

statement ok
INSERT INTO special_char VALUES
(0, E'\\'), (1, E'\t'), (2, E'\n'),
(3, E'a\\a'), (4, E'b\tb'), (5, E'c\nc'),
(6, E'\\d'), (7, E'\te'), (8, E'\nf'),
(9, E'g\\'), (10, E'h\t'), (11, E'i\n'),
(12, E'\\j'), (13, E'\tk'), (14, E'\nl'),
(15, E'\\\\'), (16, E'\t\t'), (17, E'\n\n'),
(18, E'\\\t\n');

loop buffer_size 10 25

# replace CRLF with LF to pass the test on Windows
query I
SELECT bool_and(b = replace(s, E'\r\n', E'\n'))::int FROM special_char JOIN read_csv('data/csv/unquoted_escape/basic.tsv', quote = '', escape = '\', sep = '\t', strict_mode = false) t (i, s, j) ON i = a;
----
1

endloop
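
The loop above re-runs the check for a range of buffer_size values. The intent is presumably to vary the CSV scan buffer so that escape sequences land on buffer boundaries; read_csv exposes a buffer_size parameter (in bytes) for that, though the exact wiring of the loop variable is not visible in this hunk. A hedged sketch of forcing a small buffer (the byte count is illustrative):

-- Hypothetical: shrink the scan buffer so escaped characters can straddle buffer boundaries.
SELECT bool_and(b = replace(s, E'\r\n', E'\n'))::int
FROM special_char
JOIN read_csv('data/csv/unquoted_escape/basic.tsv', quote = '', escape = '\', sep = '\t', strict_mode = false, buffer_size = 1024) t (i, s, j)
ON i = a;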

@@ -0,0 +1,86 @@
# name: test/sql/copy/csv/unquoted_escape/human_eval.test
# description: Test the parsing of unquoted escape characters
# group: [unquoted_escape]
#
# The data file is generated by the following workflow:
#
# duckdb -c "COPY (SELECT REPLACE(COLUMNS(*), ' ', E'\t') FROM read_ndjson_auto('https://raw.githubusercontent.com/openai/human-eval/refs/heads/master/data/HumanEval.jsonl.gz')) to 'HumanEval.csv'"
#
# docker run --rm -d --name tmp-gen-csv \
# -e MYSQL_ROOT_PASSWORD=root \
# -p 13316:3306 \
# mysql:latest \
# mysqld --secure-file-priv=/tmp
#
# mysql -h127.0.0.1 -uroot -proot -P13316 --local-infile <<EOF
# CREATE DATABASE human_eval;
# USE human_eval;
# CREATE TABLE t (task_id TEXT, prompt TEXT, entry_point TEXT, canonical_solution TEXT, test TEXT);
# SET GLOBAL local_infile = 1;
# LOAD DATA LOCAL INFILE './HumanEval.csv' INTO TABLE t FIELDS TERMINATED BY ',' ENCLOSED BY '"' ESCAPED BY '"' IGNORE 1 LINES;
# TABLE t LIMIT 10 INTO OUTFILE '/tmp/human_eval.tsv';
# TABLE t LIMIT 10 INTO OUTFILE '/tmp/human_eval.csv' FIELDS TERMINATED BY ',';
# EOF
#
# docker cp tmp-gen-csv:/tmp/human_eval.tsv .
# docker cp tmp-gen-csv:/tmp/human_eval.csv .
#
# docker kill tmp-gen-csv

require httpfs

require json

statement ok
PRAGMA enable_verification

statement ok
CREATE TABLE human_eval_jsonl AS
SELECT REPLACE(COLUMNS(*), ' ', E'\t') FROM read_ndjson_auto(
'https://raw.githubusercontent.com/openai/human-eval/refs/heads/master/data/HumanEval.jsonl.gz');

# keep the first 10 rows
statement ok
DELETE FROM human_eval_jsonl WHERE split_part(task_id, '/', 2)::int >= 10;

statement ok
CREATE TABLE human_eval_csv(task_id TEXT, prompt TEXT, entry_point TEXT, canonical_solution TEXT, test TEXT);

statement ok
CREATE TABLE human_eval_tsv(task_id TEXT, prompt TEXT, entry_point TEXT, canonical_solution TEXT, test TEXT);

loop buffer_size 10 25

statement ok
TRUNCATE human_eval_csv;

statement ok
TRUNCATE human_eval_tsv;

# replace the CRLF with LF to pass the test on Windows
statement ok
INSERT INTO human_eval_csv
SELECT replace(COLUMNS(*), E'\r\n', E'\n')
FROM read_csv('data/csv/unquoted_escape/human_eval.csv', quote = '', escape = '\', sep = ',', header = false, strict_mode = false);

statement ok
INSERT INTO human_eval_tsv
SELECT replace(COLUMNS(*), E'\r\n', E'\n')
FROM read_csv('data/csv/unquoted_escape/human_eval.tsv', quote = '', escape = '\', sep = '\t', header = false, strict_mode = false);

# Verify that the three copies are the same
query II
SELECT count(*), bool_and(
j.task_id = c.task_id AND j.task_id = t.task_id AND
j.prompt = c.prompt AND j.prompt = t.prompt AND
j.entry_point = c.entry_point AND j.entry_point = t.entry_point AND
j.canonical_solution = c.canonical_solution AND j.canonical_solution = t.canonical_solution AND
j.test = c.test AND j.test = t.test
)::int
FROM human_eval_jsonl j, human_eval_csv c, human_eval_tsv t
WHERE j.task_id = c.task_id AND j.task_id = t.task_id
----
10 1

endloop
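
The fixture files here come from MySQL's SELECT ... INTO OUTFILE, whose default export format is tab-separated, backslash-escaped, and unquoted (NULLs written as \N), which is exactly the shape the unquoted-escape reader targets with quote = '' and escape = '\'. A hedged sketch of reading such a dump with explicit column types (the path and column list are illustrative, not the test's):

-- Reading a MySQL INTO OUTFILE dump (default format: tab-separated, backslash-escaped, unquoted).
SELECT *
FROM read_csv('/tmp/human_eval.tsv',
quote = '', escape = '\', sep = '\t', header = false, strict_mode = false,
columns = {'task_id': 'VARCHAR', 'prompt': 'VARCHAR', 'entry_point': 'VARCHAR', 'canonical_solution': 'VARCHAR', 'test': 'VARCHAR'});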

@@ -0,0 +1,15 @@
# name: test/sql/copy/csv/unquoted_escape/identical.test
# description: Ensure that the identical quote & escape case works as before and is not affected by the handling of unquoted escaped values.
# group: [unquoted_escape]

statement ok
PRAGMA enable_verification

query TT
SELECT concat('#', columns(*), '#') FROM read_csv('data/csv/unquoted_escape/identical.csv', quote = '"', escape = '"', sep = ',', strict_mode = false);
----
## #a""b#
#c""d# ##
#ef,"gh"# ##
## #"ij",kl#
#mn"# #op""#
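
When quote and escape are the same character, a doubled quote inside a quoted field denotes one literal quote (the usual RFC 4180 convention); this test pins down that the unquoted-escape handling leaves that behavior untouched. A small sketch of the same idea, assuming a writable path (file name illustrative):

-- Write a row whose second field contains an embedded quote; the writer quotes it and doubles the quote.
COPY (SELECT 'x' AS a, 'a"b' AS b) TO '/tmp/identical_demo.csv' (HEADER false, QUOTE '"', ESCAPE '"');
-- The file should contain:  x,"a""b"

-- Reading with quote = escape = '"' yields a = 'x', b = 'a"b'.
SELECT * FROM read_csv('/tmp/identical_demo.csv', quote = '"', escape = '"', sep = ',', header = false) t(a, b);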

@@ -0,0 +1,16 @@
# name: test/sql/copy/csv/unquoted_escape/mixed.test
# description: Test the parsing of escaped values inside and outside quotes in the same file
# group: [unquoted_escape]

statement ok
PRAGMA enable_verification

# replace CRLF with LF to pass the test on Windows
query III
SELECT
hamming(replace(string_agg(w, '|' ORDER BY y), E'\r\n', E'\n'), E'\\|,|"|\n'),
hamming(string_agg(z, '|' ORDER BY y), '"|"a"|"b|c"'),
bool_and(x = concat(w, '"', w))::int
FROM read_csv('data/csv/unquoted_escape/mixed.csv', quote = '"', escape = '\', sep = ',', strict_mode = false);
----
0 0 1
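
The expected output relies on hamming(), which counts the positions at which two equal-length strings differ (and errors if the lengths differ), so a distance of 0 means the aggregated column matches the expected literal exactly. For example:

SELECT hamming('abc', 'abc') AS same, hamming('abc', 'abd') AS different;
-- same = 0, different = 1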