should be it

2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletion

@@ -0,0 +1,24 @@
# name: test/sql/copy/csv/unquoted_escape/32k_rows.test_slow
# description: Test a large number of rows with unquoted escape characters
# group: [unquoted_escape]

statement ok
PRAGMA enable_verification

statement ok
CREATE TABLE pseudorandom_fields AS SELECT i, list_select(
[E'\\', E'\t', E'\n', E'\\\\', E'\\\t', E'\\\n', E'\t\\', E'\t\t', E'\t\n', E'\n\\', E'\n\t', E'\n\n', '', 'a', 'b', 'c'],
list_transform([hash(i), hash(i * 7 + 251), hash(i * i + 4093)], lambda x: (x % 16 + 1)::int)
) AS s FROM range(32 << 10) t(i);

statement ok
COPY (SELECT concat_ws(E'\t', s1, i, s2, s3) FROM (
SELECT i, replace(replace(replace(columns(* exclude (i)), '\', '\\'), E'\t', E'\\\t'), E'\n', E'\\\n')
FROM (SELECT i, s[1] s1, s[2] s2, s[3] s3 FROM pseudorandom_fields)
)) TO '__TEST_DIR__/pseudorandom_fields.tsv' (HEADER false, QUOTE '', ESCAPE '');

query II
SELECT log2(count(*))::int, bool_and(s[1] = f1 AND s[2] = f2 AND s[3] = f3)::int
FROM pseudorandom_fields JOIN read_csv('__TEST_DIR__/pseudorandom_fields.tsv', quote = '', sep = '\t', escape = '\', header = false, strict_mode = false) t(f1, j, f2, f3) ON i = j;
----
15 1
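
The test above round-trips values containing backslashes, tabs, and newlines through an unquoted TSV: the writer disables quoting (QUOTE '' ESCAPE '') and escapes the three special characters by hand, and the reader undoes the escaping with quote = '' and escape = '\'. A minimal sketch of the same round trip on a single value (the demo table name and the /tmp output path are illustrative, not part of the test):

-- One value containing a tab, a newline, and a backslash.
CREATE OR REPLACE TABLE demo AS SELECT 42 AS i, E'a\tb\nc\\d' AS s;

-- Escape the backslash first, then the tab and newline, and write without quoting.
COPY (
SELECT concat_ws(E'\t', i, replace(replace(replace(s, '\', '\\'), E'\t', E'\\\t'), E'\n', E'\\\n'))
FROM demo
) TO '/tmp/demo.tsv' (HEADER false, QUOTE '', ESCAPE '');

-- Reading back with quote = '' and escape = '\' restores the original value.
SELECT t.s = demo.s AS round_trip_ok
FROM demo
JOIN read_csv('/tmp/demo.tsv', quote = '', escape = '\', sep = '\t', header = false, strict_mode = false) t(j, s)
ON demo.i = t.j;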

@@ -0,0 +1,43 @@
# name: test/sql/copy/csv/unquoted_escape/basic.test
# description: Test the parsing of unquoted escape characters
# group: [unquoted_escape]

statement ok
PRAGMA enable_verification

query IT
SELECT * FROM read_csv('data/csv/unquoted_escape/plain.csv', escape = '\', sep = ',', strict_mode = false, nullstr = '\N');
----
0 \
1 ,
2 "
3 \,
4 \"
5 ,"
6 \,"
7 NULL
8 \N
9 Na

statement ok
CREATE TABLE special_char(a INT, b STRING);

statement ok
INSERT INTO special_char VALUES
(0, E'\\'), (1, E'\t'), (2, E'\n'),
(3, E'a\\a'), (4, E'b\tb'), (5, E'c\nc'),
(6, E'\\d'), (7, E'\te'), (8, E'\nf'),
(9, E'g\\'), (10, E'h\t'), (11, E'i\n'),
(12, E'\\j'), (13, E'\tk'), (14, E'\nl'),
(15, E'\\\\'), (16, E'\t\t'), (17, E'\n\n'),
(18, E'\\\t\n');

loop buffer_size 10 25

# replace CRLF with LF to pass the test on Windows
query I
SELECT bool_and(b = replace(s, E'\r\n', E'\n'))::int FROM special_char JOIN read_csv('data/csv/unquoted_escape/basic.tsv', quote = '', escape = '\', sep = '\t', strict_mode = false) t (i, s, j) ON i = a;
----
1

endloop
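
The loop above re-runs the check for a range of buffer_size values. The intent is presumably to vary the CSV scan buffer so that escape sequences land on buffer boundaries; read_csv exposes a buffer_size parameter (in bytes) for that, though the exact wiring of the loop variable is not visible in this hunk. A hedged sketch of forcing a small buffer (the byte count is illustrative):

-- Hypothetical: shrink the scan buffer so escaped characters can straddle buffer boundaries.
SELECT bool_and(b = replace(s, E'\r\n', E'\n'))::int
FROM special_char
JOIN read_csv('data/csv/unquoted_escape/basic.tsv', quote = '', escape = '\', sep = '\t', strict_mode = false, buffer_size = 1024) t (i, s, j)
ON i = a;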

@@ -0,0 +1,86 @@
# name: test/sql/copy/csv/unquoted_escape/human_eval.test
# description: Test the parsing of unquoted escape characters
# group: [unquoted_escape]
#
# The data file is generated by the following workflow:
#
# duckdb -c "COPY (SELECT REPLACE(COLUMNS(*), ' ', E'\t') FROM read_ndjson_auto('https://raw.githubusercontent.com/openai/human-eval/refs/heads/master/data/HumanEval.jsonl.gz')) to 'HumanEval.csv'"
#
# docker run --rm -d --name tmp-gen-csv \
# -e MYSQL_ROOT_PASSWORD=root \
# -p 13316:3306 \
# mysql:latest \
# mysqld --secure-file-priv=/tmp
#
# mysql -h127.0.0.1 -uroot -proot -P13316 --local-infile <<EOF
# CREATE DATABASE human_eval;
# USE human_eval;
# CREATE TABLE t (task_id TEXT, prompt TEXT, entry_point TEXT, canonical_solution TEXT, test TEXT);
# SET GLOBAL local_infile = 1;
# LOAD DATA LOCAL INFILE './HumanEval.csv' INTO TABLE t FIELDS TERMINATED BY ',' ENCLOSED BY '"' ESCAPED BY '"' IGNORE 1 LINES;
# TABLE t LIMIT 10 INTO OUTFILE '/tmp/human_eval.tsv';
# TABLE t LIMIT 10 INTO OUTFILE '/tmp/human_eval.csv' FIELDS TERMINATED BY ',';
# EOF
#
# docker cp tmp-gen-csv:/tmp/human_eval.tsv .
# docker cp tmp-gen-csv:/tmp/human_eval.csv .
#
# docker kill tmp-gen-csv

require httpfs

require json

statement ok
PRAGMA enable_verification

statement ok
CREATE TABLE human_eval_jsonl AS
SELECT REPLACE(COLUMNS(*), ' ', E'\t') FROM read_ndjson_auto(
'https://raw.githubusercontent.com/openai/human-eval/refs/heads/master/data/HumanEval.jsonl.gz');

# keep the first 10 rows
statement ok
DELETE FROM human_eval_jsonl WHERE split_part(task_id, '/', 2)::int >= 10;

statement ok
CREATE TABLE human_eval_csv(task_id TEXT, prompt TEXT, entry_point TEXT, canonical_solution TEXT, test TEXT);

statement ok
CREATE TABLE human_eval_tsv(task_id TEXT, prompt TEXT, entry_point TEXT, canonical_solution TEXT, test TEXT);

loop buffer_size 10 25

statement ok
TRUNCATE human_eval_csv;

statement ok
TRUNCATE human_eval_tsv;

# replace the CRLF with LF to pass the test on Windows
statement ok
INSERT INTO human_eval_csv
SELECT replace(COLUMNS(*), E'\r\n', E'\n')
FROM read_csv('data/csv/unquoted_escape/human_eval.csv', quote = '', escape = '\', sep = ',', header = false, strict_mode = false);

statement ok
INSERT INTO human_eval_tsv
SELECT replace(COLUMNS(*), E'\r\n', E'\n')
FROM read_csv('data/csv/unquoted_escape/human_eval.tsv', quote = '', escape = '\', sep = '\t', header = false, strict_mode = false);

# Verify that the three copies are the same
query II
SELECT count(*), bool_and(
j.task_id = c.task_id AND j.task_id = t.task_id AND
j.prompt = c.prompt AND j.prompt = t.prompt AND
j.entry_point = c.entry_point AND j.entry_point = t.entry_point AND
j.canonical_solution = c.canonical_solution AND j.canonical_solution = t.canonical_solution AND
j.test = c.test AND j.test = t.test
)::int
FROM human_eval_jsonl j, human_eval_csv c, human_eval_tsv t
WHERE j.task_id = c.task_id AND j.task_id = t.task_id
----
10 1

endloop
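
The fixture files here come from MySQL's SELECT ... INTO OUTFILE, whose default export format is tab-separated, backslash-escaped, and unquoted (NULLs written as \N), which is exactly the shape the unquoted-escape reader targets with quote = '' and escape = '\'. A hedged sketch of reading such a dump with explicit column types (the path and column list are illustrative, not the test's):

-- Reading a MySQL INTO OUTFILE dump (default format: tab-separated, backslash-escaped, unquoted).
SELECT *
FROM read_csv('/tmp/human_eval.tsv',
quote = '', escape = '\', sep = '\t', header = false, strict_mode = false,
columns = {'task_id': 'VARCHAR', 'prompt': 'VARCHAR', 'entry_point': 'VARCHAR', 'canonical_solution': 'VARCHAR', 'test': 'VARCHAR'});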

@@ -0,0 +1,15 @@
# name: test/sql/copy/csv/unquoted_escape/identical.test
# description: Ensure that the identical quote & escape case works as before and is not affected by the handling of unquoted escaped values.
# group: [unquoted_escape]

statement ok
PRAGMA enable_verification

query TT
SELECT concat('#', columns(*), '#') FROM read_csv('data/csv/unquoted_escape/identical.csv', quote = '"', escape = '"', sep = ',', strict_mode = false);
----
## #a""b#
#c""d# ##
#ef,"gh"# ##
## #"ij",kl#
#mn"# #op""#
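
When quote and escape are the same character, a doubled quote inside a quoted field denotes one literal quote (the usual RFC 4180 convention); this test pins down that the unquoted-escape handling leaves that behavior untouched. A small sketch of the same idea, assuming a writable path (file name illustrative):

-- Write a row whose second field contains an embedded quote; the writer quotes it and doubles the quote.
COPY (SELECT 'x' AS a, 'a"b' AS b) TO '/tmp/identical_demo.csv' (HEADER false, QUOTE '"', ESCAPE '"');
-- The file should contain:  x,"a""b"

-- Reading with quote = escape = '"' yields a = 'x', b = 'a"b'.
SELECT * FROM read_csv('/tmp/identical_demo.csv', quote = '"', escape = '"', sep = ',', header = false) t(a, b);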

@@ -0,0 +1,16 @@
# name: test/sql/copy/csv/unquoted_escape/mixed.test
# description: Test the parsing of escaped values inside and outside quotes in the same file
# group: [unquoted_escape]

statement ok
PRAGMA enable_verification

# replace CRLF with LF to pass the test on Windows
query III
SELECT
hamming(replace(string_agg(w, '|' ORDER BY y), E'\r\n', E'\n'), E'\\|,|"|\n'),
hamming(string_agg(z, '|' ORDER BY y), '"|"a"|"b|c"'),
bool_and(x = concat(w, '"', w))::int
FROM read_csv('data/csv/unquoted_escape/mixed.csv', quote = '"', escape = '\', sep = ',', strict_mode = false);
----
0 0 1
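
The expected output relies on hamming(), which counts the positions at which two equal-length strings differ (and errors if the lengths differ), so a distance of 0 means the aggregated column matches the expected literal exactly. For example:

SELECT hamming('abc', 'abc') AS same, hamming('abc', 'abd') AS different;
-- same = 0, different = 1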