This adds a couple of tests that trigger encoding conversion in COPY
FROM/TO when the input and server encodings do not match, in other words
when need_transcoding is set to true in the COPY state data. These tests
rely on UTF8 <-> LATIN1 for the valid cases, as LATIN1 accepts any bytes,
and on UTF8 <-> EUC_JP for the invalid cases, where the input contains a
byte sequence that is not valid in EUC_JP, causing a conversion failure.
Both ENCODING and client_encoding are covered. Test suggested by Andres
Freund.
Author: Sutou Kouhei
Discussion: https://postgr.es/m/20240206222445[email protected]
--- /dev/null
+--
+-- Test cases for encoding with COPY commands
+--
+-- skip test if not UTF8 server encoding
+SELECT getdatabaseencoding() <> 'UTF8'
+ AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- directory paths are passed to us in environment variables
+\getenv abs_builddir PG_ABS_BUILDDIR
+\set utf8_csv :abs_builddir '/results/copyencoding_utf8.csv'
+CREATE TABLE copy_encoding_tab (t text);
+-- Valid cases
+-- Use ENCODING option
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
+-- Read UTF8 data as LATIN1: no error
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
+-- Use client_encoding
+SET client_encoding TO UTF8;
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
+-- Read UTF8 data as LATIN1: no error
+SET client_encoding TO LATIN1;
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
+RESET client_encoding;
+-- Invalid cases
+-- Use ENCODING explicitly
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
+-- Read UTF8 data as EUC_JP: error, invalid byte sequence
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'EUC_JP');
+ERROR: invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
+CONTEXT: COPY copy_encoding_tab, line 1
+-- Use client_encoding
+SET client_encoding TO UTF8;
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
+-- Read UTF8 data as EUC_JP: error, invalid byte sequence
+SET client_encoding TO EUC_JP;
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
+ERROR: invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
+CONTEXT: COPY copy_encoding_tab, line 1
+RESET client_encoding;
+DROP TABLE copy_encoding_tab;
--- /dev/null
+--
+-- Test cases for encoding with COPY commands
+--
+-- skip test if not UTF8 server encoding
+SELECT getdatabaseencoding() <> 'UTF8'
+ AS skip_test \gset
+\if :skip_test
+\quit
# execute two copy tests in parallel, to check that copy itself
# is concurrent safe.
# ----------
-test: copy copyselect copydml insert insert_conflict
+test: copy copyselect copydml copyencoding insert insert_conflict
# ----------
# More groups of parallel tests
--- /dev/null
+--
+-- Test cases for encoding with COPY commands
+--
+
+-- skip test if not UTF8 server encoding
+SELECT getdatabaseencoding() <> 'UTF8'
+ AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- directory paths are passed to us in environment variables
+\getenv abs_builddir PG_ABS_BUILDDIR
+
+\set utf8_csv :abs_builddir '/results/copyencoding_utf8.csv'
+
+CREATE TABLE copy_encoding_tab (t text);
+
+-- Valid cases
+
+-- Use ENCODING option
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
+-- Read UTF8 data as LATIN1: no error
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
+
+-- Use client_encoding
+SET client_encoding TO UTF8;
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
+-- Read UTF8 data as LATIN1: no error
+SET client_encoding TO LATIN1;
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
+RESET client_encoding;
+
+-- Invalid cases
+
+-- Use ENCODING explicitly
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
+-- Read UTF8 data as EUC_JP: error, invalid byte sequence
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'EUC_JP');
+
+-- Use client_encoding
+SET client_encoding TO UTF8;
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
+-- Read UTF8 data as EUC_JP: error, invalid byte sequence
+SET client_encoding TO EUC_JP;
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
+RESET client_encoding;
+
+DROP TABLE copy_encoding_tab;