Support LIKE with nondeterministic collations

author Peter Eisentraut <[email protected]>
Wed, 27 Nov 2024 07:18:35 +0000 (08:18 +0100)
committer Peter Eisentraut <[email protected]>
Wed, 27 Nov 2024 07:19:42 +0000 (08:19 +0100)
diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
index f5e115e8d6e6a2795eb8857f5cd8a9ab41e276a5..00e1986849aedf0bbe1c27890e5ff61bcc05eb33 100644 (file)
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -1197,7 +1197,7 @@ CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-tr
       to a performance penalty.  Note, in particular, that B-tree cannot use
       deduplication with indexes that use a nondeterministic collation.  Also,
       certain operations are not possible with nondeterministic collations,
-     such as pattern matching operations.  Therefore, they should be used
+     such as some pattern matching operations.  Therefore, they should be used
       only in cases where they are specifically wanted.
      </para>
  
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 13ccbe7d78c26e947112586da19a1e904b487e95..8b81106fa234b68fbff227152010a7e7a1df02ae 100644 (file)
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -5414,9 +5414,10 @@ cast(-44 as bit(12))           <lineannotation>111111010100</lineannotation>
     </caution>
  
     <para>
-    The pattern matching operators of all three kinds do not support
-    nondeterministic collations.  If required, apply a different collation to
-    the expression to work around this limitation.
+    <function>SIMILAR TO</function> and <acronym>POSIX</acronym>-style regular
+    expressions do not support nondeterministic collations.  If required, use
+    <function>LIKE</function> or apply a different collation to the expression
+    to work around this limitation.
     </para>
  
    <sect2 id="functions-like">
@@ -5462,6 +5463,46 @@ cast(-44 as bit(12))           <lineannotation>111111010100</lineannotation>
  </programlisting>
     </para>
  
+   <para>
+    <function>LIKE</function> pattern matching supports nondeterministic
+    collations (see <xref linkend="collation-nondeterministic"/>), such as
+    case-insensitive collations or collations that, say, ignore punctuation.
+    So with a case-insensitive collation, one could have:
+<programlisting>
+'AbC' LIKE 'abc' COLLATE case_insensitive    <lineannotation>true</lineannotation>
+'AbC' LIKE 'a%' COLLATE case_insensitive     <lineannotation>true</lineannotation>
+</programlisting>
+    With collations that ignore certain characters or in general that consider
+    strings of different lengths equal, the semantics can become a bit more
+    complicated.  Consider these examples:
+<programlisting>
+'.foo.' LIKE 'foo' COLLATE ign_punct    <lineannotation>true</lineannotation>
+'.foo.' LIKE 'f_o' COLLATE ign_punct    <lineannotation>true</lineannotation>
+'.foo.' LIKE '_oo' COLLATE ign_punct    <lineannotation>false</lineannotation>
+</programlisting>
+    The way the matching works is that the pattern is partitioned into
+    sequences of wildcards and non-wildcard strings (wildcards being
+    <literal>_</literal> and <literal>%</literal>).  For example, the pattern
+    <literal>f_o</literal> is partitioned into <literal>f, _, o</literal>, the
+    pattern <literal>_oo</literal> is partitioned into <literal>_,
+    oo</literal>.  The input string matches the pattern if it can be
+    partitioned in such a way that the wildcards match one character or any
+    number of characters respectively and the non-wildcard partitions are
+    equal under the applicable collation.  So for example, <literal>'.foo.'
+    LIKE 'f_o' COLLATE ign_punct</literal> is true because one can partition
+    <literal>.foo.</literal> into <literal>.f, o, o.</literal>, and then
+    <literal>'.f' = 'f' COLLATE ign_punct</literal>, <literal>'o'</literal>
+    matches the <literal>_</literal> wildcard, and <literal>'o.' = 'o' COLLATE
+    ign_punct</literal>.  But <literal>'.foo.' LIKE '_oo' COLLATE
+    ign_punct</literal> is false because <literal>.foo.</literal> cannot be
+    partitioned in a way that the first character is any character and the
+    rest of the string compares equal to <literal>oo</literal>.  (Note that
+    the single-character wildcard always matches exactly one character,
+    independent of the collation.  So in this example, the
+    <literal>_</literal> would match <literal>.</literal>, but then the rest
+    of the input string won't match the rest of the pattern.)
+   </para>
+
     <para>
      <function>LIKE</function> pattern matching always covers the entire
      string.  Therefore, if it's desired to match a sequence anywhere within
@@ -5503,8 +5544,9 @@ cast(-44 as bit(12))           <lineannotation>111111010100</lineannotation>
  
     <para>
      The key word <token>ILIKE</token> can be used instead of
-    <token>LIKE</token> to make the match case-insensitive according
-    to the active locale.  This is not in the <acronym>SQL</acronym> standard but is a
+    <token>LIKE</token> to make the match case-insensitive according to the
+    active locale.  (But this does not support nondeterministic collations.)
+    This is not in the <acronym>SQL</acronym> standard but is a
      <productname>PostgreSQL</productname> extension.
     </para>
  
diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index 0152723b2a6dc87c54e90db2e7842740595d6434..7b3d1b5be71f4b7bf7c183ab9d4399ddfb547de9 100644 (file)
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -147,22 +147,28 @@ SB_lower_char(unsigned char c, pg_locale_t locale)
  static inline int
  GenericMatchText(const char *s, int slen, const char *p, int plen, Oid collation)
  {
-   if (collation)
-   {
-       pg_locale_t locale = pg_newlocale_from_collation(collation);
+   pg_locale_t locale;
  
-       if (!locale->deterministic)
-           ereport(ERROR,
-                   (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                    errmsg("nondeterministic collations are not supported for LIKE")));
+   if (!OidIsValid(collation))
+   {
+       /*
+        * This typically means that the parser could not resolve a conflict
+        * of implicit collations, so report it that way.
+        */
+       ereport(ERROR,
+               (errcode(ERRCODE_INDETERMINATE_COLLATION),
+                errmsg("could not determine which collation to use for LIKE"),
+                errhint("Use the COLLATE clause to set the collation explicitly.")));
     }
  
+   locale = pg_newlocale_from_collation(collation);
+
     if (pg_database_encoding_max_length() == 1)
-       return SB_MatchText(s, slen, p, plen, 0);
+       return SB_MatchText(s, slen, p, plen, locale);
     else if (GetDatabaseEncoding() == PG_UTF8)
-       return UTF8_MatchText(s, slen, p, plen, 0);
+       return UTF8_MatchText(s, slen, p, plen, locale);
     else
-       return MB_MatchText(s, slen, p, plen, 0);
+       return MB_MatchText(s, slen, p, plen, locale);
  }
  
  static inline int
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c
index f561cc15e4c76e5e58aa7e934512179dfa24899c..afe5406cf40945d00272790b701409bd4b6a78e9 100644 (file)
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -157,7 +157,9 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
              * the first pattern byte to each text byte to avoid recursing
              * more than we have to.  This fact also guarantees that we don't
              * have to consider a match to the zero-length substring at the
-            * end of the text.
+            * end of the text.  With a nondeterministic collation, we can't
+            * rely on the first bytes being equal, so we have to recurse in
+            * any case.
              */
             if (*p == '\\')
             {
@@ -172,7 +174,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
  
             while (tlen > 0)
             {
-               if (GETCHAR(*t, locale) == firstpat)
+               if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic))
                 {
                     int         matched = MatchText(t, tlen, p, plen, locale);
  
@@ -196,6 +198,149 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
             NextByte(p, plen);
             continue;
         }
+       else if (locale && !locale->deterministic)
+       {
+           /*
+            * For nondeterministic locales, we find the next substring of the
+            * pattern that does not contain wildcards and try to find a
+            * matching substring in the text.  Crucially, we cannot do this
+            * character by character, as in the normal case, but must do it
+            * substring by substring, partitioned by the wildcard characters.
+            * (This is per SQL standard.)
+            */
+           const char *p1;
+           size_t      p1len;
+           const char *t1;
+           size_t      t1len;
+           bool        found_escape;
+           const char *subpat;
+           size_t      subpatlen;
+           char       *buf = NULL;
+
+           /*
+            * Determine next substring of pattern without wildcards.  p is
+            * the start of the subpattern, p1 is one past the last byte. Also
+            * track if we found an escape character.
+            */
+           p1 = p;
+           p1len = plen;
+           found_escape = false;
+           while (p1len > 0)
+           {
+               if (*p1 == '\\')
+               {
+                   found_escape = true;
+                   NextByte(p1, p1len);
+                   if (p1len == 0)
+                       ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+                                errmsg("LIKE pattern must not end with escape character")));
+               }
+               else if (*p1 == '_' || *p1 == '%')
+                   break;
+               NextByte(p1, p1len);
+           }
+
+           /*
+            * If we found an escape character, then make an unescaped copy of
+            * the subpattern.
+            */
+           if (found_escape)
+           {
+               char       *b;
+
+               b = buf = palloc(p1 - p);
+               for (const char *c = p; c < p1; c++)
+               {
+                   if (*c == '\\')
+                       ;
+                   else
+                       *(b++) = *c;
+               }
+
+               subpat = buf;
+               subpatlen = b - buf;
+           }
+           else
+           {
+               subpat = p;
+               subpatlen = p1 - p;
+           }
+
+           /*
+            * Shortcut: If this is the end of the pattern, then the rest of
+            * the text has to match the rest of the pattern.
+            */
+           if (p1len == 0)
+           {
+               int         cmp;
+
+               cmp = pg_strncoll(subpat, subpatlen, t, tlen, locale);
+
+               if (buf)
+                   pfree(buf);
+               if (cmp == 0)
+                   return LIKE_TRUE;
+               else
+                   return LIKE_FALSE;
+           }
+
+           /*
+            * Now build a substring of the text and try to match it against
+            * the subpattern.  t is the start of the text, t1 is one past the
+            * last byte.  We start with a zero-length string.
+            */
+           t1 = t;
+           t1len = tlen;
+           for (;;)
+           {
+               int         cmp;
+
+               CHECK_FOR_INTERRUPTS();
+
+               cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale);
+
+               /*
+                * If we found a match, we have to test if the rest of pattern
+                * can match against the rest of the string.  Otherwise we
+                * have to continue here try matching with a longer substring.
+                * (This is similar to the recursion for the '%' wildcard
+                * above.)
+                *
+                * Note that we can't just wind forward p and t and continue
+                * with the main loop.  This would fail for example with
+                *
+                * U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents
+                *
+                * You'd find that t=\0061 matches p=\00E4, but then the rest
+                * won't match; but t=\0061\0308 also matches p=\00E4, and
+                * then the rest will match.
+                */
+               if (cmp == 0)
+               {
+                   int         matched = MatchText(t1, t1len, p1, p1len, locale);
+
+                   if (matched == LIKE_TRUE)
+                   {
+                       if (buf)
+                           pfree(buf);
+                       return matched;
+                   }
+               }
+
+               /*
+                * Didn't match.  If we used up the whole text, then the match
+                * fails.  Otherwise, try again with a longer substring.
+                */
+               if (t1len == 0)
+                   return LIKE_FALSE;
+               else
+                   NextChar(t1, t1len);
+           }
+           if (buf)
+               pfree(buf);
+           continue;
+       }
         else if (GETCHAR(*p, locale) != GETCHAR(*t, locale))
         {
             /* non-wildcard pattern char fails to match text char */
diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c
index 8b15509a3bf6950daf5933621a4ac1686b8ff3d1..ee71ca89ffdab69a44aef8a684fef6931108f114 100644 (file)
--- a/src/backend/utils/adt/like_support.c
+++ b/src/backend/utils/adt/like_support.c
@@ -272,22 +272,6 @@ match_pattern_prefix(Node *leftop,
         return NIL;
     patt = (Const *) rightop;
  
-   /*
-    * Not supported if the expression collation is nondeterministic.  The
-    * optimized equality or prefix tests use bytewise comparisons, which is
-    * not consistent with nondeterministic collations.  The actual
-    * pattern-matching implementation functions will later error out that
-    * pattern-matching is not supported with nondeterministic collations. (We
-    * could also error out here, but by doing it later we get more precise
-    * error messages.)  (It should be possible to support at least
-    * Pattern_Prefix_Exact, but no point as long as the actual
-    * pattern-matching implementations don't support it.)
-    *
-    * expr_coll is not set for a non-collation-aware data type such as bytea.
-    */
-   if (expr_coll && !get_collation_isdeterministic(expr_coll))
-       return NIL;
-
     /*
      * Try to extract a fixed prefix from the pattern.
      */
@@ -404,6 +388,8 @@ match_pattern_prefix(Node *leftop,
     {
         if (!op_in_opfamily(eqopr, opfamily))
             return NIL;
+       if (indexcollation != expr_coll)
+           return NIL;
         expr = make_opclause(eqopr, BOOLOID, false,
                              (Expr *) leftop, (Expr *) prefix,
                              InvalidOid, indexcollation);
@@ -411,6 +397,17 @@ match_pattern_prefix(Node *leftop,
         return result;
     }
  
+   /*
+    * Anything other than Pattern_Prefix_Exact is not supported if the
+    * expression collation is nondeterministic.  The optimized equality or
+    * prefix tests use bytewise comparisons, which is not consistent with
+    * nondeterministic collations.
+    *
+    * expr_coll is not set for a non-collation-aware data type such as bytea.
+    */
+   if (expr_coll && !get_collation_isdeterministic(expr_coll))
+       return NIL;
+
     /*
      * Otherwise, we have a nonempty required prefix of the values.  Some
      * opclasses support prefix checks directly, otherwise we'll try to
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index de17f7db6ce3bb6e25493f33eed92129e378442f..79e76d4b850f9701dc05d601017b088d6e029b53 100644 (file)
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1274,6 +1274,30 @@ CREATE COLLATION ctest_det (provider = icu, locale = '', deterministic = true);
  NOTICE:  using standard form "und" for ICU locale ""
  CREATE COLLATION ctest_nondet (provider = icu, locale = '', deterministic = false);
  NOTICE:  using standard form "und" for ICU locale ""
+SELECT 'abc' LIKE 'abc' COLLATE ctest_det;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT 'abc' LIKE 'a\bc' COLLATE ctest_det;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT 'abc' LIKE 'abc' COLLATE ctest_nondet;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT 'abc' LIKE 'a\bc' COLLATE ctest_nondet;
+ ?column? 
+----------
+ t
+(1 row)
+
  CREATE TABLE test6 (a int, b text);
  -- same string in different normal forms
  INSERT INTO test6 VALUES (1, U&'\00E4bc');
@@ -1298,6 +1322,19 @@ SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_nondet;
   2 | äbc
  (2 rows)
  
+SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_det;
+ a |  b  
+---+-----
+ 1 | äbc
+(1 row)
+
+SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_nondet;
+ a |  b  
+---+-----
+ 1 | äbc
+ 2 | äbc
+(2 rows)
+
  -- same with arrays
  CREATE TABLE test6a (a int, b text[]);
  INSERT INTO test6a VALUES (1, ARRAY[U&'\00E4bc']);
@@ -1514,7 +1551,12 @@ SELECT x FROM test3ci WHERE x <> 'abc';
  (2 rows)
  
  SELECT x FROM test3ci WHERE x LIKE 'a%';
-ERROR:  nondeterministic collations are not supported for LIKE
+  x  
+-----
+ abc
+ ABC
+(2 rows)
+
  SELECT x FROM test3ci WHERE x ILIKE 'a%';
  ERROR:  nondeterministic collations are not supported for ILIKE
  SELECT x FROM test3ci WHERE x SIMILAR TO 'a%';
@@ -1632,7 +1674,12 @@ SELECT x FROM test3bpci WHERE x <> 'abc';
  (2 rows)
  
  SELECT x FROM test3bpci WHERE x LIKE 'a%';
-ERROR:  nondeterministic collations are not supported for LIKE
+  x  
+-----
+ abc
+ ABC
+(2 rows)
+
  SELECT x FROM test3bpci WHERE x ILIKE 'a%';
  ERROR:  nondeterministic collations are not supported for ILIKE
  SELECT x FROM test3bpci WHERE x SIMILAR TO 'a%';
@@ -1729,7 +1776,7 @@ SELECT string_to_array('ABCDEFGHI'::char(9) COLLATE case_insensitive, NULL, 'b')
  -- This tests the issue described in match_pattern_prefix().  In the
  -- absence of that check, the case_insensitive tests below would
  -- return no rows where they should logically return one.
-CREATE TABLE test4c (x text COLLATE "C");
+CREATE TABLE test4c (x text COLLATE case_insensitive);
  INSERT INTO test4c VALUES ('abc');
  CREATE INDEX ON test4c (x);
  SET enable_seqscan = off;
@@ -1743,10 +1790,18 @@ SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_sensitive;  -- ok, no rows
  ---
  (0 rows)
  
-SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_insensitive;  -- error
-ERROR:  nondeterministic collations are not supported for LIKE
-SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_insensitive;  -- error
-ERROR:  nondeterministic collations are not supported for LIKE
+SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_insensitive;  -- ok
+  x  
+-----
+ abc
+(1 row)
+
+SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_insensitive;  -- ok
+  x  
+-----
+ abc
+(1 row)
+
  RESET enable_seqscan;
  -- Unicode special case: different variants of Greek lower case sigma.
  -- A naive implementation like citext that just does lower(x) =
@@ -1840,6 +1895,126 @@ SELECT * FROM test4 WHERE b = 'Cote' COLLATE case_insensitive;
   1 | cote
  (1 row)
  
+-- This is a tricky one.  A naive implementation would first test
+-- \00E4 matches \0061, which is true under ignore_accents, but then
+-- the rest of the string won't match anymore.  Therefore, the
+-- algorithm has to test whether the rest of the string matches, and
+-- if not try matching \00E4 against a longer substring like
+-- \0061\0308, which will then work out.
+SELECT U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- and in reverse:
+SELECT U&'\00E4bc' LIKE U&'\0061\0308_c' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- inner % matches b:
+SELECT U&'\0061\0308bc' LIKE U&'\00E4%c' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- inner %% matches b then zero:
+SELECT U&'\0061\0308bc' LIKE U&'\00E4%%c' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- inner %% matches b then zero:
+SELECT U&'cb\0061\0308' LIKE U&'c%%\00E4' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- trailing _ matches two codepoints that form one grapheme:
+SELECT U&'cb\0061\0308' LIKE U&'cb_' COLLATE ignore_accents;
+ ?column? 
+----------
+ f
+(1 row)
+
+-- trailing __ matches two codepoints that form one grapheme:
+SELECT U&'cb\0061\0308' LIKE U&'cb__' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- leading % matches zero:
+SELECT U&'\0061\0308bc' LIKE U&'%\00E4bc' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- leading % matches zero (with later %):
+SELECT U&'\0061\0308bc' LIKE U&'%\00E4%c' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- trailing % matches zero:
+SELECT U&'\0061\0308bc' LIKE U&'\00E4bc%' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- trailing % matches zero (with previous %):
+SELECT U&'\0061\0308bc' LIKE U&'\00E4%c%' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- _ versus two codepoints that form one grapheme:
+SELECT U&'\0061\0308bc' LIKE U&'_bc' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- (actually this matches because)
+SELECT U&'\0308bc' = 'bc' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- __ matches two codepoints that form one grapheme:
+SELECT U&'\0061\0308bc' LIKE U&'__bc' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- _ matches one codepoint that forms half a grapheme:
+SELECT U&'\0061\0308bc' LIKE U&'_\0308bc' COLLATE ignore_accents;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- doesn't match because \00e4 doesn't match only \0308
+SELECT U&'\0061\0308bc' LIKE U&'_\00e4bc' COLLATE ignore_accents;
+ ?column? 
+----------
+ f
+(1 row)
+
+-- escape character at end of pattern
+SELECT 'foox' LIKE 'foo\' COLLATE ignore_accents;
+ERROR:  LIKE pattern must not end with escape character
  -- foreign keys (mixing different nondeterministic collations not allowed)
  CREATE TABLE test10pk (x text COLLATE case_sensitive PRIMARY KEY);
  CREATE TABLE test10fk (x text COLLATE case_insensitive REFERENCES test10pk (x) ON UPDATE CASCADE ON DELETE CASCADE);  -- error
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 0c9491c260e70565eaa862d769f55da791d9949b..797e93ac714db7809e27da3910bfd9953cb83990 100644 (file)
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -514,6 +514,12 @@ CREATE COLLATION testcoll_rulesx (provider = icu, locale = '', rules = '!!wrong!
  CREATE COLLATION ctest_det (provider = icu, locale = '', deterministic = true);
  CREATE COLLATION ctest_nondet (provider = icu, locale = '', deterministic = false);
  
+SELECT 'abc' LIKE 'abc' COLLATE ctest_det;
+SELECT 'abc' LIKE 'a\bc' COLLATE ctest_det;
+
+SELECT 'abc' LIKE 'abc' COLLATE ctest_nondet;
+SELECT 'abc' LIKE 'a\bc' COLLATE ctest_nondet;
+
  CREATE TABLE test6 (a int, b text);
  -- same string in different normal forms
  INSERT INTO test6 VALUES (1, U&'\00E4bc');
@@ -522,6 +528,9 @@ SELECT * FROM test6;
  SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_det;
  SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_nondet;
  
+SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_det;
+SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_nondet;
+
  -- same with arrays
  CREATE TABLE test6a (a int, b text[]);
  INSERT INTO test6a VALUES (1, ARRAY[U&'\00E4bc']);
@@ -637,14 +646,14 @@ SELECT string_to_array('ABCDEFGHI'::char(9) COLLATE case_insensitive, NULL, 'b')
  -- This tests the issue described in match_pattern_prefix().  In the
  -- absence of that check, the case_insensitive tests below would
  -- return no rows where they should logically return one.
-CREATE TABLE test4c (x text COLLATE "C");
+CREATE TABLE test4c (x text COLLATE case_insensitive);
  INSERT INTO test4c VALUES ('abc');
  CREATE INDEX ON test4c (x);
  SET enable_seqscan = off;
  SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_sensitive;  -- ok, no rows
  SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_sensitive;  -- ok, no rows
-SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_insensitive;  -- error
-SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_insensitive;  -- error
+SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_insensitive;  -- ok
+SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_insensitive;  -- ok
  RESET enable_seqscan;
  
  -- Unicode special case: different variants of Greek lower case sigma.
@@ -687,6 +696,46 @@ SELECT * FROM test4 WHERE b = 'cote' COLLATE ignore_accents;
  SELECT * FROM test4 WHERE b = 'Cote' COLLATE ignore_accents;  -- still case-sensitive
  SELECT * FROM test4 WHERE b = 'Cote' COLLATE case_insensitive;
  
+-- This is a tricky one.  A naive implementation would first test
+-- \00E4 matches \0061, which is true under ignore_accents, but then
+-- the rest of the string won't match anymore.  Therefore, the
+-- algorithm has to test whether the rest of the string matches, and
+-- if not try matching \00E4 against a longer substring like
+-- \0061\0308, which will then work out.
+SELECT U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents;
+-- and in reverse:
+SELECT U&'\00E4bc' LIKE U&'\0061\0308_c' COLLATE ignore_accents;
+-- inner % matches b:
+SELECT U&'\0061\0308bc' LIKE U&'\00E4%c' COLLATE ignore_accents;
+-- inner %% matches b then zero:
+SELECT U&'\0061\0308bc' LIKE U&'\00E4%%c' COLLATE ignore_accents;
+-- inner %% matches b then zero:
+SELECT U&'cb\0061\0308' LIKE U&'c%%\00E4' COLLATE ignore_accents;
+-- trailing _ matches two codepoints that form one grapheme:
+SELECT U&'cb\0061\0308' LIKE U&'cb_' COLLATE ignore_accents;
+-- trailing __ matches two codepoints that form one grapheme:
+SELECT U&'cb\0061\0308' LIKE U&'cb__' COLLATE ignore_accents;
+-- leading % matches zero:
+SELECT U&'\0061\0308bc' LIKE U&'%\00E4bc' COLLATE ignore_accents;
+-- leading % matches zero (with later %):
+SELECT U&'\0061\0308bc' LIKE U&'%\00E4%c' COLLATE ignore_accents;
+-- trailing % matches zero:
+SELECT U&'\0061\0308bc' LIKE U&'\00E4bc%' COLLATE ignore_accents;
+-- trailing % matches zero (with previous %):
+SELECT U&'\0061\0308bc' LIKE U&'\00E4%c%' COLLATE ignore_accents;
+-- _ versus two codepoints that form one grapheme:
+SELECT U&'\0061\0308bc' LIKE U&'_bc' COLLATE ignore_accents;
+-- (actually this matches because)
+SELECT U&'\0308bc' = 'bc' COLLATE ignore_accents;
+-- __ matches two codepoints that form one grapheme:
+SELECT U&'\0061\0308bc' LIKE U&'__bc' COLLATE ignore_accents;
+-- _ matches one codepoint that forms half a grapheme:
+SELECT U&'\0061\0308bc' LIKE U&'_\0308bc' COLLATE ignore_accents;
+-- doesn't match because \00e4 doesn't match only \0308
+SELECT U&'\0061\0308bc' LIKE U&'_\00e4bc' COLLATE ignore_accents;
+-- escape character at end of pattern
+SELECT 'foox' LIKE 'foo\' COLLATE ignore_accents;
+
  -- foreign keys (mixing different nondeterministic collations not allowed)
  CREATE TABLE test10pk (x text COLLATE case_sensitive PRIMARY KEY);
  CREATE TABLE test10fk (x text COLLATE case_insensitive REFERENCES test10pk (x) ON UPDATE CASCADE ON DELETE CASCADE);  -- error
author	Peter Eisentraut <[email protected]>
	Wed, 27 Nov 2024 07:18:35 +0000 (08:18 +0100)
committer	Peter Eisentraut <[email protected]>
	Wed, 27 Nov 2024 07:19:42 +0000 (08:19 +0100)
doc/src/sgml/charset.sgml		\| blob \| blame \| history
doc/src/sgml/func.sgml		\| blob \| blame \| history
src/backend/utils/adt/like.c		\| blob \| blame \| history
src/backend/utils/adt/like_match.c		\| blob \| blame \| history
src/backend/utils/adt/like_support.c		\| blob \| blame \| history
src/test/regress/expected/collate.icu.utf8.out		\| blob \| blame \| history
src/test/regress/sql/collate.icu.utf8.sql		\| blob \| blame \| history