1/*
2   Copyright (c) 2015, MariaDB Foundation
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; version 2 of the License.
7
8   This program is distributed in the hope that it will be useful,
9   but WITHOUT ANY WARRANTY; without even the implied warranty of
10   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   GNU General Public License for more details.
12
13   You should have received a copy of the GNU General Public License
14   along with this program; if not, write to the Free Software
15   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1335  USA
16*/
17
18#ifndef MY_FUNCTION_NAME
19#error MY_FUNCTION_NAME is not defined
20#endif
21
/*
  Define scan_weight(), strnncoll(), strnncollsp() and strnncollsp_nchars()
  by default, unless "#define DEFINE_STRNNCOLL 0" is specified.
*/
26#ifndef DEFINE_STRNNCOLL
27#define DEFINE_STRNNCOLL 1
28#endif
29
30
31/*
32  The weight for automatically padded spaces when comparing strings with
33  the PAD SPACE property.
34  Should normally be equal to the weight of a regular space.
35*/
36#ifndef WEIGHT_PAD_SPACE
37#define WEIGHT_PAD_SPACE  (' ')
38#endif
39
40
41/*
42  Weight of an illegal byte, must follow these rules:
43  1. Must be greater than weight of any normal character in the collation.
44  2. Two different bad bytes must have different weights and must be
45     compared in their binary order.
46
47  Depends on mbmaxlen of the character set, as well as how the collation
48  sorts various single-byte and multi-byte character blocks.
49
50  The macro below is the default definition, it is suitable for mbmaxlen=2
51  character sets that sort all multi-byte characters after all single-byte
52  characters: big5, euckr, gb2312, gbk.
53
54  All mbmaxlen>2 character sets must provide their own definitions.
55  All collations that have a more complex order (than just MB1 followed by MB2)
56  must also provide their own definitions (see definitions for
57  cp932_japanese_ci and sjis_japanese_ci as examples of a more complex order).
58*/
59#ifndef WEIGHT_ILSEQ
60#define WEIGHT_ILSEQ(x)   (0xFF00 + (x))
61#endif
62
63
64#if DEFINE_STRNNCOLL
65
66/**
67  Scan a valid character, or a bad byte, or an auto-padded space
68  from a string and calculate the weight of the scanned sequence.
69
70  @param [OUT] weight - the weight is returned here
71  @param str          - the string
72  @param end          - the end of the string
73  @return             - the number of bytes scanned
74
75  The including source file must define the following macros:
76  IS_MB1_CHAR(b0)          - for character sets that have MB1 characters
77  IS_MB1_MB2HEAD_GAP(b0)   - optional, for better performance
78  IS_MB2_CHAR(b0,b1)       - for character sets that have MB2 characters
79  IS_MB3_CHAR(b0,b1,b2)    - for character sets that have MB3 characters
80  IS_MB4_CHAR(b0,b1,b2,b3) - for character sets with have MB4 characters
81  WEIGHT_PAD_SPACE
82  WEIGHT_MB1(b0)           - for character sets that have MB1 characters
83  WEIGHT_MB2(b0,b1)        - for character sets that have MB2 characters
84  WEIGHT_MB3(b0,b1,b2)     - for character sets that have MB3 characters
85  WEIGHT_MB4(b0,b1,b2,b3)  - for character sets that have MB4 characters
86  WEIGHT_ILSEQ(x)
87*/
88static inline uint
89MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
90{
91  if (str >= end)
92  {
93    *weight= WEIGHT_PAD_SPACE;
94    return 0;
95  }
96
97#ifdef IS_MB1_CHAR
98  if (IS_MB1_CHAR(*str))
99  {
100    *weight= WEIGHT_MB1(*str);           /* A valid single byte character*/
101    return 1;
102  }
103#endif
104
105#ifdef IS_MB1_MBHEAD_UNUSED_GAP
106  /*
107    Quickly filter out unused bytes that are neither MB1 nor MBHEAD.
108    E.g. [0x80..0xC1] in utf8. This allows using simplified conditions
109    in IS_MB2_CHAR(), IS_MB3_CHAR(), etc.
110  */
111  if (IS_MB1_MBHEAD_UNUSED_GAP(*str))
112    goto bad;
113#endif
114
115#ifdef IS_MB2_CHAR
116  if (str + 2 > end)                     /* The string ended unexpectedly */
117    goto bad;                            /* Treat as a bad byte */
118
119  if (IS_MB2_CHAR(str[0], str[1]))
120  {
121    *weight= WEIGHT_MB2(str[0], str[1]);
122    return 2;                            /* A valid two-byte character */
123  }
124#endif
125
126#ifdef IS_MB3_CHAR
127  if (str + 3 > end)                     /* Incomplete three-byte character */
128    goto bad;
129
130  if (IS_MB3_CHAR(str[0], str[1], str[2]))
131  {
132    *weight= WEIGHT_MB3(str[0], str[1], str[2]);
133    return 3;                            /* A valid three-byte character */
134  }
135#endif
136
137#ifdef IS_MB4_CHAR
138  if (str + 4 > end)                     /* Incomplete four-byte character */
139    goto bad;
140
141  if (IS_MB4_CHAR(str[0], str[1], str[2], str[3]))
142  {
143    *weight= WEIGHT_MB4(str[0], str[1], str[2], str[3]);
144    return 4;                            /* A valid four-byte character */
145  }
146
147#endif
148
149bad:
150  *weight= WEIGHT_ILSEQ(str[0]);         /* Bad byte */
151  return 1;
152}
153
154
155/**
156  Compare two strings according to the collation,
157  without handling the PAD SPACE property.
158
159  Note, cs->coll->strnncoll() is usually used to compare identifiers.
160  Perhaps we should eventually (in 10.2?) create a new collation
161  my_charset_utf8_general_ci_no_pad and have only one comparison function
162  in MY_COLLATION_HANDLER.
163
164  @param cs          - the character set and collation
165  @param a           - the left string
166  @param a_length    - the length of the left string
167  @param b           - the right string
168  @param b_length    - the length of the right string
169  @param b_is_prefix - if the caller wants to check if "b" is a prefix of "a"
170  @return            - the comparison result
171*/
172static int
173MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs __attribute__((unused)),
174                            const uchar *a, size_t a_length,
175                            const uchar *b, size_t b_length,
176                            my_bool b_is_prefix)
177{
178  const uchar *a_end= a + a_length;
179  const uchar *b_end= b + b_length;
180  for ( ; ; )
181  {
182    int a_weight, b_weight, res;
183    uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
184    uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
185    /*
186      a_wlen  b_wlen Comment
187      ------  ------ -------
188      0       0      Strings ended simultaneously, "a" and "b" are equal.
189      0       >0     "a" is a prefix of "b", so "a" is smaller.
190      >0      0      "b" is a prefix of "a", check b_is_prefix.
191      >0      >0     Two weights were scanned, check weight difference.
192    */
193    if (!a_wlen)
194      return b_wlen ? -b_weight : 0;
195
196    if (!b_wlen)
197      return b_is_prefix ? 0 : a_weight;
198
199    if ((res= (a_weight - b_weight)))
200      return res;
201    /*
202      None of the strings has ended yet.
203    */
204    DBUG_ASSERT(a < a_end);
205    DBUG_ASSERT(b < b_end);
206    a+= a_wlen;
207    b+= b_wlen;
208  }
209  DBUG_ASSERT(0);
210  return 0;
211}
212
213
214#ifdef DEFINE_STRNNCOLLSP_NOPAD
215
216/**
217  Compare two strings according to the collation, with NO PAD handling.
218
219  @param cs          - the character set and collation
220  @param a           - the left string
221  @param a_length    - the length of the left string
222  @param b           - the right string
223  @param b_length    - the length of the right string
224  @return            - the comparison result
225*/
226static int
227MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
228                                    const uchar *a, size_t a_length,
229                                    const uchar *b, size_t b_length)
230{
231  return MY_FUNCTION_NAME(strnncoll)(cs, a, a_length, b, b_length, FALSE);
232}
233#else
234/**
235  Compare two strings according to the collation, with PAD SPACE handling.
236
237  @param cs          - the character set and collation
238  @param a           - the left string
239  @param a_length    - the length of the left string
240  @param b           - the right string
241  @param b_length    - the length of the right string
242  @return            - the comparison result
243*/
244static int
245MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
246                              const uchar *a, size_t a_length,
247                              const uchar *b, size_t b_length)
248{
249  const uchar *a_end= a + a_length;
250  const uchar *b_end= b + b_length;
251  for ( ; ; )
252  {
253    int a_weight, b_weight, res;
254    uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
255    uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
256    if ((res= (a_weight - b_weight)))
257    {
258      /*
259        Got two different weights. Each weight can be generated by either of:
260        - a real character
261        - a bad byte sequence or an incomplete byte sequence
262        - an auto-generated trailing space (PAD SPACE)
263        It does not matter how exactly each weight was generated.
264        Just return the weight difference.
265      */
266      return res;
267    }
268    if (!a_wlen && !b_wlen)
269    {
270      /*
271        Got two auto-generated trailing spaces, i.e.
272        both strings have now ended, so they are equal.
273      */
274      DBUG_ASSERT(a == a_end);
275      DBUG_ASSERT(b == b_end);
276      return 0;
277    }
278    /*
279      At least one of the strings has not ended yet, continue comparison.
280    */
281    DBUG_ASSERT(a < a_end || b < b_end);
282    a+= a_wlen;
283    b+= b_wlen;
284  }
285  DBUG_ASSERT(0);
286  return 0;
287}
288#endif
289
290
291/**
292  Compare two strings according to the collation,
293  with trailing space padding or trimming, according to "nchars".
294
295  @param cs          - the character set and collation
296  @param a           - the left string
297  @param a_length    - the length of the left string
298  @param b           - the right string
299  @param b_length    - the length of the right string
300  @param nchars      - compare this amount of characters only
301  @return            - the comparison result
302*/
303static int
304MY_FUNCTION_NAME(strnncollsp_nchars)(CHARSET_INFO *cs __attribute__((unused)),
305                                     const uchar *a, size_t a_length,
306                                     const uchar *b, size_t b_length,
307                                     size_t nchars)
308{
309  const uchar *a_end= a + a_length;
310  const uchar *b_end= b + b_length;
311  for ( ; nchars ; nchars--)
312  {
313    int a_weight, b_weight, res;
314    uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
315    uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
316
317    if ((res= (a_weight - b_weight)))
318    {
319      /* Got two different weights. See comments in strnncollsp above. */
320      return res;
321    }
322    if (!a_wlen && !b_wlen)
323    {
324      /* Got two auto-generated trailing spaces. */
325      DBUG_ASSERT(a == a_end);
326      DBUG_ASSERT(b == b_end);
327      return 0;
328    }
329    /*
330      At least one of the strings has not ended yet, continue comparison.
331    */
332    DBUG_ASSERT(a < a_end || b < b_end);
333    a+= a_wlen;
334    b+= b_wlen;
335  }
336  return 0;
337}
338
339
340#endif /* DEFINE_STRNNCOLL */
341
342
343#ifdef DEFINE_STRNXFRM
344#ifndef WEIGHT_MB2_FRM
345#define WEIGHT_MB2_FRM(x,y)  WEIGHT_MB2(x,y)
346#endif
347
348static size_t
349MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
350                           uchar *dst, size_t dstlen, uint nweights,
351                           const uchar *src, size_t srclen, uint flags)
352{
353  uchar *d0= dst;
354  uchar *de= dst + dstlen;
355  const uchar *se= src + srclen;
356  const uchar *sort_order= cs->sort_order;
357
358  for (; dst < de && src < se && nweights; nweights--)
359  {
360    if (my_charlen(cs, (const char *) src, (const char *) se) > 1)
361    {
362      /*
363        Note, it is safe not to check (src < se)
364        in the code below, because my_charlen() would
365        not return 2 if src was too short
366      */
367      uint16 e= WEIGHT_MB2_FRM(src[0], src[1]);
368      *dst++= (uchar) (e >> 8);
369      if (dst < de)
370        *dst++= (uchar) (e & 0xFF);
371      src+= 2;
372    }
373    else
374      *dst++= sort_order ? sort_order[*src++] : *src++;
375  }
376#ifdef DEFINE_STRNNCOLLSP_NOPAD
377  return my_strxfrm_pad_desc_and_reverse_nopad(cs, d0, dst, de,
378					       nweights, flags, 0);
379#else
380  return my_strxfrm_pad_desc_and_reverse(cs, d0, dst, de, nweights, flags, 0);
381#endif
382}
383#endif /* DEFINE_STRNXFRM */
384
385
386#if defined(DEFINE_STRNXFRM_UNICODE) || defined(DEFINE_STRNXFRM_UNICODE_NOPAD)
387
388/*
389  Store sorting weights using 2 bytes per character.
390
391  This function is shared between
392  - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
393    which support BMP only (U+0000..U+FFFF).
394  - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
395    which map all supplementary characters to weight 0xFFFD.
396*/
397
398#ifndef MY_MB_WC
399#error MY_MB_WC must be defined for DEFINE_STRNXFRM_UNICODE
400#endif
401
402#ifndef OPTIMIZE_ASCII
403#error OPTIMIZE_ASCII must be defined for DEFINE_STRNXFRM_UNICODE
404#endif
405
406#ifndef UNICASE_MAXCHAR
407#error UNICASE_MAXCHAR must be defined for DEFINE_STRNXFRM_UNICODE
408#endif
409
410#ifndef UNICASE_PAGE0
411#error UNICASE_PAGE0 must be defined for DEFINE_STRNXFRM_UNICODE
412#endif
413
414#ifndef UNICASE_PAGES
415#error UNICASE_PAGES must be defined for DEFINE_STRNXFRM_UNICODE
416#endif
417
418
419static size_t
420MY_FUNCTION_NAME(strnxfrm_internal)(CHARSET_INFO *cs __attribute__((unused)),
421                                    uchar *dst, uchar *de,
422                                    uint *nweights,
423                                    const uchar *src, const uchar *se)
424{
425  my_wc_t UNINIT_VAR(wc);
426  uchar *dst0= dst;
427
428  DBUG_ASSERT(src || !se);
429  DBUG_ASSERT((cs->state & MY_CS_LOWER_SORT) == 0);
430  DBUG_ASSERT(0x7F <= UNICASE_MAXCHAR);
431
432  for (; dst < de && *nweights; (*nweights)--)
433  {
434    int res;
435#if OPTIMIZE_ASCII
436    if (src >= se)
437      break;
438    if (src[0] <= 0x7F)
439    {
440      wc= UNICASE_PAGE0[*src++].sort;
441      PUT_WC_BE2_HAVE_1BYTE(dst, de, wc);
442      continue;
443    }
444#endif
445    if ((res= MY_MB_WC(cs, &wc, src, se)) <= 0)
446      break;
447    src+= res;
448    if (wc <= UNICASE_MAXCHAR)
449    {
450      MY_UNICASE_CHARACTER *page;
451      if ((page= UNICASE_PAGES[wc >> 8]))
452        wc= page[wc & 0xFF].sort;
453    }
454    else
455      wc= MY_CS_REPLACEMENT_CHARACTER;
456    PUT_WC_BE2_HAVE_1BYTE(dst, de, wc);
457  }
458  return dst - dst0;
459}
460
461
462static size_t
463MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
464                           uchar *dst, size_t dstlen, uint nweights,
465                           const uchar *src, size_t srclen, uint flags)
466{
467  uchar *dst0= dst;
468  uchar *de= dst + dstlen;
469  dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights,
470                                            src, src + srclen);
471  DBUG_ASSERT(dst <= de); /* Safety */
472
473  if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
474    dst+= my_strxfrm_pad_nweights_unicode(dst, de, nweights);
475
476  my_strxfrm_desc_and_reverse(dst0, dst, flags, 0);
477
478  if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
479    dst+= my_strxfrm_pad_unicode(dst, de);
480  return dst - dst0;
481}
482
483
484#ifdef DEFINE_STRNXFRM_UNICODE_NOPAD
485static size_t
486MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs,
487                                 uchar *dst, size_t dstlen,
488                                 uint nweights,
489                                 const uchar *src, size_t srclen, uint flags)
490{
491  uchar *dst0= dst;
492  uchar *de= dst + dstlen;
493  dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights,
494                                            src, src + srclen);
495  DBUG_ASSERT(dst <= de); /* Safety */
496
497  if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
498  {
499    size_t len= de - dst;
500    set_if_smaller(len, nweights * 2);
501    memset(dst, 0x00, len);
502    dst+= len;
503  }
504
505  my_strxfrm_desc_and_reverse(dst0, dst, flags, 0);
506
507  if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
508  {
509    memset(dst, 0x00, de - dst);
510    dst= de;
511  }
512  return dst - dst0;
513}
514#endif
515
516#endif /* DEFINE_STRNXFRM_UNICODE || DEFINE_STRNXFRM_UNICODE_NOPAD */
517
518
519
520#ifdef DEFINE_STRNXFRM_UNICODE_BIN2
521
522/*
523  Store sorting weights using 2 bytes per character.
524
525  These functions are shared between
526  - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
527    which support BMP only (U+0000..U+FFFF).
528  - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
529    which map all supplementary characters to weight 0xFFFD.
530*/
531
532#ifndef MY_MB_WC
533#error MY_MB_WC must be defined for DEFINE_STRNXFRM_UNICODE_BIN2
534#endif
535
536#ifndef OPTIMIZE_ASCII
537#error OPTIMIZE_ASCII must be defined for DEFINE_STRNXFRM_UNICODE_BIN2
538#endif
539
540
541static size_t
542MY_FUNCTION_NAME(strnxfrm_internal)(CHARSET_INFO *cs __attribute__((unused)),
543                                    uchar *dst, uchar *de,
544                                    uint *nweights,
545                                    const uchar *src,
546                                    const uchar *se)
547{
548  my_wc_t UNINIT_VAR(wc);
549  uchar *dst0= dst;
550
551  DBUG_ASSERT(src || !se);
552
553  for (; dst < de && *nweights; (*nweights)--)
554  {
555    int res;
556#if OPTIMIZE_ASCII
557    if (src >= se)
558      break;
559    if (src[0] <= 0x7F)
560    {
561      wc= *src++;
562      PUT_WC_BE2_HAVE_1BYTE(dst, de, wc);
563      continue;
564    }
565#endif
566    if ((res= MY_MB_WC(cs, &wc, src, se)) <= 0)
567      break;
568    src+= res;
569    if (wc > 0xFFFF)
570      wc= MY_CS_REPLACEMENT_CHARACTER;
571    PUT_WC_BE2_HAVE_1BYTE(dst, de, wc);
572  }
573  return dst - dst0;
574}
575
576
577static size_t
578MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
579                           uchar *dst, size_t dstlen, uint nweights,
580                           const uchar *src, size_t srclen, uint flags)
581{
582  uchar *dst0= dst;
583  uchar *de= dst + dstlen;
584  dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights,
585                                            src, src + srclen);
586  DBUG_ASSERT(dst <= de); /* Safety */
587
588  if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
589    dst+= my_strxfrm_pad_nweights_unicode(dst, de, nweights);
590
591  my_strxfrm_desc_and_reverse(dst0, dst, flags, 0);
592
593  if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
594    dst+= my_strxfrm_pad_unicode(dst, de);
595  return dst - dst0;
596}
597
598
599static size_t
600MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs,
601                                 uchar *dst, size_t dstlen, uint nweights,
602                                 const uchar *src, size_t srclen, uint flags)
603{
604  uchar *dst0= dst;
605  uchar *de= dst + dstlen;
606  dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights,
607                                            src, src + srclen);
608  DBUG_ASSERT(dst <= de); /* Safety */
609
610  if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
611  {
612    size_t len= de - dst;
613    set_if_smaller(len, nweights * 2);
614    memset(dst, 0x00, len);
615    dst+= len;
616  }
617
618  my_strxfrm_desc_and_reverse(dst0, dst, flags, 0);
619
620  if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
621  {
622    memset(dst, 0x00, de - dst);
623    dst= de;
624  }
625  return dst - dst0;
626}
627
628#endif /* DEFINE_STRNXFRM_UNICODE_BIN2 */
629
630
631/*
632  We usually include this file at least two times from the same source file,
633  for the _ci and the _bin collations. Prepare for the second inclusion.
634*/
635#undef MY_FUNCTION_NAME
636#undef MY_MB_WC
637#undef OPTIMIZE_ASCII
638#undef UNICASE_MAXCHAR
639#undef UNICASE_PAGE0
640#undef UNICASE_PAGES
641#undef WEIGHT_ILSEQ
642#undef WEIGHT_MB1
643#undef WEIGHT_MB2
644#undef WEIGHT_MB3
645#undef WEIGHT_MB4
646#undef WEIGHT_PAD_SPACE
647#undef WEIGHT_MB2_FRM
648#undef DEFINE_STRNXFRM
649#undef DEFINE_STRNXFRM_UNICODE
650#undef DEFINE_STRNXFRM_UNICODE_NOPAD
651#undef DEFINE_STRNXFRM_UNICODE_BIN2
652#undef DEFINE_STRNNCOLL
653#undef DEFINE_STRNNCOLLSP_NOPAD
654