1 /* Copyright (c) 2003, 2013, Oracle and/or its affiliates
2    Copyright (c) 2009, 2016, MariaDB
3 
4    This library is free software; you can redistribute it and/or
5    modify it under the terms of the GNU Library General Public
6    License as published by the Free Software Foundation; version 2
7    of the License.
8 
9    This library is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12    Library General Public License for more details.
13 
14    You should have received a copy of the GNU Library General Public
15    License along with this library; if not, write to the Free
16    Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
17    MA 02110-1335  USA */
18 
19 /* UCS2 support. Written by Alexander Barkov <bar@mysql.com> */
20 
21 #include "strings_def.h"
22 #include <m_ctype.h>
23 #include <my_sys.h>
24 #include <stdarg.h>
25 
26 #include "ctype-unidata.h"
27 
28 
29 #if defined(HAVE_CHARSET_utf16) || defined(HAVE_CHARSET_ucs2)
30 #define HAVE_CHARSET_mb2
31 #endif
32 
33 
34 #if defined(HAVE_CHARSET_mb2) || defined(HAVE_CHARSET_utf32)
35 #define HAVE_CHARSET_mb2_or_mb4
36 #endif
37 
38 
39 #ifndef EILSEQ
40 #define EILSEQ ENOENT
41 #endif
42 
#undef  ULONGLONG_MAX
#define ULONGLONG_MAX                (~(ulonglong) 0)
#define MAX_NEGATIVE_NUMBER        ((ulonglong) 0x8000000000000000LL)
/* Number of decimal digits collected per accumulator by the strtoll10 code */
#define INIT_CNT  9
#define LFACTOR   1000000000ULL      /* 10^9  */
#define LFACTOR1  10000000000ULL     /* 10^10 */
#define LFACTOR2  100000000000ULL    /* 10^11 */

#if defined(HAVE_CHARSET_utf32) || defined(HAVE_CHARSET_mb2)
/* Powers of ten: lfactor[n] == 10^n for n = 0..8. Read-only lookup table. */
static const unsigned long lfactor[9]=
{ 1UL, 10UL, 100UL, 1000UL, 10000UL, 100000UL, 1000000UL, 10000000UL,
  100000000UL };
#endif
55 
56 
57 #ifdef HAVE_CHARSET_mb2_or_mb4
58 static size_t
my_caseup_str_mb2_or_mb4(CHARSET_INFO * cs,char * s)59 my_caseup_str_mb2_or_mb4(CHARSET_INFO * cs  __attribute__((unused)),
60                          char * s __attribute__((unused)))
61 {
62   DBUG_ASSERT(0);
63   return 0;
64 }
65 
66 
67 static size_t
my_casedn_str_mb2_or_mb4(CHARSET_INFO * cs,char * s)68 my_casedn_str_mb2_or_mb4(CHARSET_INFO *cs __attribute__((unused)),
69                          char * s __attribute__((unused)))
70 {
71   DBUG_ASSERT(0);
72   return 0;
73 }
74 
75 
76 static int
my_strcasecmp_mb2_or_mb4(CHARSET_INFO * cs,const char * s,const char * t)77 my_strcasecmp_mb2_or_mb4(CHARSET_INFO *cs __attribute__((unused)),
78                          const char *s __attribute__((unused)),
79                          const char *t __attribute__((unused)))
80 {
81   DBUG_ASSERT(0);
82   return 0;
83 }
84 
85 
/* Result of copying a single (possibly incomplete) character. */
typedef enum
{
  /* The character was valid */
  MY_CHAR_COPY_OK= 0,
  /* The character was not valid and could not be fixed */
  MY_CHAR_COPY_ERROR= 1,
  /* The character was not valid and was replaced with '?' */
  MY_CHAR_COPY_FIXED= 2
} my_char_copy_status_t;
92 
93 
94 /*
95   Copies an incomplete character, lef-padding it with 0x00 bytes.
96 
97   @param cs           Character set
98   @param dst          The destination string
99   @param dst_length   Space available in dst
100   @param src          The source string
101   @param src_length   Length of src
102   @param nchars       Copy not more than nchars characters.
103                       The "nchars" parameter of the caller.
104                       Only 0 and non-0 are important here.
105   @param fix          What to do if after zero-padding didn't get a valid
106                       character:
107                       - FALSE - exit with error.
108                       - TRUE  - try to put '?' instead.
109 
110   @return  MY_CHAR_COPY_OK     if after zero-padding got a valid character.
111                                cs->mbmaxlen bytes were written to "dst".
112   @return  MY_CHAR_COPY_FIXED  if after zero-padding did not get a valid
113                                character, but wrote '?' to the destination
114                                string instead.
115                                cs->mbminlen bytes were written to "dst".
116   @return  MY_CHAR_COPY_ERROR  If failed and nothing was written to "dst".
117                                Possible reasons:
118                                - dst_length was too short
119                                - nchars was 0
120                                - the character after padding appeared not
121                                  to be valid, and could not fix it to '?'.
122 */
123 static my_char_copy_status_t
my_copy_incomplete_char(CHARSET_INFO * cs,char * dst,size_t dst_length,const char * src,size_t src_length,size_t nchars,my_bool fix)124 my_copy_incomplete_char(CHARSET_INFO *cs,
125                         char *dst, size_t dst_length,
126                         const char *src, size_t src_length,
127                         size_t nchars, my_bool fix)
128 {
129   size_t pad_length;
130   size_t src_offset= src_length % cs->mbminlen;
131   if (dst_length < cs->mbminlen || !nchars)
132     return MY_CHAR_COPY_ERROR;
133 
134   pad_length= cs->mbminlen - src_offset;
135   bzero(dst, pad_length);
136   memmove(dst + pad_length, src, src_offset);
137   /*
138     In some cases left zero-padding can create an incorrect character.
139     For example:
140       INSERT INTO t1 (utf32_column) VALUES (0x110000);
141     We'll pad the value to 0x00110000, which is a wrong UTF32 sequence!
142     The valid characters range is limited to 0x00000000..0x0010FFFF.
143 
144     Make sure we didn't pad to an incorrect character.
145   */
146   if (cs->cset->charlen(cs, (uchar *) dst, (uchar *) dst + cs->mbminlen) ==
147       (int) cs->mbminlen)
148     return MY_CHAR_COPY_OK;
149 
150   if (fix &&
151       cs->cset->wc_mb(cs, '?', (uchar *) dst, (uchar *) dst + cs->mbminlen) ==
152       (int) cs->mbminlen)
153     return MY_CHAR_COPY_FIXED;
154 
155   return MY_CHAR_COPY_ERROR;
156 }
157 
158 
159 /*
160   Copy an UCS2/UTF16/UTF32 string, fix bad characters.
161 */
162 static size_t
my_copy_fix_mb2_or_mb4(CHARSET_INFO * cs,char * dst,size_t dst_length,const char * src,size_t src_length,size_t nchars,MY_STRCOPY_STATUS * status)163 my_copy_fix_mb2_or_mb4(CHARSET_INFO *cs,
164                        char *dst, size_t dst_length,
165                        const char *src, size_t src_length,
166                        size_t nchars, MY_STRCOPY_STATUS *status)
167 {
168   size_t length2, src_offset= src_length % cs->mbminlen;
169   my_char_copy_status_t padstatus;
170 
171   if (!src_offset)
172     return  my_copy_fix_mb(cs, dst, dst_length,
173                                src, src_length, nchars, status);
174   if ((padstatus= my_copy_incomplete_char(cs, dst, dst_length,
175                                           src, src_length, nchars, TRUE)) ==
176       MY_CHAR_COPY_ERROR)
177   {
178     status->m_source_end_pos= status->m_well_formed_error_pos= src;
179     return 0;
180   }
181   length2= my_copy_fix_mb(cs, dst + cs->mbminlen, dst_length - cs->mbminlen,
182                           src + src_offset, src_length - src_offset,
183                           nchars - 1, status);
184   if (padstatus == MY_CHAR_COPY_FIXED)
185     status->m_well_formed_error_pos= src;
186   return cs->mbminlen /* The left-padded character */ + length2;
187 }
188 
189 
190 static long
my_strntol_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)191 my_strntol_mb2_or_mb4(CHARSET_INFO *cs,
192                       const char *nptr, size_t l, int base,
193                       char **endptr, int *err)
194 {
195   int      negative= 0;
196   int      overflow;
197   int      cnv;
198   my_wc_t  wc;
199   my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
200   register unsigned int cutlim;
201   register uint32 cutoff;
202   register uint32 res;
203   register const uchar *s= (const uchar*) nptr;
204   register const uchar *e= (const uchar*) nptr+l;
205   const uchar *save;
206 
207   *err= 0;
208   do
209   {
210     if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
211     {
212       switch (wc)
213       {
214         case ' ' : break;
215         case '\t': break;
216         case '-' : negative= !negative; break;
217         case '+' : break;
218         default  : goto bs;
219       }
220     }
221     else /* No more characters or bad multibyte sequence */
222     {
223       if (endptr != NULL )
224         *endptr= (char*) s;
225       err[0]= (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
226       return 0;
227     }
228     s+= cnv;
229   } while (1);
230 
231 bs:
232 
233   overflow= 0;
234   res= 0;
235   save= s;
236   cutoff= ((uint32)~0L) / (uint32) base;
237   cutlim= (uint) (((uint32)~0L) % (uint32) base);
238 
239   do {
240     if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
241     {
242       s+= cnv;
243       if (wc >= '0' && wc <= '9')
244         wc-= '0';
245       else if (wc >= 'A' && wc <= 'Z')
246         wc= wc - 'A' + 10;
247       else if (wc >= 'a' && wc <= 'z')
248         wc= wc - 'a' + 10;
249       else
250         break;
251       if ((int)wc >= base)
252         break;
253       if (res > cutoff || (res == cutoff && wc > cutlim))
254         overflow= 1;
255       else
256       {
257         res*= (uint32) base;
258         res+= wc;
259       }
260     }
261     else if (cnv == MY_CS_ILSEQ)
262     {
263       if (endptr !=NULL )
264         *endptr = (char*) s;
265       err[0]= EILSEQ;
266       return 0;
267     }
268     else
269     {
270       /* No more characters */
271       break;
272     }
273   } while(1);
274 
275   if (endptr != NULL)
276     *endptr = (char *) s;
277 
278   if (s == save)
279   {
280     err[0]= EDOM;
281     return 0L;
282   }
283 
284   if (negative)
285   {
286     if (res > (uint32) INT_MIN32)
287       overflow= 1;
288   }
289   else if (res > INT_MAX32)
290     overflow= 1;
291 
292   if (overflow)
293   {
294     err[0]= ERANGE;
295     return negative ? INT_MIN32 : INT_MAX32;
296   }
297 
298   return (negative ? -((long) res) : (long) res);
299 }
300 
301 
302 static ulong
my_strntoul_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)303 my_strntoul_mb2_or_mb4(CHARSET_INFO *cs,
304                        const char *nptr, size_t l, int base,
305                        char **endptr, int *err)
306 {
307   int      negative= 0;
308   int      overflow;
309   int      cnv;
310   my_wc_t  wc;
311   my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
312   register unsigned int cutlim;
313   register uint32 cutoff;
314   register uint32 res;
315   register const uchar *s= (const uchar*) nptr;
316   register const uchar *e= (const uchar*) nptr + l;
317   const uchar *save;
318 
319   *err= 0;
320   do
321   {
322     if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
323     {
324       switch (wc)
325       {
326         case ' ' : break;
327         case '\t': break;
328         case '-' : negative= !negative; break;
329         case '+' : break;
330         default  : goto bs;
331       }
332     }
333     else /* No more characters or bad multibyte sequence */
334     {
335       if (endptr !=NULL )
336         *endptr= (char*)s;
337       err[0]= (cnv == MY_CS_ILSEQ) ? EILSEQ : EDOM;
338       return 0;
339     }
340     s+= cnv;
341   } while (1);
342 
343 bs:
344 
345   overflow= 0;
346   res= 0;
347   save= s;
348   cutoff= ((uint32)~0L) / (uint32) base;
349   cutlim= (uint) (((uint32)~0L) % (uint32) base);
350 
351   do
352   {
353     if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
354     {
355       s+= cnv;
356       if (wc >= '0' && wc <= '9')
357         wc-= '0';
358       else if (wc >= 'A' && wc <= 'Z')
359         wc= wc - 'A' + 10;
360       else if (wc >= 'a' && wc <= 'z')
361         wc= wc - 'a' + 10;
362       else
363         break;
364       if ((int) wc >= base)
365         break;
366       if (res > cutoff || (res == cutoff && wc > cutlim))
367         overflow = 1;
368       else
369       {
370         res*= (uint32) base;
371         res+= wc;
372       }
373     }
374     else if (cnv == MY_CS_ILSEQ)
375     {
376       if (endptr != NULL )
377         *endptr= (char*)s;
378       err[0]= EILSEQ;
379       return 0;
380     }
381     else
382     {
383       /* No more characters */
384       break;
385     }
386   } while(1);
387 
388   if (endptr != NULL)
389     *endptr= (char *) s;
390 
391   if (s == save)
392   {
393     err[0]= EDOM;
394     return 0L;
395   }
396 
397   if (overflow)
398   {
399     err[0]= (ERANGE);
400     return (~(uint32) 0);
401   }
402 
403   return (negative ? -((long) res) : (long) res);
404 }
405 
406 
407 static longlong
my_strntoll_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)408 my_strntoll_mb2_or_mb4(CHARSET_INFO *cs,
409                        const char *nptr, size_t l, int base,
410                        char **endptr, int *err)
411 {
412   int      negative=0;
413   int      overflow;
414   int      cnv;
415   my_wc_t  wc;
416   my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
417   register ulonglong    cutoff;
418   register unsigned int cutlim;
419   register ulonglong    res;
420   register const uchar *s= (const uchar*) nptr;
421   register const uchar *e= (const uchar*) nptr+l;
422   const uchar *save;
423 
424   *err= 0;
425   do
426   {
427     if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
428     {
429       switch (wc)
430       {
431         case ' ' : break;
432         case '\t': break;
433         case '-' : negative= !negative; break;
434         case '+' : break;
435         default  : goto bs;
436       }
437     }
438     else /* No more characters or bad multibyte sequence */
439     {
440       if (endptr !=NULL )
441         *endptr = (char*)s;
442       err[0] = (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
443       return 0;
444     }
445     s+=cnv;
446   } while (1);
447 
448 bs:
449 
450   overflow = 0;
451   res = 0;
452   save = s;
453   cutoff = (~(ulonglong) 0) / (unsigned long int) base;
454   cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base);
455 
456   do {
457     if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
458     {
459       s+=cnv;
460       if ( wc>='0' && wc<='9')
461         wc -= '0';
462       else if ( wc>='A' && wc<='Z')
463         wc = wc - 'A' + 10;
464       else if ( wc>='a' && wc<='z')
465         wc = wc - 'a' + 10;
466       else
467         break;
468       if ((int)wc >= base)
469         break;
470       if (res > cutoff || (res == cutoff && wc > cutlim))
471         overflow = 1;
472       else
473       {
474         res *= (ulonglong) base;
475         res += wc;
476       }
477     }
478     else if (cnv==MY_CS_ILSEQ)
479     {
480       if (endptr !=NULL )
481         *endptr = (char*)s;
482       err[0]=EILSEQ;
483       return 0;
484     }
485     else
486     {
487       /* No more characters */
488       break;
489     }
490   } while(1);
491 
492   if (endptr != NULL)
493     *endptr = (char *) s;
494 
495   if (s == save)
496   {
497     err[0]=EDOM;
498     return 0L;
499   }
500 
501   if (negative)
502   {
503     if (res  > (ulonglong) LONGLONG_MIN)
504       overflow = 1;
505   }
506   else if (res > (ulonglong) LONGLONG_MAX)
507     overflow = 1;
508 
509   if (overflow)
510   {
511     err[0]=ERANGE;
512     return negative ? LONGLONG_MIN : LONGLONG_MAX;
513   }
514 
515   return (negative ? -((longlong)res) : (longlong)res);
516 }
517 
518 
519 static ulonglong
my_strntoull_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)520 my_strntoull_mb2_or_mb4(CHARSET_INFO *cs,
521                         const char *nptr, size_t l, int base,
522                         char **endptr, int *err)
523 {
524   int      negative= 0;
525   int      overflow;
526   int      cnv;
527   my_wc_t  wc;
528   my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
529   register ulonglong    cutoff;
530   register unsigned int cutlim;
531   register ulonglong    res;
532   register const uchar *s= (const uchar*) nptr;
533   register const uchar *e= (const uchar*) nptr + l;
534   const uchar *save;
535 
536   *err= 0;
537   do
538   {
539     if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
540     {
541       switch (wc)
542       {
543         case ' ' : break;
544         case '\t': break;
545         case '-' : negative= !negative; break;
546         case '+' : break;
547         default  : goto bs;
548       }
549     }
550     else /* No more characters or bad multibyte sequence */
551     {
552       if (endptr !=NULL )
553         *endptr = (char*)s;
554       err[0]= (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
555       return 0;
556     }
557     s+=cnv;
558   } while (1);
559 
560 bs:
561 
562   overflow = 0;
563   res = 0;
564   save = s;
565   cutoff = (~(ulonglong) 0) / (unsigned long int) base;
566   cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base);
567 
568   do
569   {
570     if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
571     {
572       s+=cnv;
573       if ( wc>='0' && wc<='9')
574         wc -= '0';
575       else if ( wc>='A' && wc<='Z')
576         wc = wc - 'A' + 10;
577       else if ( wc>='a' && wc<='z')
578         wc = wc - 'a' + 10;
579       else
580         break;
581       if ((int)wc >= base)
582         break;
583       if (res > cutoff || (res == cutoff && wc > cutlim))
584         overflow = 1;
585       else
586       {
587         res *= (ulonglong) base;
588         res += wc;
589       }
590     }
591     else if (cnv==MY_CS_ILSEQ)
592     {
593       if (endptr !=NULL )
594         *endptr = (char*)s;
595       err[0]= EILSEQ;
596       return 0;
597     }
598     else
599     {
600       /* No more characters */
601       break;
602     }
603   } while(1);
604 
605   if (endptr != NULL)
606     *endptr = (char *) s;
607 
608   if (s == save)
609   {
610     err[0]= EDOM;
611     return 0L;
612   }
613 
614   if (overflow)
615   {
616     err[0]= ERANGE;
617     return (~(ulonglong) 0);
618   }
619 
620   return (negative ? -((longlong) res) : (longlong) res);
621 }
622 
623 
624 static double
my_strntod_mb2_or_mb4(CHARSET_INFO * cs,char * nptr,size_t length,char ** endptr,int * err)625 my_strntod_mb2_or_mb4(CHARSET_INFO *cs,
626                       char *nptr, size_t length,
627                       char **endptr, int *err)
628 {
629   char     buf[256];
630   double   res;
631   register char *b= buf;
632   register const uchar *s= (const uchar*) nptr;
633   const uchar *end;
634   my_wc_t  wc;
635   my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
636   int     cnv;
637 
638   *err= 0;
639   /* Cut too long strings */
640   if (length >= sizeof(buf))
641     length= sizeof(buf) - 1;
642   end= s + length;
643 
644   while ((cnv= mb_wc(cs, &wc, s, end)) > 0)
645   {
646     s+= cnv;
647     if (wc > (int) (uchar) 'e' || !wc)
648       break;                                        /* Can't be part of double */
649     *b++= (char) wc;
650   }
651 
652   *endptr= b;
653   res= my_strtod(buf, endptr, err);
654   *endptr= nptr + cs->mbminlen * (size_t) (*endptr - buf);
655   return res;
656 }
657 
658 
659 static ulonglong
my_strntoull10rnd_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t length,int unsign_fl,char ** endptr,int * err)660 my_strntoull10rnd_mb2_or_mb4(CHARSET_INFO *cs,
661                              const char *nptr, size_t length,
662                              int unsign_fl,
663                              char **endptr, int *err)
664 {
665   char  buf[256], *b= buf;
666   ulonglong res;
667   const uchar *end, *s= (const uchar*) nptr;
668   my_wc_t  wc;
669   my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
670   int     cnv;
671 
672   /* Cut too long strings */
673   if (length >= sizeof(buf))
674     length= sizeof(buf)-1;
675   end= s + length;
676 
677   while ((cnv= mb_wc(cs, &wc, s, end)) > 0)
678   {
679     s+= cnv;
680     if (wc > (int) (uchar) 'e' || !wc)
681       break;                            /* Can't be a number part */
682     *b++= (char) wc;
683   }
684 
685   res= my_strntoull10rnd_8bit(cs, buf, b - buf, unsign_fl, endptr, err);
686   *endptr= (char*) nptr + cs->mbminlen * (size_t) (*endptr - buf);
687   return res;
688 }
689 
690 
691 /*
692   This is a fast version optimized for the case of radix 10 / -10
693 */
694 
695 static size_t
my_l10tostr_mb2_or_mb4(CHARSET_INFO * cs,char * dst,size_t len,int radix,long int val)696 my_l10tostr_mb2_or_mb4(CHARSET_INFO *cs,
697                        char *dst, size_t len, int radix, long int val)
698 {
699   char buffer[66];
700   register char *p, *db, *de;
701   long int new_val;
702   int  sl= 0;
703   unsigned long int uval = (unsigned long int) val;
704 
705   p= &buffer[sizeof(buffer) - 1];
706   *p= '\0';
707 
708   if (radix < 0)
709   {
710     if (val < 0)
711     {
712       sl= 1;
713       /* Avoid integer overflow in (-val) for LONGLONG_MIN (BUG#31799). */
714       uval  = (unsigned long int)0 - uval;
715     }
716   }
717 
718   new_val = (long) (uval / 10);
719   *--p    = '0'+ (char) (uval - (unsigned long) new_val * 10);
720   val= new_val;
721 
722   while (val != 0)
723   {
724     new_val= val / 10;
725     *--p= '0' + (char) (val - new_val * 10);
726     val= new_val;
727   }
728 
729   if (sl)
730   {
731     *--p= '-';
732   }
733 
734   for ( db= dst, de= dst + len ; (dst < de) && *p ; p++)
735   {
736     int cnvres= cs->cset->wc_mb(cs,(my_wc_t)p[0],(uchar*) dst, (uchar*) de);
737     if (cnvres > 0)
738       dst+= cnvres;
739     else
740       break;
741   }
742   return (int) (dst - db);
743 }
744 
745 
746 static size_t
my_ll10tostr_mb2_or_mb4(CHARSET_INFO * cs,char * dst,size_t len,int radix,longlong val)747 my_ll10tostr_mb2_or_mb4(CHARSET_INFO *cs,
748                         char *dst, size_t len, int radix, longlong val)
749 {
750   char buffer[65];
751   register char *p, *db, *de;
752   long long_val;
753   int sl= 0;
754   ulonglong uval= (ulonglong) val;
755 
756   if (radix < 0)
757   {
758     if (val < 0)
759     {
760       sl= 1;
761       /* Avoid integer overflow in (-val) for LONGLONG_MIN (BUG#31799). */
762       uval = (ulonglong)0 - uval;
763     }
764   }
765 
766   p= &buffer[sizeof(buffer)-1];
767   *p='\0';
768 
769   if (uval == 0)
770   {
771     *--p= '0';
772     goto cnv;
773   }
774 
775   while (uval > (ulonglong) LONG_MAX)
776   {
777     ulonglong quo= uval/(uint) 10;
778     uint rem= (uint) (uval- quo* (uint) 10);
779     *--p= '0' + rem;
780     uval= quo;
781   }
782 
783   long_val= (long) uval;
784   while (long_val != 0)
785   {
786     long quo= long_val/10;
787     *--p= (char) ('0' + (long_val - quo*10));
788     long_val= quo;
789   }
790 
791 cnv:
792   if (sl)
793   {
794     *--p= '-';
795   }
796 
797   for ( db= dst, de= dst + len ; (dst < de) && *p ; p++)
798   {
799     int cnvres= cs->cset->wc_mb(cs, (my_wc_t) p[0], (uchar*) dst, (uchar*) de);
800     if (cnvres > 0)
801       dst+= cnvres;
802     else
803       break;
804   }
805   return (int) (dst -db);
806 }
807 
808 #endif /* HAVE_CHARSET_mb2_or_mb4 */
809 
810 
811 #ifdef HAVE_CHARSET_mb2
812 /**
813   Convert a Unicode code point to a digit.
814   @param      wc  - the input Unicode code point
815   @param[OUT] c   - the output character representing the digit value 0..9
816 
817   @return   0     - if wc is a good digit
818   @return   1     - if wc is not a digit
819 */
820 static inline my_bool
wc2digit_uchar(uchar * c,my_wc_t wc)821 wc2digit_uchar(uchar *c, my_wc_t wc)
822 {
823   return wc > '9' || (c[0]= (uchar) (wc - '0')) > 9;
824 }
825 
826 
827 static longlong
my_strtoll10_mb2(CHARSET_INFO * cs,const char * nptr,char ** endptr,int * error)828 my_strtoll10_mb2(CHARSET_INFO *cs __attribute__((unused)),
829                  const char *nptr, char **endptr, int *error)
830 {
831   const uchar *s, *end, *start, *n_end, *true_end;
832   uchar UNINIT_VAR(c);
833   unsigned long i, j, k;
834   ulonglong li;
835   int negative;
836   ulong cutoff, cutoff2, cutoff3;
837   my_wc_t wc;
838   int res;
839   my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
840 
841   s= (const uchar *) nptr;
842   /* If fixed length string */
843   if (endptr)
844   {
845     /*
846       Make sure string length is even.
847       Odd length indicates a bug in the caller.
848       Assert in debug, round in production.
849     */
850     DBUG_ASSERT((*endptr - (const char *) s) % 2 == 0);
851     end= s + ((*endptr - (const char*) s) / 2) * 2;
852 
853     for ( ; ; ) /* Skip leading spaces and tabs */
854     {
855       if ((res= mb_wc(cs, &wc, s, end)) <= 0)
856         goto no_conv;
857       s+= res;
858       if (wc != ' ' && wc != '\t')
859         break;
860     }
861   }
862   else
863   {
864      /* We don't support null terminated strings in UCS2 */
865      goto no_conv;
866   }
867 
868   /* Check for a sign. */
869   negative= 0;
870   if (wc == '-')
871   {
872     *error= -1;                                        /* Mark as negative number */
873     negative= 1;
874     if ((res= mb_wc(cs, &wc, s, end)) <= 0)
875       goto no_conv;
876     s+= res; /* wc is now expected to hold the first digit. */
877     cutoff=  MAX_NEGATIVE_NUMBER / LFACTOR2;
878     cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
879     cutoff3=  MAX_NEGATIVE_NUMBER % 100;
880   }
881   else
882   {
883     *error= 0;
884     if (wc == '+')
885     {
886       if ((res= mb_wc(cs, &wc, s, end)) <= 0)
887         goto no_conv;
888       s+= res; /* wc is now expected to hold the first digit. */
889     }
890     cutoff=  ULONGLONG_MAX / LFACTOR2;
891     cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
892     cutoff3=  ULONGLONG_MAX % 100;
893   }
894 
895   /*
896     The code below assumes that 'wc' holds the first digit
897     and 's' points to the next character after it.
898 
899     Scan pre-zeros if any.
900   */
901   if (wc == '0')
902   {
903     i= 0;
904     for ( ; ; s+= res)
905     {
906       if (s == end)
907         goto end_i;                                /* Return 0 */
908       if ((res= mb_wc(cs, &wc, s, end)) <= 0)
909         goto no_conv;
910       if (wc != '0')
911         break;
912     }
913     n_end= s + 2 * INIT_CNT;
914   }
915   else
916   {
917     /* Read first digit to check that it's a valid number */
918     if ((i= (wc - '0')) > 9)
919       goto no_conv;
920     n_end= s + 2 * (INIT_CNT-1);
921   }
922 
923   /* Handle first 9 digits and store them in i */
924   if (n_end > end)
925     n_end= end;
926   for ( ; ; s+= res)
927   {
928     if ((res= mb_wc(cs, &wc, s, n_end)) <= 0)
929       break;
930     if (wc2digit_uchar(&c, wc))
931       goto end_i;
932     i= i*10+c;
933   }
934   if (s == end)
935     goto end_i;
936 
937   /* Handle next 9 digits and store them in j */
938   j= 0;
939   start= s;                                /* Used to know how much to shift i */
940   n_end= true_end= s + 2 * INIT_CNT;
941   if (n_end > end)
942     n_end= end;
943   do
944   {
945     if ((res= mb_wc(cs, &wc, s, end)) <= 0)
946       goto no_conv;
947     if (wc2digit_uchar(&c, wc))
948       goto end_i_and_j;
949     s+= res;
950     j= j * 10 + c;
951   } while (s != n_end);
952   if (s == end)
953   {
954     if (s != true_end)
955       goto end_i_and_j;
956     goto end3;
957   }
958 
959   /* Handle the next 1 or 2 digits and store them in k */
960   if ((res= mb_wc(cs, &wc, s, end)) <= 0)
961     goto no_conv;
962   if ((k= (wc - '0')) > 9)
963     goto end3;
964   s+= res;
965 
966   if (s == end)
967     goto end4;
968   if ((res= mb_wc(cs, &wc, s, end)) <= 0)
969     goto no_conv;
970   if (wc2digit_uchar(&c, wc))
971     goto end4;
972   s+= res;
973   k= k*10+c;
974   *endptr= (char*) s;
975 
976   /* number string should have ended here */
977   if (s != end && mb_wc(cs, &wc, s, end) > 0 && ((uchar) (wc - '0')) <= 9)
978     goto overflow;
979 
980   /* Check that we didn't get an overflow with the last digit */
981   if (i > cutoff || (i == cutoff && ((j > cutoff2 || j == cutoff2) &&
982                                      k > cutoff3)))
983     goto overflow;
984   li=i*LFACTOR2+ (ulonglong) j*100 + k;
985   return (longlong) li;
986 
987 overflow:                                        /* *endptr is set here */
988   *error= MY_ERRNO_ERANGE;
989   return negative ? LONGLONG_MIN : (longlong) ULONGLONG_MAX;
990 
991 end_i:
992   *endptr= (char*) s;
993   return (negative ? ((longlong) -(long) i) : (longlong) i);
994 
995 end_i_and_j:
996   li= (ulonglong) i * lfactor[(size_t) (s-start) / 2] + j;
997   *endptr= (char*) s;
998   return (negative ? -((longlong) li) : (longlong) li);
999 
1000 end3:
1001   li=(ulonglong) i*LFACTOR+ (ulonglong) j;
1002   *endptr= (char*) s;
1003   return (negative ? -((longlong) li) : (longlong) li);
1004 
1005 end4:
1006   li=(ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
1007   *endptr= (char*) s;
1008   if (negative)
1009   {
1010    if (li > MAX_NEGATIVE_NUMBER)
1011      goto overflow;
1012    return -((longlong) li);
1013   }
1014   return (longlong) li;
1015 
1016 no_conv:
1017   /* There was no number to convert.  */
1018   *error= MY_ERRNO_EDOM;
1019   *endptr= (char *) nptr;
1020   return 0;
1021 }
1022 
1023 
1024 static size_t
my_scan_mb2(CHARSET_INFO * cs,const char * str,const char * end,int sequence_type)1025 my_scan_mb2(CHARSET_INFO *cs __attribute__((unused)),
1026             const char *str, const char *end, int sequence_type)
1027 {
1028   const char *str0= str;
1029   my_wc_t wc;
1030   my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
1031   int res;
1032 
1033   switch (sequence_type)
1034   {
1035   case MY_SEQ_SPACES:
1036     for (res= mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end);
1037          res > 0 && wc == ' ';
1038          str+= res,
1039          res= mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end))
1040     {
1041     }
1042     return (size_t) (str - str0);
1043   case MY_SEQ_NONSPACES:
1044     DBUG_ASSERT(0); /* Not implemented */
1045     /* pass through */
1046   default:
1047     return 0;
1048   }
1049 }
1050 
1051 
1052 static void
my_fill_mb2(CHARSET_INFO * cs,char * s,size_t slen,int fill)1053 my_fill_mb2(CHARSET_INFO *cs, char *s, size_t slen, int fill)
1054 {
1055   char buf[10], *last;
1056   size_t buflen, remainder;
1057 
1058   DBUG_ASSERT((slen % 2) == 0);
1059 
1060   buflen= cs->cset->wc_mb(cs, (my_wc_t) fill, (uchar*) buf,
1061                           (uchar*) buf + sizeof(buf));
1062 
1063   DBUG_ASSERT(buflen > 0);
1064 
1065   /*
1066     "last" in the last position where a sequence of "buflen" bytes can start.
1067   */
1068   for (last= s + slen - buflen; s <= last; s+= buflen)
1069   {
1070     /* Enough space for the character */
1071     memcpy(s, buf, buflen);
1072   }
1073 
1074   /*
1075     If there are some more space which is not enough
1076     for the whole multibyte character, then add trailing zeros.
1077   */
1078   if ((remainder= last + buflen - s) > 0)
1079     bzero(s, (size_t) remainder);
1080 }
1081 
1082 
1083 static size_t
my_vsnprintf_mb2(char * dst,size_t n,const char * fmt,va_list ap)1084 my_vsnprintf_mb2(char *dst, size_t n, const char* fmt, va_list ap)
1085 {
1086   char *start=dst, *end= dst + n - 1;
1087   for (; *fmt ; fmt++)
1088   {
1089     if (fmt[0] != '%')
1090     {
1091       if (dst == end)                     /* End of buffer */
1092         break;
1093 
1094       *dst++='\0';
1095       *dst++= *fmt;          /* Copy ordinary char */
1096       continue;
1097     }
1098 
1099     fmt++;
1100 
1101     /* Skip if max size is used (to be compatible with printf) */
1102     while ( (*fmt >= '0' && *fmt <= '9') || *fmt == '.' || *fmt == '-')
1103       fmt++;
1104 
1105     if (*fmt == 'l')
1106       fmt++;
1107 
1108     if (*fmt == 's')                      /* String parameter */
1109     {
1110       char *par= va_arg(ap, char *);
1111       size_t plen;
1112       size_t left_len= (size_t)(end-dst);
1113       if (!par)
1114         par= (char*) "(null)";
1115       plen= strlen(par);
1116       if (left_len <= plen * 2)
1117         plen = left_len / 2 - 1;
1118 
1119       for ( ; plen ; plen--, dst+=2, par++)
1120       {
1121         dst[0]= '\0';
1122         dst[1]= par[0];
1123       }
1124       continue;
1125     }
1126     else if (*fmt == 'd' || *fmt == 'u')  /* Integer parameter */
1127     {
1128       int iarg;
1129       char nbuf[16];
1130       char *pbuf= nbuf;
1131 
1132       if ((size_t) (end - dst) < 32)
1133         break;
1134       iarg= va_arg(ap, int);
1135       if (*fmt == 'd')
1136         int10_to_str((long) iarg, nbuf, -10);
1137       else
1138         int10_to_str((long) (uint) iarg, nbuf,10);
1139 
1140       for (; pbuf[0]; pbuf++)
1141       {
1142         *dst++= '\0';
1143         *dst++= *pbuf;
1144       }
1145       continue;
1146     }
1147 
1148     /* We come here on '%%', unknown code or too long parameter */
1149     if (dst == end)
1150       break;
1151     *dst++= '\0';
1152     *dst++= '%';                            /* % used as % or unknown code */
1153   }
1154 
1155   DBUG_ASSERT(dst <= end);
1156   *dst='\0';                                /* End of errmessage */
1157   return (size_t) (dst - start);
1158 }
1159 
1160 
1161 static size_t
my_snprintf_mb2(CHARSET_INFO * cs,char * to,size_t n,const char * fmt,...)1162 my_snprintf_mb2(CHARSET_INFO *cs __attribute__((unused)),
1163                 char* to, size_t n, const char* fmt, ...)
1164 {
1165   size_t ret;
1166   va_list args;
1167   va_start(args,fmt);
1168   ret= my_vsnprintf_mb2(to, n, fmt, args);
1169   va_end(args);
1170   return ret;
1171 }
1172 
1173 
1174 static size_t
my_lengthsp_mb2(CHARSET_INFO * cs,const char * ptr,size_t length)1175 my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)),
1176                 const char *ptr, size_t length)
1177 {
1178   const char *end= ptr + length;
1179   while (end > ptr + 1 && end[-1] == ' ' && end[-2] == '\0')
1180     end-= 2;
1181   return (size_t) (end - ptr);
1182 }
1183 
1184 #endif /* HAVE_CHARSET_mb2*/
1185 
1186 
1187 /*
1188   Next part is actually HAVE_CHARSET_utf16-specific,
1189   but the JSON functions needed my_utf16_uni()
1190   so the #ifdef was moved lower.
1191 */
1192 #include "ctype-utf16.h"
1193 
/*
  Character classifiers used by the strcoll.inl instantiations that follow.
  Here the surrogate tests look at b0 and b2, i.e. the first byte of each
  16-bit unit. Both macros are #undef'ed after the last instantiation.
*/
#define IS_MB2_CHAR(b0,b1)       (!MY_UTF16_SURROGATE_HEAD(b0))
#define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b0) && MY_UTF16_LOW_HEAD(b2))
1196 
/*
  Sort weight of a 2-byte character for the general_ci collations.

  The code point is built with MY_UTF16_WC2(b0, b1). If
  my_unicase_default_pages has a page for it, the "sort" member of the
  matching entry is returned; otherwise the code point itself is returned.
*/
static inline int my_weight_mb2_utf16mb2_general_ci(uchar b0, uchar b1)
{
  const my_wc_t code= MY_UTF16_WC2(b0, b1);
  MY_UNICASE_CHARACTER *page= my_unicase_default_pages[code >> 8];

  if (!page)
    return (int) code;
  return (int) page[code & 0xFF].sort;
}
/*
  Four instantiations of strcoll.inl follow. MY_FUNCTION_NAME() gives the
  suffix of the generated functions; WEIGHT_ILSEQ/WEIGHT_MB2/WEIGHT_MB4
  give the weight of an illegal byte, a 2-byte and a 4-byte character.
  NOTE(review): the macros are redefined before each #include without an
  #undef here, so strcoll.inl presumably undefines them itself - confirm.
*/

/*
  utf16_general_ci: 2-byte characters are weighted through the default
  unicase pages, every 4-byte character gets MY_CS_REPLACEMENT_CHARACTER.
*/
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16_general_ci
#define DEFINE_STRNXFRM_UNICODE
#define DEFINE_STRNXFRM_UNICODE_NOPAD
#define MY_MB_WC(cs, pwc, s, e)  my_mb_wc_utf16_quick(pwc, s, e)
#define OPTIMIZE_ASCII           0
#define UNICASE_MAXCHAR          MY_UNICASE_INFO_DEFAULT_MAXCHAR
#define UNICASE_PAGE0            my_unicase_default_page00
#define UNICASE_PAGES            my_unicase_default_pages
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        my_weight_mb2_utf16mb2_general_ci(b0,b1)
#define WEIGHT_MB4(b0,b1,b2,b3)  MY_CS_REPLACEMENT_CHARACTER
#include "strcoll.inl"

/* utf16_bin: the weight of a character is its code point. */
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16_bin
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        ((int) MY_UTF16_WC2(b0, b1))
#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) MY_UTF16_WC4(b0, b1, b2, b3))
#include "strcoll.inl"

/* utf16_general_nopad_ci: same weights as utf16_general_ci. */
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16_general_nopad_ci
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        my_weight_mb2_utf16mb2_general_ci(b0,b1)
#define WEIGHT_MB4(b0,b1,b2,b3)  MY_CS_REPLACEMENT_CHARACTER
#include "strcoll.inl"

/* utf16_nopad_bin: same weights as utf16_bin. */
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16_nopad_bin
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        ((int) MY_UTF16_WC2(b0, b1))
#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) MY_UTF16_WC4(b0, b1, b2, b3))
#include "strcoll.inl"

/* The classifiers are redefined later with the bytes of each unit swapped. */
#undef IS_MB2_CHAR
#undef IS_MB4_CHAR
1238 
/*
  These two functions are used by the JSON library, so they are
  exported (not static) and compiled into the library unconditionally.
*/
1243 
/*
  mb_wc converter for utf16: decode one character from [s, e) into *pwc.

  Thin wrapper that forwards to my_mb_wc_utf16_quick() and returns its
  result unchanged. "cs" is not used.
*/
/*static*/ int
my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)),
             my_wc_t *pwc, const uchar *s, const uchar *e)
{
  return my_mb_wc_utf16_quick(pwc, s, e);
}
1250 
1251 
1252 /*static*/ int
my_uni_utf16(CHARSET_INFO * cs,my_wc_t wc,uchar * s,uchar * e)1253 my_uni_utf16(CHARSET_INFO *cs __attribute__((unused)),
1254              my_wc_t wc, uchar *s, uchar *e)
1255 {
1256   if (wc <= 0xFFFF)
1257   {
1258     if (s + 2 > e)
1259       return MY_CS_TOOSMALL2;
1260     if (MY_UTF16_SURROGATE(wc))
1261       return MY_CS_ILUNI;
1262     *s++= (uchar) (wc >> 8);
1263     *s= (uchar) (wc & 0xFF);
1264     return 2;
1265   }
1266 
1267   if (wc <= 0x10FFFF)
1268   {
1269     if (s + 4 > e)
1270       return MY_CS_TOOSMALL4;
1271     *s++= (uchar) ((wc-= 0x10000) >> 18) | 0xD8;
1272     *s++= (uchar) (wc >> 10) & 0xFF;
1273     *s++= (uchar) ((wc >> 8) & 3) | 0xDC;
1274     *s= (uchar) wc & 0xFF;
1275     return 4;
1276   }
1277 
1278   return MY_CS_ILUNI;
1279 }
1280 
1281 
1282 #ifdef HAVE_CHARSET_utf16
1283 
1284 
1285 static inline void
my_tolower_utf16(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1286 my_tolower_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1287 {
1288   MY_UNICASE_CHARACTER *page;
1289   if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1290     *wc= page[*wc & 0xFF].tolower;
1291 }
1292 
1293 
1294 static inline void
my_toupper_utf16(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1295 my_toupper_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1296 {
1297   MY_UNICASE_CHARACTER *page;
1298   if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1299     *wc= page[*wc & 0xFF].toupper;
1300 }
1301 
1302 
1303 static inline void
my_tosort_utf16(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1304 my_tosort_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1305 {
1306   if (*wc <= uni_plane->maxchar)
1307   {
1308     MY_UNICASE_CHARACTER *page;
1309     if ((page= uni_plane->page[*wc >> 8]))
1310       *wc= page[*wc & 0xFF].sort;
1311   }
1312   else
1313   {
1314     *wc= MY_CS_REPLACEMENT_CHARACTER;
1315   }
1316 }
1317 
1318 
1319 
1320 static size_t
my_caseup_utf16(CHARSET_INFO * cs,const char * src,size_t srclen,char * dst,size_t dstlen)1321 my_caseup_utf16(CHARSET_INFO *cs, const char *src, size_t srclen,
1322                 char *dst, size_t dstlen)
1323 {
1324   my_wc_t wc;
1325   my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
1326   my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb;
1327   int res;
1328   const char *srcend= src + srclen;
1329   char *dstend= dst + dstlen;
1330   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1331   DBUG_ASSERT(srclen <= dstlen);
1332 
1333   while ((src < srcend) &&
1334          (res= mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
1335   {
1336     my_toupper_utf16(uni_plane, &wc);
1337     if (res != wc_mb(cs, wc, (uchar *) dst, (uchar *) dstend))
1338       break;
1339     src+= res;
1340     dst+= res;
1341   }
1342   return srclen;
1343 }
1344 
1345 
1346 static void
my_hash_sort_utf16_nopad(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)1347 my_hash_sort_utf16_nopad(CHARSET_INFO *cs,
1348                          const uchar *s, size_t slen,
1349                          ulong *nr1, ulong *nr2)
1350 {
1351   my_wc_t wc;
1352   my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
1353   int res;
1354   const uchar *e= s + slen;
1355   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1356   register ulong m1= *nr1, m2= *nr2;
1357 
1358   while ((s < e) && (res= mb_wc(cs, &wc, (uchar *) s, (uchar *) e)) > 0)
1359   {
1360     my_tosort_utf16(uni_plane, &wc);
1361     MY_HASH_ADD_16(m1, m2, wc);
1362     s+= res;
1363   }
1364   *nr1= m1;
1365   *nr2= m2;
1366 }
1367 
1368 
1369 static void
my_hash_sort_utf16(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)1370 my_hash_sort_utf16(CHARSET_INFO *cs, const uchar *s, size_t slen,
1371                    ulong *nr1, ulong *nr2)
1372 {
1373   size_t lengthsp= cs->cset->lengthsp(cs, (const char *) s, slen);
1374   my_hash_sort_utf16_nopad(cs, s, lengthsp, nr1, nr2);
1375 }
1376 
1377 
1378 static size_t
my_casedn_utf16(CHARSET_INFO * cs,const char * src,size_t srclen,char * dst,size_t dstlen)1379 my_casedn_utf16(CHARSET_INFO *cs, const char *src, size_t srclen,
1380                 char *dst, size_t dstlen)
1381 {
1382   my_wc_t wc;
1383   my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
1384   my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb;
1385   int res;
1386   const char *srcend= src + srclen;
1387   char *dstend= dst + dstlen;
1388   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1389   DBUG_ASSERT(srclen <= dstlen);
1390 
1391   while ((src < srcend) &&
1392          (res= mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
1393   {
1394     my_tolower_utf16(uni_plane, &wc);
1395     if (res != wc_mb(cs, wc, (uchar *) dst, (uchar *) dstend))
1396       break;
1397     src+= res;
1398     dst+= res;
1399   }
1400   return srclen;
1401 }
1402 
1403 
/*
  Length of the character starting at "str".

  Runs the charset's own mb_wc() converter on [str, end) and returns its
  result unchanged; the decoded code point is discarded.
*/
static int
my_charlen_utf16(CHARSET_INFO *cs, const uchar *str, const uchar *end)
{
  my_wc_t wc;
  return cs->cset->mb_wc(cs, &wc, str, end);
}
1410 
1411 
/*
  Instantiate ctype-mb.inl with the _utf16 suffix, using
  my_charlen_utf16() as the character length routine.
*/
#define MY_FUNCTION_NAME(x)       my_ ## x ## _utf16
#define CHARLEN(cs,str,end)       my_charlen_utf16(cs,str,end)
#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
#include "ctype-mb.inl"
#undef MY_FUNCTION_NAME
#undef CHARLEN
#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
/* Defines my_well_formed_char_length_utf16 */
1420 
1421 
1422 static size_t
my_numchars_utf16(CHARSET_INFO * cs,const char * b,const char * e)1423 my_numchars_utf16(CHARSET_INFO *cs,
1424                   const char *b, const char *e)
1425 {
1426   size_t nchars= 0;
1427   for ( ; ; nchars++)
1428   {
1429     size_t charlen= my_ismbchar(cs, b, e);
1430     if (!charlen)
1431       break;
1432     b+= charlen;
1433   }
1434   return nchars;
1435 }
1436 
1437 
1438 static size_t
my_charpos_utf16(CHARSET_INFO * cs,const char * b,const char * e,size_t pos)1439 my_charpos_utf16(CHARSET_INFO *cs,
1440                  const char *b, const char *e, size_t pos)
1441 {
1442   const char *b0= b;
1443   uint charlen;
1444 
1445   for ( ; pos; b+= charlen, pos--)
1446   {
1447     if (!(charlen= my_ismbchar(cs, b, e)))
1448       return (e + 2 - b0); /* Error, return pos outside the string */
1449   }
1450   return (size_t) (pos ? (e + 2 - b0) : (b - b0));
1451 }
1452 
1453 
1454 static int
my_wildcmp_utf16_ci(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)1455 my_wildcmp_utf16_ci(CHARSET_INFO *cs,
1456                     const char *str,const char *str_end,
1457                     const char *wildstr,const char *wildend,
1458                     int escape, int w_one, int w_many)
1459 {
1460   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1461   return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
1462                             escape, w_one, w_many, uni_plane);
1463 }
1464 
1465 
/*
  Wildcard comparison for the _bin collations: forwards to
  my_wildcmp_unicode() with NULL in place of a case table.
*/
static int
my_wildcmp_utf16_bin(CHARSET_INFO *cs,
                     const char *str,const char *str_end,
                     const char *wildstr,const char *wildend,
                     int escape, int w_one, int w_many)
{
  return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
                            escape, w_one, w_many, NULL);
}
1475 
1476 
1477 static void
my_hash_sort_utf16_nopad_bin(CHARSET_INFO * cs,const uchar * pos,size_t len,ulong * nr1,ulong * nr2)1478 my_hash_sort_utf16_nopad_bin(CHARSET_INFO *cs  __attribute__((unused)),
1479                              const uchar *pos, size_t len,
1480                              ulong *nr1, ulong *nr2)
1481 {
1482   const uchar *end= pos + len;
1483   register ulong m1= *nr1, m2= *nr2;
1484 
1485   for ( ; pos < end ; pos++)
1486   {
1487     MY_HASH_ADD(m1, m2, (uint)*pos);
1488   }
1489   *nr1= m1;
1490   *nr2= m2;
1491 }
1492 
1493 
1494 static void
my_hash_sort_utf16_bin(CHARSET_INFO * cs,const uchar * pos,size_t len,ulong * nr1,ulong * nr2)1495 my_hash_sort_utf16_bin(CHARSET_INFO *cs,
1496                        const uchar *pos, size_t len, ulong *nr1, ulong *nr2)
1497 {
1498   size_t lengthsp= cs->cset->lengthsp(cs, (const char *) pos, len);
1499   my_hash_sort_utf16_nopad_bin(cs, pos, lengthsp, nr1, nr2);
1500 }
1501 
1502 
/*
  Collation handler for utf16_general_ci.
  The initializer is positional: entries must stay in the member order of
  MY_COLLATION_HANDLER (declared outside this file).
*/
static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16_general_ci,
  my_strnncollsp_utf16_general_ci,
  my_strnncollsp_nchars_utf16_general_ci,
  my_strnxfrm_utf16_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf16_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16,
  my_propagate_simple
};
1518 
1519 
/*
  Collation handler for utf16_bin.
  The initializer is positional: entries must stay in the member order of
  MY_COLLATION_HANDLER (declared outside this file).
*/
static MY_COLLATION_HANDLER my_collation_utf16_bin_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16_bin,
  my_strnncollsp_utf16_bin,
  my_strnncollsp_nchars_utf16_bin,
  my_strnxfrm_unicode_full_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf16_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16_bin,
  my_propagate_simple
};
1535 
1536 
/*
  Collation handler for utf16_general_nopad_ci.
  Compared with my_collation_utf16_general_ci_handler only the
  strnncollsp, strnncollsp_nchars, strnxfrm and hash_sort entries differ;
  the strnncoll entry is shared.
  The initializer is positional (member order of MY_COLLATION_HANDLER).
*/
static MY_COLLATION_HANDLER my_collation_utf16_general_nopad_ci_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16_general_ci,
  my_strnncollsp_utf16_general_nopad_ci,
  my_strnncollsp_nchars_utf16_general_nopad_ci,
  my_strnxfrm_nopad_utf16_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf16_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16_nopad,
  my_propagate_simple
};
1552 
1553 
/*
  Collation handler for utf16_nopad_bin.
  Compared with my_collation_utf16_bin_handler only the strnncollsp,
  strnncollsp_nchars, strnxfrm and hash_sort entries differ; the
  strnncoll entry is shared.
  The initializer is positional (member order of MY_COLLATION_HANDLER).
*/
static MY_COLLATION_HANDLER my_collation_utf16_nopad_bin_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16_bin,
  my_strnncollsp_utf16_nopad_bin,
  my_strnncollsp_nchars_utf16_nopad_bin,
  my_strnxfrm_unicode_full_nopad_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf16_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16_nopad_bin,
  my_propagate_simple
};
1569 
1570 
/*
  Charset handler referenced by all utf16 charset_info_st definitions in
  this file. Not static, so it is visible to other translation units.
  The initializer is positional: entries must stay in the member order of
  MY_CHARSET_HANDLER (declared outside this file).
*/
MY_CHARSET_HANDLER my_charset_utf16_handler=
{
  NULL,                /* init         */
  my_numchars_utf16,
  my_charpos_utf16,
  my_lengthsp_mb2,
  my_numcells_mb,
  my_utf16_uni,        /* mb_wc        */
  my_uni_utf16,        /* wc_mb        */
  my_mb_ctype_mb,
  my_caseup_str_mb2_or_mb4,
  my_casedn_str_mb2_or_mb4,
  my_caseup_utf16,
  my_casedn_utf16,
  my_snprintf_mb2,
  my_l10tostr_mb2_or_mb4,
  my_ll10tostr_mb2_or_mb4,
  my_fill_mb2,
  my_strntol_mb2_or_mb4,
  my_strntoul_mb2_or_mb4,
  my_strntoll_mb2_or_mb4,
  my_strntoull_mb2_or_mb4,
  my_strntod_mb2_or_mb4,
  my_strtoll10_mb2,
  my_strntoull10rnd_mb2_or_mb4,
  my_scan_mb2,
  my_charlen_utf16,
  my_well_formed_char_length_utf16,
  my_copy_fix_mb2_or_mb4,
  my_uni_utf16,        /* same function as the wc_mb entry above */
};
1602 
1603 
/*
  Descriptor of collation "utf16_general_ci" (number 54) of character set
  "utf16"; the only utf16 descriptor here flagged MY_CS_PRIMARY.
  The initializer is positional (member order of struct charset_info_st).
*/
struct charset_info_st my_charset_utf16_general_ci=
{
  54,0,0,              /* number       */
  /* flags */
  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf16",             /* cs name      */
  "utf16_general_ci",  /* name         */
  "UTF-16 Unicode",    /* comment      */
  NULL,                /* tailoring    */
  NULL,                /* ctype        */
  NULL,                /* to_lower     */
  NULL,                /* to_upper     */
  NULL,                /* sort_order   */
  NULL,                /* uca          */
  NULL,                /* tab_to_uni   */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo     */
  NULL,                /* state_map    */
  NULL,                /* ident_map    */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  2,                   /* mbminlen     */
  4,                   /* mbmaxlen     */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char      */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order   */
  &my_charset_utf16_handler,             /* charset handler   */
  &my_collation_utf16_general_ci_handler /* collation handler */
};
1635 
1636 
/*
  Descriptor of collation "utf16_bin" (number 55) of character set
  "utf16", flagged MY_CS_BINSORT.
  The initializer is positional (member order of struct charset_info_st).
*/
struct charset_info_st my_charset_utf16_bin=
{
  55,0,0,              /* number       */
  /* flags */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf16",             /* cs name      */
  "utf16_bin",         /* name         */
  "UTF-16 Unicode",    /* comment      */
  NULL,                /* tailoring    */
  NULL,                /* ctype        */
  NULL,                /* to_lower     */
  NULL,                /* to_upper     */
  NULL,                /* sort_order   */
  NULL,                /* uca          */
  NULL,                /* tab_to_uni   */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo     */
  NULL,                /* state_map    */
  NULL,                /* ident_map    */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  2,                   /* mbminlen     */
  4,                   /* mbmaxlen     */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char      */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order   */
  &my_charset_utf16_handler,      /* charset handler   */
  &my_collation_utf16_bin_handler /* collation handler */
};
1668 
1669 
/*
  Descriptor of collation "utf16_general_nopad_ci" of character set
  "utf16". Its number is MY_NOPAD_ID(54) and MY_CS_NOPAD is set in the
  flags.
  The initializer is positional (member order of struct charset_info_st).
*/
struct charset_info_st my_charset_utf16_general_nopad_ci=
{
  MY_NOPAD_ID(54),0,0, /* number           */
  /* flags */
  MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD,
  "utf16",             /* cs name          */
  "utf16_general_nopad_ci", /* name        */
  "UTF-16 Unicode",    /* comment          */
  NULL,                /* tailoring        */
  NULL,                /* ctype            */
  NULL,                /* to_lower         */
  NULL,                /* to_upper         */
  NULL,                /* sort_order       */
  NULL,                /* uca              */
  NULL,                /* tab_to_uni       */
  NULL,                /* tab_from_uni     */
  &my_unicase_default, /* caseinfo         */
  NULL,                /* state_map        */
  NULL,                /* ident_map        */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  2,                   /* mbminlen         */
  4,                   /* mbmaxlen         */
  0,                   /* min_sort_char    */
  0xFFFF,              /* max_sort_char    */
  ' ',                 /* pad char         */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf16_handler,                   /* charset handler   */
  &my_collation_utf16_general_nopad_ci_handler /* collation handler */
};
1701 
1702 
/*
  Descriptor of collation "utf16_nopad_bin" of character set "utf16".
  Its number is MY_NOPAD_ID(55); MY_CS_BINSORT and MY_CS_NOPAD are set in
  the flags.
  The initializer is positional (member order of struct charset_info_st).
*/
struct charset_info_st my_charset_utf16_nopad_bin=
{
  MY_NOPAD_ID(55),0,0, /* number           */
  /* flags (one expression spanning two lines) */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|
  MY_CS_NOPAD,
  "utf16",             /* cs name          */
  "utf16_nopad_bin",   /* name             */
  "UTF-16 Unicode",    /* comment          */
  NULL,                /* tailoring        */
  NULL,                /* ctype            */
  NULL,                /* to_lower         */
  NULL,                /* to_upper         */
  NULL,                /* sort_order       */
  NULL,                /* uca              */
  NULL,                /* tab_to_uni       */
  NULL,                /* tab_from_uni     */
  &my_unicase_default, /* caseinfo         */
  NULL,                /* state_map        */
  NULL,                /* ident_map        */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  2,                   /* mbminlen         */
  4,                   /* mbmaxlen         */
  0,                   /* min_sort_char    */
  0xFFFF,              /* max_sort_char    */
  ' ',                 /* pad char         */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf16_handler,            /* charset handler   */
  &my_collation_utf16_nopad_bin_handler /* collation handler */
};
1735 
1736 
/*
  strcoll.inl instantiations for utf16le.

  The surrogate tests look at b1 and b3, i.e. the second byte of each
  16-bit unit, and the weight macros pass the bytes of every unit in
  swapped order (b1,b0 / b3,b2) to the same helpers used for utf16.
  MY_MB_WC goes through the charset's own mb_wc() converter here.
  NOTE(review): the macros are redefined before each #include without an
  #undef here, so strcoll.inl presumably undefines them itself - confirm.
*/
#define IS_MB2_CHAR(b0,b1)       (!MY_UTF16_SURROGATE_HEAD(b1))
#define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b1) && MY_UTF16_LOW_HEAD(b3))

/*
  utf16le_general_ci: 2-byte characters are weighted through the default
  unicase pages, every 4-byte character gets MY_CS_REPLACEMENT_CHARACTER.
*/
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16le_general_ci
#define DEFINE_STRNXFRM_UNICODE
#define DEFINE_STRNXFRM_UNICODE_NOPAD
#define MY_MB_WC(cs, pwc, s, e)  (cs->cset->mb_wc(cs, pwc, s, e))
#define OPTIMIZE_ASCII           0
#define UNICASE_MAXCHAR          MY_UNICASE_INFO_DEFAULT_MAXCHAR
#define UNICASE_PAGE0            my_unicase_default_page00
#define UNICASE_PAGES            my_unicase_default_pages
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        my_weight_mb2_utf16mb2_general_ci(b1,b0)
#define WEIGHT_MB4(b0,b1,b2,b3)  MY_CS_REPLACEMENT_CHARACTER
#include "strcoll.inl"

/* utf16le_bin: the weight of a character is its code point. */
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16le_bin
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        ((int) MY_UTF16_WC2(b1, b0))
#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) MY_UTF16_WC4(b1, b0, b3, b2))
#include "strcoll.inl"

/* utf16le_general_nopad_ci: same weights as utf16le_general_ci. */
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16le_general_nopad_ci
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        my_weight_mb2_utf16mb2_general_ci(b1,b0)
#define WEIGHT_MB4(b0,b1,b2,b3)  MY_CS_REPLACEMENT_CHARACTER
#include "strcoll.inl"

/* utf16le_nopad_bin: same weights as utf16le_bin. */
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16le_nopad_bin
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        ((int) MY_UTF16_WC2(b1, b0))
#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) MY_UTF16_WC4(b1, b0, b3, b2))
#include "strcoll.inl"

#undef IS_MB2_CHAR
#undef IS_MB4_CHAR
1775 
1776 static int
my_utf16le_uni(CHARSET_INFO * cs,my_wc_t * pwc,const uchar * s,const uchar * e)1777 my_utf16le_uni(CHARSET_INFO *cs __attribute__((unused)),
1778                my_wc_t *pwc, const uchar *s, const uchar *e)
1779 {
1780   my_wc_t lo;
1781 
1782   if (s + 2 > e)
1783     return MY_CS_TOOSMALL2;
1784 
1785   if ((*pwc= uint2korr(s)) < MY_UTF16_SURROGATE_HIGH_FIRST ||
1786       (*pwc > MY_UTF16_SURROGATE_LOW_LAST))
1787     return 2; /* [0000-D7FF,E000-FFFF] */
1788 
1789   if (*pwc >= MY_UTF16_SURROGATE_LOW_FIRST)
1790     return MY_CS_ILSEQ; /* [DC00-DFFF] Low surrogate part without high part */
1791 
1792   if (s + 4  > e)
1793     return MY_CS_TOOSMALL4;
1794 
1795   s+= 2;
1796 
1797   if ((lo= uint2korr(s)) < MY_UTF16_SURROGATE_LOW_FIRST ||
1798       lo > MY_UTF16_SURROGATE_LOW_LAST)
1799     return MY_CS_ILSEQ; /* Expected low surrogate part, got something else */
1800 
1801   *pwc= 0x10000 + (((*pwc & 0x3FF) << 10) | (lo & 0x3FF));
1802   return 4;
1803 }
1804 
1805 
1806 static int
my_uni_utf16le(CHARSET_INFO * cs,my_wc_t wc,uchar * s,uchar * e)1807 my_uni_utf16le(CHARSET_INFO *cs __attribute__((unused)),
1808                my_wc_t wc, uchar *s, uchar *e)
1809 {
1810   uint32 first, second, total;
1811   if (wc < MY_UTF16_SURROGATE_HIGH_FIRST ||
1812       (wc > MY_UTF16_SURROGATE_LOW_LAST &&
1813        wc <= 0xFFFF))
1814   {
1815     if (s + 2 > e)
1816       return MY_CS_TOOSMALL2;
1817     int2store(s, wc);
1818     return 2; /* [0000-D7FF,E000-FFFF] */
1819   }
1820 
1821   if (wc < 0xFFFF || wc > 0x10FFFF)
1822     return MY_CS_ILUNI; /* [D800-DFFF,10FFFF+] */
1823 
1824   if (s + 4 > e)
1825     return MY_CS_TOOSMALL4;
1826 
1827   wc-= 0x10000;
1828   first=  (0xD800 | ((wc >> 10) & 0x3FF));
1829   second= (0xDC00 | (wc & 0x3FF));
1830   total=  first | (second << 16);
1831   int4store(s, total);
1832   return 4; /* [010000-10FFFF] */
1833 }
1834 
1835 
1836 static size_t
my_lengthsp_utf16le(CHARSET_INFO * cs,const char * ptr,size_t length)1837 my_lengthsp_utf16le(CHARSET_INFO *cs __attribute__((unused)),
1838                     const char *ptr, size_t length)
1839 {
1840   const char *end= ptr + length;
1841   while (end > ptr + 1 && uint2korr(end - 2) == ' ')
1842     end-= 2;
1843   return (size_t) (end - ptr);
1844 }
1845 
1846 
/*
  Collation handler for utf16le_general_ci.
  Only the strnncoll*, strnxfrm entries are utf16le-specific; wildcmp and
  hash_sort are the same functions the utf16 handler uses.
  The initializer is positional (member order of MY_COLLATION_HANDLER).
*/
static MY_COLLATION_HANDLER my_collation_utf16le_general_ci_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16le_general_ci,
  my_strnncollsp_utf16le_general_ci,
  my_strnncollsp_nchars_utf16le_general_ci,
  my_strnxfrm_utf16le_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf16_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16,
  my_propagate_simple
};
1862 
1863 
/*
  Collation handler for utf16le_bin.
  Only the strnncoll* entries are utf16le-specific; the remaining entries
  are the same functions the utf16_bin handler uses.
  The initializer is positional (member order of MY_COLLATION_HANDLER).
*/
static MY_COLLATION_HANDLER my_collation_utf16le_bin_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16le_bin,
  my_strnncollsp_utf16le_bin,
  my_strnncollsp_nchars_utf16le_bin,
  my_strnxfrm_unicode_full_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf16_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16_bin,
  my_propagate_simple
};
1879 
1880 
/*
  Collation handler for utf16le_general_nopad_ci.
  Compared with my_collation_utf16le_general_ci_handler only the
  strnncollsp, strnncollsp_nchars, strnxfrm and hash_sort entries differ;
  the strnncoll entry is shared.
  The initializer is positional (member order of MY_COLLATION_HANDLER).
*/
static MY_COLLATION_HANDLER my_collation_utf16le_general_nopad_ci_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16le_general_ci,
  my_strnncollsp_utf16le_general_nopad_ci,
  my_strnncollsp_nchars_utf16le_general_nopad_ci,
  my_strnxfrm_nopad_utf16le_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf16_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16_nopad,
  my_propagate_simple
};
1896 
1897 
/*
  Collation handler for utf16le_nopad_bin.
  Compared with my_collation_utf16le_bin_handler only the strnncollsp,
  strnncollsp_nchars, strnxfrm and hash_sort entries differ; the
  strnncoll entry is shared.
  The initializer is positional (member order of MY_COLLATION_HANDLER).
*/
static MY_COLLATION_HANDLER my_collation_utf16le_nopad_bin_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16le_bin,
  my_strnncollsp_utf16le_nopad_bin,
  my_strnncollsp_nchars_utf16le_nopad_bin,
  my_strnxfrm_unicode_full_nopad_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf16_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16_nopad_bin,
  my_propagate_simple
};
1913 
1914 
/*
  Charset handler referenced by the utf16le charset_info_st definitions.
  It differs from my_charset_utf16_handler in the lengthsp entry, the
  mb_wc / wc_mb converters and the last entry; everything else is shared.
  The initializer is positional: entries must stay in the member order of
  MY_CHARSET_HANDLER (declared outside this file).
*/
static MY_CHARSET_HANDLER my_charset_utf16le_handler=
{
  NULL,                /* init         */
  my_numchars_utf16,
  my_charpos_utf16,
  my_lengthsp_utf16le,
  my_numcells_mb,
  my_utf16le_uni,      /* mb_wc        */
  my_uni_utf16le,      /* wc_mb        */
  my_mb_ctype_mb,
  my_caseup_str_mb2_or_mb4,
  my_casedn_str_mb2_or_mb4,
  my_caseup_utf16,
  my_casedn_utf16,
  my_snprintf_mb2,
  my_l10tostr_mb2_or_mb4,
  my_ll10tostr_mb2_or_mb4,
  my_fill_mb2,
  my_strntol_mb2_or_mb4,
  my_strntoul_mb2_or_mb4,
  my_strntoll_mb2_or_mb4,
  my_strntoull_mb2_or_mb4,
  my_strntod_mb2_or_mb4,
  my_strtoll10_mb2,
  my_strntoull10rnd_mb2_or_mb4,
  my_scan_mb2,
  my_charlen_utf16,
  my_well_formed_char_length_utf16,
  my_copy_fix_mb2_or_mb4,
  my_uni_utf16le,      /* same function as the wc_mb entry above */
};
1946 
1947 
/*
  Descriptor of collation "utf16le_general_ci" (number 56) of character
  set "utf16le", flagged MY_CS_PRIMARY.
  The initializer is positional (member order of struct charset_info_st).
*/
struct charset_info_st my_charset_utf16le_general_ci=
{
  56,0,0,              /* number       */
  /* flags */
  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf16le",           /* cs name      */
  "utf16le_general_ci",/* name         */
  "UTF-16LE Unicode",  /* comment      */
  NULL,                /* tailoring    */
  NULL,                /* ctype        */
  NULL,                /* to_lower     */
  NULL,                /* to_upper     */
  NULL,                /* sort_order   */
  NULL,                /* uca          */
  NULL,                /* tab_to_uni   */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo     */
  NULL,                /* state_map    */
  NULL,                /* ident_map    */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  2,                   /* mbminlen     */
  4,                   /* mbmaxlen     */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char      */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order   */
  &my_charset_utf16le_handler,             /* charset handler   */
  &my_collation_utf16le_general_ci_handler /* collation handler */
};
1979 
1980 
/*
  Descriptor of collation "utf16le_bin" (number 62) of character set
  "utf16le", flagged MY_CS_BINSORT.
  The initializer is positional (member order of struct charset_info_st).
*/
struct charset_info_st my_charset_utf16le_bin=
{
  62,0,0,              /* number       */
  /* flags */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf16le",           /* cs name      */
  "utf16le_bin",       /* name         */
  "UTF-16LE Unicode",  /* comment      */
  NULL,                /* tailoring    */
  NULL,                /* ctype        */
  NULL,                /* to_lower     */
  NULL,                /* to_upper     */
  NULL,                /* sort_order   */
  NULL,                /* uca          */
  NULL,                /* tab_to_uni   */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo     */
  NULL,                /* state_map    */
  NULL,                /* ident_map    */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  2,                   /* mbminlen     */
  4,                   /* mbmaxlen     */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char      */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order   */
  &my_charset_utf16le_handler,      /* charset handler   */
  &my_collation_utf16le_bin_handler /* collation handler */
};
2012 
2013 
/*
  utf16le_general_nopad_ci: the MY_CS_NOPAD variant of the general_ci
  collation of "utf16le"; its id is derived from 56 through MY_NOPAD_ID().
  A character occupies 2 to 4 bytes (mbminlen/mbmaxlen).  The initializer
  is positional, so the entry order must match struct charset_info_st.
*/
struct charset_info_st my_charset_utf16le_general_nopad_ci=
{
  MY_NOPAD_ID(56),0,0, /* number           */
  MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD,
  "utf16le",           /* cs name          */
  "utf16le_general_nopad_ci",/* name       */
  "UTF-16LE Unicode",  /* comment          */
  NULL,                /* tailoring        */
  NULL,                /* ctype            */
  NULL,                /* to_lower         */
  NULL,                /* to_upper         */
  NULL,                /* sort_order       */
  NULL,                /* uca              */
  NULL,                /* tab_to_uni       */
  NULL,                /* tab_from_uni     */
  &my_unicase_default, /* caseinfo         */
  NULL,                /* state_map        */
  NULL,                /* ident_map        */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  2,                   /* mbminlen         */
  4,                   /* mbmaxlen         */
  0,                   /* min_sort_char    */
  0xFFFF,              /* max_sort_char    */
  ' ',                 /* pad char         */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf16le_handler,                   /* character set handler */
  &my_collation_utf16le_general_nopad_ci_handler /* collation handler     */
};
2045 
2046 
/*
  utf16le_nopad_bin: the MY_CS_NOPAD variant of the binary-sort collation
  of "utf16le"; its id is derived from 62 through MY_NOPAD_ID().
  A character occupies 2 to 4 bytes (mbminlen/mbmaxlen).  The initializer
  is positional, so the entry order must match struct charset_info_st.
*/
struct charset_info_st my_charset_utf16le_nopad_bin=
{
  MY_NOPAD_ID(62),0,0, /* number           */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|
  MY_CS_NOPAD,
  "utf16le",           /* cs name          */
  "utf16le_nopad_bin", /* name             */
  "UTF-16LE Unicode",  /* comment          */
  NULL,                /* tailoring        */
  NULL,                /* ctype            */
  NULL,                /* to_lower         */
  NULL,                /* to_upper         */
  NULL,                /* sort_order       */
  NULL,                /* uca              */
  NULL,                /* tab_to_uni       */
  NULL,                /* tab_from_uni     */
  &my_unicase_default, /* caseinfo         */
  NULL,                /* state_map        */
  NULL,                /* ident_map        */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  2,                   /* mbminlen         */
  4,                   /* mbmaxlen         */
  0,                   /* min_sort_char    */
  0xFFFF,              /* max_sort_char    */
  ' ',                 /* pad char         */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf16le_handler,            /* character set handler */
  &my_collation_utf16le_nopad_bin_handler /* collation handler     */
};
2079 
2080 
2081 #endif /* HAVE_CHARSET_utf16 */
2082 
2083 
2084 #ifdef HAVE_CHARSET_utf32
2085 
2086 #include "ctype-utf32.h"
2087 
/*
  Check whether b0 and b1 start a valid UTF-32 four-byte sequence.
  Characters greater than U+10FFFF are not accepted: the first byte must
  be zero and the second byte must not exceed 0x10.
*/
#define IS_UTF32_MBHEAD4(b0,b1) (!(b0) && ((uchar) (b1) <= 0x10))

/* Four-byte character test; only the two leading bytes are inspected. */
#define IS_MB4_CHAR(b0,b1,b2,b3)   (IS_UTF32_MBHEAD4(b0,b1))
2095 
2096 
/*
  Sort weight of one UTF-32 character for the general_ci collations.

  The four bytes are combined with MY_UTF32_WC4().  Code points up to
  0xFFFF are mapped through my_unicase_default_pages (or returned as is
  when the page is missing); anything above gets the weight of
  MY_CS_REPLACEMENT_CHARACTER.
*/
static inline int my_weight_utf32_general_ci(uchar b0, uchar b1,
                                             uchar b2, uchar b3)
{
  const my_wc_t code= MY_UTF32_WC4(b0, b1, b2, b3);
  MY_UNICASE_CHARACTER *row;

  if (code > 0xFFFF)
    return MY_CS_REPLACEMENT_CHARACTER;

  row= my_unicase_default_pages[code >> 8];
  if (!row)
    return (int) code;                  /* No case data for this page */
  return (int) row[code & 0xFF].sort;
}
/*
  Instantiate strcoll.inl four times, once per UTF-32 collation.
  MY_FUNCTION_NAME() builds the generated identifiers; the collation
  handler tables further down refer to the results, e.g.
  my_strnncoll_utf32_general_ci or my_strnncollsp_utf32_nopad_bin.

  NOTE(review): the parameter macros (MY_FUNCTION_NAME, WEIGHT_ILSEQ,
  WEIGHT_MB4, DEFINE_*) are redefined before each include without an
  #undef here, so strcoll.inl presumably undefines them itself -- confirm.
*/

/* Instance 1: utf32_general_ci (case-insensitive weights, also strnxfrm) */
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf32_general_ci
#define DEFINE_STRNXFRM_UNICODE
#define DEFINE_STRNXFRM_UNICODE_NOPAD
#define MY_MB_WC(cs, pwc, s, e)  my_mb_wc_utf32_quick(pwc, s, e)
#define OPTIMIZE_ASCII           0
#define UNICASE_MAXCHAR          MY_UNICASE_INFO_DEFAULT_MAXCHAR
#define UNICASE_PAGE0            my_unicase_default_page00
#define UNICASE_PAGES            my_unicase_default_pages
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB4(b0,b1,b2,b3)  my_weight_utf32_general_ci(b0, b1, b2, b3)
#include "strcoll.inl"

/* Instance 2: utf32_bin (the weight is the code point itself) */
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf32_bin
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) MY_UTF32_WC4(b0, b1, b2, b3))
#include "strcoll.inl"

/* Instance 3: utf32_general_nopad_ci */
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf32_general_nopad_ci
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB4(b0,b1,b2,b3)  my_weight_utf32_general_ci(b0, b1, b2, b3)
#include "strcoll.inl"

/* Instance 4: utf32_nopad_bin */
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf32_nopad_bin
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) MY_UTF32_WC4(b0, b1, b2, b3))
#include "strcoll.inl"

/* The character-test macros are not needed past this point */
#undef IS_MB2_CHAR
#undef IS_MB4_CHAR
2139 
2140 
2141 static int
my_utf32_uni(CHARSET_INFO * cs,my_wc_t * pwc,const uchar * s,const uchar * e)2142 my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
2143              my_wc_t *pwc, const uchar *s, const uchar *e)
2144 {
2145   return my_mb_wc_utf32_quick(pwc, s, e);
2146 }
2147 
2148 
2149 static int
my_uni_utf32(CHARSET_INFO * cs,my_wc_t wc,uchar * s,uchar * e)2150 my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)),
2151              my_wc_t wc, uchar *s, uchar *e)
2152 {
2153   if (s + 4 > e)
2154     return MY_CS_TOOSMALL4;
2155 
2156   if (wc > 0x10FFFF)
2157     return MY_CS_ILUNI;
2158 
2159   s[0]= (uchar) (wc >> 24);
2160   s[1]= (uchar) (wc >> 16) & 0xFF;
2161   s[2]= (uchar) (wc >> 8)  & 0xFF;
2162   s[3]= (uchar) wc & 0xFF;
2163   return 4;
2164 }
2165 
2166 
2167 static inline void
my_tolower_utf32(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2168 my_tolower_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2169 {
2170   MY_UNICASE_CHARACTER *page;
2171   if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
2172     *wc= page[*wc & 0xFF].tolower;
2173 }
2174 
2175 
2176 static inline void
my_toupper_utf32(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2177 my_toupper_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2178 {
2179   MY_UNICASE_CHARACTER *page;
2180   if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
2181     *wc= page[*wc & 0xFF].toupper;
2182 }
2183 
2184 
2185 static inline void
my_tosort_utf32(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2186 my_tosort_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2187 {
2188   if (*wc <= uni_plane->maxchar)
2189   {
2190     MY_UNICASE_CHARACTER *page;
2191     if ((page= uni_plane->page[*wc >> 8]))
2192       *wc= page[*wc & 0xFF].sort;
2193   }
2194   else
2195   {
2196     *wc= MY_CS_REPLACEMENT_CHARACTER;
2197   }
2198 }
2199 
2200 
2201 static size_t
my_lengthsp_utf32(CHARSET_INFO * cs,const char * ptr,size_t length)2202 my_lengthsp_utf32(CHARSET_INFO *cs __attribute__((unused)),
2203                   const char *ptr, size_t length)
2204 {
2205   const char *end= ptr + length;
2206   DBUG_ASSERT((length % 4) == 0);
2207   while (end > ptr + 3 && end[-1] == ' ' && !end[-2] && !end[-3] && !end[-4])
2208     end-= 4;
2209   return (size_t) (end - ptr);
2210 }
2211 
2212 
2213 static size_t
my_caseup_utf32(CHARSET_INFO * cs,const char * src,size_t srclen,char * dst,size_t dstlen)2214 my_caseup_utf32(CHARSET_INFO *cs, const char *src, size_t srclen,
2215                 char *dst, size_t dstlen)
2216 {
2217   my_wc_t wc;
2218   int res;
2219   const char *srcend= src + srclen;
2220   char *dstend= dst + dstlen;
2221   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2222   DBUG_ASSERT(srclen <= dstlen);
2223 
2224   while ((src < srcend) &&
2225          (res= my_utf32_uni(cs, &wc, (uchar *)src, (uchar*) srcend)) > 0)
2226   {
2227     my_toupper_utf32(uni_plane, &wc);
2228     if (res != my_uni_utf32(cs, wc, (uchar*) dst, (uchar*) dstend))
2229       break;
2230     src+= res;
2231     dst+= res;
2232   }
2233   return srclen;
2234 }
2235 
2236 
2237 static void
my_hash_sort_utf32_nopad(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)2238 my_hash_sort_utf32_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
2239                          ulong *nr1, ulong *nr2)
2240 {
2241   my_wc_t wc;
2242   int res;
2243   const uchar *e= s + slen;
2244   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2245   register ulong m1= *nr1, m2= *nr2;
2246 
2247   while ((res= my_utf32_uni(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
2248   {
2249     my_tosort_utf32(uni_plane, &wc);
2250     MY_HASH_ADD(m1, m2, (uint) (wc >> 24));
2251     MY_HASH_ADD(m1, m2, (uint) (wc >> 16) & 0xFF);
2252     MY_HASH_ADD(m1, m2, (uint) (wc >> 8)  & 0xFF);
2253     MY_HASH_ADD(m1, m2, (uint) (wc & 0xFF));
2254     s+= res;
2255   }
2256   *nr1= m1;
2257   *nr2= m2;
2258 }
2259 
2260 
2261 static void
my_hash_sort_utf32(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)2262 my_hash_sort_utf32(CHARSET_INFO *cs, const uchar *s, size_t slen,
2263                    ulong *nr1, ulong *nr2)
2264 {
2265   size_t lengthsp= my_lengthsp_utf32(cs, (const char *) s, slen);
2266   my_hash_sort_utf32_nopad(cs, s, lengthsp, nr1, nr2);
2267 }
2268 
2269 
2270 static size_t
my_casedn_utf32(CHARSET_INFO * cs,const char * src,size_t srclen,char * dst,size_t dstlen)2271 my_casedn_utf32(CHARSET_INFO *cs, const char *src, size_t srclen,
2272                 char *dst, size_t dstlen)
2273 {
2274   my_wc_t wc;
2275   int res;
2276   const char *srcend= src + srclen;
2277   char *dstend= dst + dstlen;
2278   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2279   DBUG_ASSERT(srclen <= dstlen);
2280 
2281   while ((res= my_utf32_uni(cs, &wc, (uchar*) src, (uchar*) srcend)) > 0)
2282   {
2283     my_tolower_utf32(uni_plane,&wc);
2284     if (res != my_uni_utf32(cs, wc, (uchar*) dst, (uchar*) dstend))
2285       break;
2286     src+= res;
2287     dst+= res;
2288   }
2289   return srclen;
2290 }
2291 
2292 
2293 static int
my_charlen_utf32(CHARSET_INFO * cs,const uchar * b,const uchar * e)2294 my_charlen_utf32(CHARSET_INFO *cs __attribute__((unused)),
2295                  const uchar *b, const uchar *e)
2296 {
2297   return b + 4 > e ? MY_CS_TOOSMALL4 :
2298          IS_UTF32_MBHEAD4(b[0], b[1]) ? 4 : MY_CS_ILSEQ;
2299 }
2300 
2301 
/*
  Instantiate ctype-mb.inl for UTF-32.  CHARLEN() is bound to
  my_charlen_utf32() and DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
  selects the routine to generate.  All three parameter macros are
  undefined again right after the include.
*/
#define MY_FUNCTION_NAME(x)       my_ ## x ## _utf32
#define CHARLEN(cs,str,end)       my_charlen_utf32(cs,str,end)
#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
#include "ctype-mb.inl"
#undef MY_FUNCTION_NAME
#undef CHARLEN
#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
/* Defines my_well_formed_char_length_utf32 */
2310 
2311 
2312 static size_t
my_vsnprintf_utf32(char * dst,size_t n,const char * fmt,va_list ap)2313 my_vsnprintf_utf32(char *dst, size_t n, const char* fmt, va_list ap)
2314 {
2315   char *start= dst, *end= dst + n;
2316   DBUG_ASSERT((n % 4) == 0);
2317   for (; *fmt ; fmt++)
2318   {
2319     if (fmt[0] != '%')
2320     {
2321       if (dst >= end)                        /* End of buffer */
2322         break;
2323 
2324       *dst++= '\0';
2325       *dst++= '\0';
2326       *dst++= '\0';
2327       *dst++= *fmt;        /* Copy ordinary char */
2328       continue;
2329     }
2330 
2331     fmt++;
2332 
2333     /* Skip if max size is used (to be compatible with printf) */
2334     while ( (*fmt>='0' && *fmt<='9') || *fmt == '.' || *fmt == '-')
2335       fmt++;
2336 
2337     if (*fmt == 'l')
2338       fmt++;
2339 
2340     if (*fmt == 's')                                /* String parameter */
2341     {
2342       reg2 char *par= va_arg(ap, char *);
2343       size_t plen;
2344       size_t left_len= (size_t)(end - dst);
2345       if (!par) par= (char*)"(null)";
2346       plen= strlen(par);
2347       if (left_len <= plen*4)
2348         plen= left_len / 4 - 1;
2349 
2350       for ( ; plen ; plen--, dst+= 4, par++)
2351       {
2352         dst[0]= '\0';
2353         dst[1]= '\0';
2354         dst[2]= '\0';
2355         dst[3]= par[0];
2356       }
2357       continue;
2358     }
2359     else if (*fmt == 'd' || *fmt == 'u')        /* Integer parameter */
2360     {
2361       register int iarg;
2362       char nbuf[16];
2363       char *pbuf= nbuf;
2364 
2365       if ((size_t) (end - dst) < 64)
2366         break;
2367       iarg= va_arg(ap, int);
2368       if (*fmt == 'd')
2369         int10_to_str((long) iarg, nbuf, -10);
2370       else
2371         int10_to_str((long) (uint) iarg,nbuf,10);
2372 
2373       for (; pbuf[0]; pbuf++)
2374       {
2375         *dst++= '\0';
2376         *dst++= '\0';
2377         *dst++= '\0';
2378         *dst++= *pbuf;
2379       }
2380       continue;
2381     }
2382 
2383     /* We come here on '%%', unknown code or too long parameter */
2384     if (dst == end)
2385       break;
2386     *dst++= '\0';
2387     *dst++= '\0';
2388     *dst++= '\0';
2389     *dst++= '%';    /* % used as % or unknown code */
2390   }
2391 
2392   DBUG_ASSERT(dst < end);
2393   *dst++= '\0';
2394   *dst++= '\0';
2395   *dst++= '\0';
2396   *dst++= '\0';     /* End of errmessage */
2397   return (size_t) (dst - start - 4);
2398 }
2399 
2400 
2401 static size_t
my_snprintf_utf32(CHARSET_INFO * cs,char * to,size_t n,const char * fmt,...)2402 my_snprintf_utf32(CHARSET_INFO *cs __attribute__((unused)),
2403                   char* to, size_t n, const char* fmt, ...)
2404 {
2405   size_t ret;
2406   va_list args;
2407   va_start(args,fmt);
2408   ret= my_vsnprintf_utf32(to, n, fmt, args);
2409   va_end(args);
2410   return ret;
2411 }
2412 
2413 
2414 static longlong
my_strtoll10_utf32(CHARSET_INFO * cs,const char * nptr,char ** endptr,int * error)2415 my_strtoll10_utf32(CHARSET_INFO *cs __attribute__((unused)),
2416                    const char *nptr, char **endptr, int *error)
2417 {
2418   const char *s, *end, *start, *n_end, *true_end;
2419   uchar c;
2420   unsigned long i, j, k;
2421   ulonglong li;
2422   int negative;
2423   ulong cutoff, cutoff2, cutoff3;
2424 
2425   s= nptr;
2426   /* If fixed length string */
2427   if (endptr)
2428   {
2429     /* Make sure string length is even */
2430     end= s + ((*endptr - s) / 4) * 4;
2431     while (s < end && !s[0] && !s[1] && !s[2] &&
2432            (s[3] == ' ' || s[3] == '\t'))
2433       s+= 4;
2434     if (s == end)
2435       goto no_conv;
2436   }
2437   else
2438   {
2439      /* We don't support null terminated strings in UCS2 */
2440      goto no_conv;
2441   }
2442 
2443   /* Check for a sign. */
2444   negative= 0;
2445   if (!s[0] && !s[1] && !s[2] && s[3] == '-')
2446   {
2447     *error= -1;                                        /* Mark as negative number */
2448     negative= 1;
2449     s+= 4;
2450     if (s == end)
2451       goto no_conv;
2452     cutoff=  MAX_NEGATIVE_NUMBER / LFACTOR2;
2453     cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
2454     cutoff3=  MAX_NEGATIVE_NUMBER % 100;
2455   }
2456   else
2457   {
2458     *error= 0;
2459     if (!s[0] && !s[1] && !s[2] && s[3] == '+')
2460     {
2461       s+= 4;
2462       if (s == end)
2463         goto no_conv;
2464     }
2465     cutoff=  ULONGLONG_MAX / LFACTOR2;
2466     cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
2467     cutoff3=  ULONGLONG_MAX % 100;
2468   }
2469 
2470   /* Handle case where we have a lot of pre-zero */
2471   if (!s[0] && !s[1] && !s[2] && s[3] == '0')
2472   {
2473     i= 0;
2474     do
2475     {
2476       s+= 4;
2477       if (s == end)
2478         goto end_i;                                /* Return 0 */
2479     }
2480     while (!s[0] && !s[1] && !s[2] && s[3] == '0');
2481     n_end= s + 4 * INIT_CNT;
2482   }
2483   else
2484   {
2485     /* Read first digit to check that it's a valid number */
2486     if (s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9)
2487       goto no_conv;
2488     i= c;
2489     s+= 4;
2490     n_end= s + 4 * (INIT_CNT-1);
2491   }
2492 
2493   /* Handle first 9 digits and store them in i */
2494   if (n_end > end)
2495     n_end= end;
2496   for (; s != n_end ; s+= 4)
2497   {
2498     if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2499       goto end_i;
2500     i= i * 10 + c;
2501   }
2502   if (s == end)
2503     goto end_i;
2504 
2505   /* Handle next 9 digits and store them in j */
2506   j= 0;
2507   start= s;                                /* Used to know how much to shift i */
2508   n_end= true_end= s + 4 * INIT_CNT;
2509   if (n_end > end)
2510     n_end= end;
2511   do
2512   {
2513     if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2514       goto end_i_and_j;
2515     j= j * 10 + c;
2516     s+= 4;
2517   } while (s != n_end);
2518   if (s == end)
2519   {
2520     if (s != true_end)
2521       goto end_i_and_j;
2522     goto end3;
2523   }
2524   if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2525     goto end3;
2526 
2527   /* Handle the next 1 or 2 digits and store them in k */
2528   k=c;
2529   s+= 4;
2530   if (s == end || s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9)
2531     goto end4;
2532   k= k * 10 + c;
2533   s+= 4;
2534   *endptr= (char*) s;
2535 
2536   /* number string should have ended here */
2537   if (s != end && !s[0] && !s[1] && !s[2] && (c= (s[3] - '0')) <= 9)
2538     goto overflow;
2539 
2540   /* Check that we didn't get an overflow with the last digit */
2541   if (i > cutoff || (i == cutoff && ((j > cutoff2 || j == cutoff2) &&
2542                                      k > cutoff3)))
2543     goto overflow;
2544   li= i * LFACTOR2+ (ulonglong) j * 100 + k;
2545   return (longlong) li;
2546 
2547 overflow:                                        /* *endptr is set here */
2548   *error= MY_ERRNO_ERANGE;
2549   return negative ? LONGLONG_MIN : (longlong) ULONGLONG_MAX;
2550 
2551 end_i:
2552   *endptr= (char*) s;
2553   return (negative ? ((longlong) -(long) i) : (longlong) i);
2554 
2555 end_i_and_j:
2556   li= (ulonglong) i * lfactor[(size_t) (s-start) / 4] + j;
2557   *endptr= (char*) s;
2558   return (negative ? -((longlong) li) : (longlong) li);
2559 
2560 end3:
2561   li= (ulonglong) i*LFACTOR+ (ulonglong) j;
2562   *endptr= (char*) s;
2563   return (negative ? -((longlong) li) : (longlong) li);
2564 
2565 end4:
2566   li= (ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
2567   *endptr= (char*) s;
2568   if (negative)
2569   {
2570    if (li > MAX_NEGATIVE_NUMBER)
2571      goto overflow;
2572    return -((longlong) li);
2573   }
2574   return (longlong) li;
2575 
2576 no_conv:
2577   /* There was no number to convert.  */
2578   *error= MY_ERRNO_EDOM;
2579   *endptr= (char *) nptr;
2580   return 0;
2581 }
2582 
2583 
2584 static size_t
my_numchars_utf32(CHARSET_INFO * cs,const char * b,const char * e)2585 my_numchars_utf32(CHARSET_INFO *cs __attribute__((unused)),
2586                   const char *b, const char *e)
2587 {
2588   return (size_t) (e - b) / 4;
2589 }
2590 
2591 
2592 static size_t
my_charpos_utf32(CHARSET_INFO * cs,const char * b,const char * e,size_t pos)2593 my_charpos_utf32(CHARSET_INFO *cs __attribute__((unused)),
2594                  const char *b, const char *e, size_t pos)
2595 {
2596   size_t string_length= (size_t) (e - b);
2597   return pos * 4 > string_length ? string_length + 4 : pos * 4;
2598 }
2599 
2600 
2601 static
my_fill_utf32(CHARSET_INFO * cs,char * s,size_t slen,int fill)2602 void my_fill_utf32(CHARSET_INFO *cs,
2603                    char *s, size_t slen, int fill)
2604 {
2605   char buf[10];
2606 #ifdef DBUG_ASSERT_EXISTS
2607   uint buflen;
2608 #endif
2609   char *e= s + slen;
2610 
2611   DBUG_ASSERT((slen % 4) == 0);
2612 
2613 #ifdef DBUG_ASSERT_EXISTS
2614   buflen=
2615 #endif
2616     cs->cset->wc_mb(cs, (my_wc_t) fill, (uchar*) buf,
2617                     (uchar*) buf + sizeof(buf));
2618   DBUG_ASSERT(buflen == 4);
2619   while (s < e)
2620   {
2621     memcpy(s, buf, 4);
2622     s+= 4;
2623   }
2624 }
2625 
2626 
2627 static int
my_wildcmp_utf32_ci(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)2628 my_wildcmp_utf32_ci(CHARSET_INFO *cs,
2629                     const char *str, const char *str_end,
2630                     const char *wildstr, const char *wildend,
2631                     int escape, int w_one, int w_many)
2632 {
2633   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2634   return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
2635                             escape, w_one, w_many, uni_plane);
2636 }
2637 
2638 
2639 static int
my_wildcmp_utf32_bin(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)2640 my_wildcmp_utf32_bin(CHARSET_INFO *cs,
2641                      const char *str,const char *str_end,
2642                      const char *wildstr,const char *wildend,
2643                      int escape, int w_one, int w_many)
2644 {
2645   return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
2646                             escape, w_one, w_many, NULL);
2647 }
2648 
2649 
2650 static size_t
my_scan_utf32(CHARSET_INFO * cs,const char * str,const char * end,int sequence_type)2651 my_scan_utf32(CHARSET_INFO *cs,
2652               const char *str, const char *end, int sequence_type)
2653 {
2654   const char *str0= str;
2655 
2656   switch (sequence_type)
2657   {
2658   case MY_SEQ_SPACES:
2659     for ( ; str < end; )
2660     {
2661       my_wc_t wc;
2662       int res= my_utf32_uni(cs, &wc, (uchar*) str, (uchar*) end);
2663       if (res < 0 || wc != ' ')
2664         break;
2665       str+= res;
2666     }
2667     return (size_t) (str - str0);
2668   case MY_SEQ_NONSPACES:
2669     DBUG_ASSERT(0); /* Not implemented */
2670     /* pass through */
2671   default:
2672     return 0;
2673   }
2674 }
2675 
2676 
/*
  Collation handler of utf32_general_ci (used by
  my_charset_utf32_general_ci).  The initializer is positional: the entry
  order must match the declaration of MY_COLLATION_HANDLER.
  Hashing uses my_hash_sort_utf32, which ignores trailing spaces.
*/
static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler =
{
  NULL, /* init */
  my_strnncoll_utf32_general_ci,
  my_strnncollsp_utf32_general_ci,
  my_strnncollsp_nchars_utf32_general_ci,
  my_strnxfrm_utf32_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf32_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32,
  my_propagate_simple
};
2692 
2693 
/*
  Collation handler of utf32_bin (used by my_charset_utf32_bin).
  The initializer is positional: the entry order must match the
  declaration of MY_COLLATION_HANDLER.
  Hashing uses my_hash_sort_utf32, which ignores trailing spaces.
*/
static MY_COLLATION_HANDLER my_collation_utf32_bin_handler =
{
  NULL, /* init */
  my_strnncoll_utf32_bin,
  my_strnncollsp_utf32_bin,
  my_strnncollsp_nchars_utf32_bin,
  my_strnxfrm_unicode_full_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf32_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32,
  my_propagate_simple
};
2709 
2710 
/*
  Collation handler of utf32_general_nopad_ci (used by
  my_charset_utf32_general_nopad_ci).  The initializer is positional: the
  entry order must match the declaration of MY_COLLATION_HANDLER.
  Compared with the general_ci handler, the strnncollsp, strnncollsp_nchars,
  strnxfrm and hash entries are the nopad variants; the strnncoll entry
  is shared.
*/
static MY_COLLATION_HANDLER my_collation_utf32_general_nopad_ci_handler =
{
  NULL, /* init */
  my_strnncoll_utf32_general_ci,
  my_strnncollsp_utf32_general_nopad_ci,
  my_strnncollsp_nchars_utf32_general_nopad_ci,
  my_strnxfrm_nopad_utf32_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf32_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32_nopad,
  my_propagate_simple
};
2726 
2727 
/*
  Collation handler of utf32_nopad_bin (used by
  my_charset_utf32_nopad_bin).  The initializer is positional: the entry
  order must match the declaration of MY_COLLATION_HANDLER.
  Compared with the bin handler, the strnncollsp, strnncollsp_nchars,
  strnxfrm and hash entries are the nopad variants; the strnncoll entry
  is shared.
*/
static MY_COLLATION_HANDLER my_collation_utf32_nopad_bin_handler =
{
  NULL, /* init */
  my_strnncoll_utf32_bin,
  my_strnncollsp_utf32_nopad_bin,
  my_strnncollsp_nchars_utf32_nopad_bin,
  my_strnxfrm_unicode_full_nopad_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf32_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32_nopad,
  my_propagate_simple
};
2743 
2744 
/*
  Character set handler shared by all four UTF-32 collations defined in
  this file.  The initializer is positional: the entry order must match
  the declaration of MY_CHARSET_HANDLER.

  UTF-32 specific routines are used where the fixed four-byte encoding
  matters; the remaining entries are the generic *_mb or *_mb2_or_mb4
  implementations.  my_uni_utf32 appears twice: once next to
  my_utf32_uni and once as the final entry.
*/
MY_CHARSET_HANDLER my_charset_utf32_handler=
{
  NULL, /* init */
  my_numchars_utf32,
  my_charpos_utf32,
  my_lengthsp_utf32,
  my_numcells_mb,
  my_utf32_uni,
  my_uni_utf32,
  my_mb_ctype_mb,
  my_caseup_str_mb2_or_mb4,
  my_casedn_str_mb2_or_mb4,
  my_caseup_utf32,
  my_casedn_utf32,
  my_snprintf_utf32,
  my_l10tostr_mb2_or_mb4,
  my_ll10tostr_mb2_or_mb4,
  my_fill_utf32,
  my_strntol_mb2_or_mb4,
  my_strntoul_mb2_or_mb4,
  my_strntoll_mb2_or_mb4,
  my_strntoull_mb2_or_mb4,
  my_strntod_mb2_or_mb4,
  my_strtoll10_utf32,
  my_strntoull10rnd_mb2_or_mb4,
  my_scan_utf32,
  my_charlen_utf32,
  my_well_formed_char_length_utf32,
  my_copy_fix_mb2_or_mb4,
  my_uni_utf32,
};
2776 
2777 
/*
  utf32_general_ci: collation id 60 of the "utf32" character set; it
  carries the MY_CS_PRIMARY flag.  Every character takes exactly 4 bytes
  (mbminlen == mbmaxlen == 4).  The initializer is positional, so the
  entry order must match the declaration of struct charset_info_st.
*/
struct charset_info_st my_charset_utf32_general_ci=
{
  60,0,0,              /* number       */
  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf32",             /* cs name    */
  "utf32_general_ci",  /* name         */
  "UTF-32 Unicode",    /* comment      */
  NULL,                /* tailoring    */
  NULL,                /* ctype        */
  NULL,                /* to_lower     */
  NULL,                /* to_upper     */
  NULL,                /* sort_order   */
  NULL,                /* uca          */
  NULL,                /* tab_to_uni   */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo     */
  NULL,                /* state_map    */
  NULL,                /* ident_map    */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  4,                   /* mbminlen     */
  4,                   /* mbmaxlen     */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char      */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order   */
  &my_charset_utf32_handler,             /* character set handler */
  &my_collation_utf32_general_ci_handler /* collation handler     */
};
2809 
2810 
/*
  utf32_bin: binary-sort (MY_CS_BINSORT) collation of the "utf32"
  character set, collation id 61.  Every character takes exactly 4 bytes
  (mbminlen == mbmaxlen == 4).  The initializer is positional, so the
  entry order must match the declaration of struct charset_info_st.
*/
struct charset_info_st my_charset_utf32_bin=
{
  61,0,0,              /* number       */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf32",             /* cs name    */
  "utf32_bin",         /* name         */
  "UTF-32 Unicode",    /* comment      */
  NULL,                /* tailoring    */
  NULL,                /* ctype        */
  NULL,                /* to_lower     */
  NULL,                /* to_upper     */
  NULL,                /* sort_order   */
  NULL,                /* uca          */
  NULL,                /* tab_to_uni   */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo     */
  NULL,                /* state_map    */
  NULL,                /* ident_map    */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  4,                   /* mbminlen     */
  4,                   /* mbmaxlen     */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char      */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order   */
  &my_charset_utf32_handler,      /* character set handler */
  &my_collation_utf32_bin_handler /* collation handler     */
};
2842 
2843 
/*
  utf32_general_nopad_ci: the MY_CS_NOPAD variant of the general_ci
  collation of "utf32"; its id is derived from 60 through MY_NOPAD_ID().
  Every character takes exactly 4 bytes (mbminlen == mbmaxlen == 4).
  The initializer is positional, so the entry order must match the
  declaration of struct charset_info_st.
*/
struct charset_info_st my_charset_utf32_general_nopad_ci=
{
  MY_NOPAD_ID(60),0,0, /* number           */
  MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD,
  "utf32",             /* cs name          */
  "utf32_general_nopad_ci", /* name        */
  "UTF-32 Unicode",    /* comment          */
  NULL,                /* tailoring        */
  NULL,                /* ctype            */
  NULL,                /* to_lower         */
  NULL,                /* to_upper         */
  NULL,                /* sort_order       */
  NULL,                /* uca              */
  NULL,                /* tab_to_uni       */
  NULL,                /* tab_from_uni     */
  &my_unicase_default, /* caseinfo         */
  NULL,                /* state_map        */
  NULL,                /* ident_map        */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  4,                   /* mbminlen         */
  4,                   /* mbmaxlen         */
  0,                   /* min_sort_char    */
  0xFFFF,              /* max_sort_char    */
  ' ',                 /* pad char         */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf32_handler,                   /* character set handler */
  &my_collation_utf32_general_nopad_ci_handler /* collation handler     */
};
2875 
2876 
/*
  utf32_nopad_bin: the MY_CS_NOPAD variant of the binary-sort collation
  of "utf32"; its id is derived from 61 through MY_NOPAD_ID().
  Every character takes exactly 4 bytes (mbminlen == mbmaxlen == 4).
  The initializer is positional, so the entry order must match the
  declaration of struct charset_info_st.
*/
struct charset_info_st my_charset_utf32_nopad_bin=
{
  MY_NOPAD_ID(61),0,0, /* number           */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|
  MY_CS_NOPAD,
  "utf32",             /* cs name          */
  "utf32_nopad_bin",   /* name             */
  "UTF-32 Unicode",    /* comment          */
  NULL,                /* tailoring        */
  NULL,                /* ctype            */
  NULL,                /* to_lower         */
  NULL,                /* to_upper         */
  NULL,                /* sort_order       */
  NULL,                /* uca              */
  NULL,                /* tab_to_uni       */
  NULL,                /* tab_from_uni     */
  &my_unicase_default, /* caseinfo         */
  NULL,                /* state_map        */
  NULL,                /* ident_map        */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  4,                   /* mbminlen         */
  4,                   /* mbmaxlen         */
  0,                   /* min_sort_char    */
  0xFFFF,              /* max_sort_char    */
  ' ',                 /* pad char         */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf32_handler,            /* character set handler */
  &my_collation_utf32_nopad_bin_handler /* collation handler     */
};
2909 
2910 
2911 #endif /* HAVE_CHARSET_utf32 */
2912 
2913 
2914 #ifdef HAVE_CHARSET_ucs2
2915 
2916 #include "ctype-ucs2.h"
2917 
/*
  Character-class table for UCS-2: 257 entries, a leading 0 followed by
  one flag byte for each of the codes 0x00..0xFF.  Only the first 128
  codes carry flags; the entries for 0x80..0xFF are all 0.
  NOTE(review): the flag bits presumably follow the character-class
  masks declared in m_ctype.h, and the leading 0 presumably lets the
  table be indexed with (code + 1) -- confirm against the users.
*/
static const uchar ctype_ucs2[] = {
    0,
   32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32,
   32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
   72, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
  132,132,132,132,132,132,132,132,132,132, 16, 16, 16, 16, 16, 16,
   16,129,129,129,129,129,129,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 16, 16, 16, 16, 16,
   16,130,130,130,130,130,130,  2,  2,  2,  2,  2,  2,  2,  2,  2,
    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 16, 16, 16, 16, 32,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
2937 
2938 static const uchar to_lower_ucs2[] = {
2939     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
2940    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
2941    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
2942    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
2943    64, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
2944   112,113,114,115,116,117,118,119,120,121,122, 91, 92, 93, 94, 95,
2945    96, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
2946   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2947   128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
2948   144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
2949   160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
2950   176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
2951   192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
2952   208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
2953   224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
2954   240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
2955 };
2956 
2957 static const uchar to_upper_ucs2[] = {
2958     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
2959    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
2960    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
2961    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
2962    64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
2963    80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
2964    96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
2965    80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,123,124,125,126,127,
2966   128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
2967   144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
2968   160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
2969   176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
2970   192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
2971   208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
2972   224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
2973   240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
2974 };
2975 
2976 
/*
  Definitions for strcoll.inl.

  IS_MB2_CHAR(x,y)  Always expands to 1; its arguments are not evaluated.
  UCS2_CODE(b0,b1)  Builds a 16-bit code from two bytes: b0 becomes the
                    high byte and b1 the low byte.  Each argument is
                    parenthesized before the (uchar) cast so that an
                    expression argument is truncated to one byte as a
                    whole, and the shift is parenthesized explicitly.
*/
#define IS_MB2_CHAR(x,y)            (1)
#define UCS2_CODE(b0,b1)            ((((uchar) (b0)) << 8) | ((uchar) (b1)))
2980 
2981 
/*
  Sort weight of the two-byte character (b0,b1) for the ucs2 general_ci
  collations.

  The bytes are combined with UCS2_CODE(); the high byte of the code
  selects a page in my_unicase_default_pages.  When that page exists the
  weight is the "sort" member of the entry indexed by the low byte,
  otherwise the weight is the code itself.
*/
static inline int my_weight_mb2_ucs2_general_ci(uchar b0, uchar b1)
{
  my_wc_t code= UCS2_CODE(b0, b1);
  MY_UNICASE_CHARACTER *chars= my_unicase_default_pages[code >> 8];

  if (!chars)
    return (int) code;
  return (int) chars[code & 0xFF].sort;
}
2988 
2989 
2990 #define MY_FUNCTION_NAME(x)      my_ ## x ## _ucs2_general_ci
2991 #define DEFINE_STRNXFRM_UNICODE
2992 #define DEFINE_STRNXFRM_UNICODE_NOPAD
2993 #define MY_MB_WC(cs, pwc, s, e)  my_mb_wc_ucs2_quick(pwc, s, e)
2994 #define OPTIMIZE_ASCII           0
2995 #define UNICASE_MAXCHAR          MY_UNICASE_INFO_DEFAULT_MAXCHAR
2996 #define UNICASE_PAGE0            my_unicase_default_page00
2997 #define UNICASE_PAGES            my_unicase_default_pages
2998 #define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
2999 #define WEIGHT_MB2(b0,b1)        my_weight_mb2_ucs2_general_ci(b0,b1)
3000 #include "strcoll.inl"
3001 
3002 
3003 #define MY_FUNCTION_NAME(x)      my_ ## x ## _ucs2_bin
3004 #define DEFINE_STRNXFRM_UNICODE_BIN2
3005 #define MY_MB_WC(cs, pwc, s, e)  my_mb_wc_ucs2_quick(pwc, s, e)
3006 #define OPTIMIZE_ASCII           0
3007 #define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
3008 #define WEIGHT_MB2(b0,b1)        UCS2_CODE(b0,b1)
3009 #include "strcoll.inl"
3010 
3011 
3012 #define DEFINE_STRNNCOLLSP_NOPAD
3013 #define MY_FUNCTION_NAME(x)    my_ ## x ## _ucs2_general_nopad_ci
3014 #define WEIGHT_ILSEQ(x)        (0xFF0000 + (uchar) (x))
3015 #define WEIGHT_MB2(b0,b1)      my_weight_mb2_ucs2_general_ci(b0,b1)
3016 #include "strcoll.inl"
3017 
3018 
3019 #define DEFINE_STRNNCOLLSP_NOPAD
3020 #define MY_FUNCTION_NAME(x)    my_ ## x ## _ucs2_nopad_bin
3021 #define WEIGHT_ILSEQ(x)        (0xFF0000 + (uchar) (x))
3022 #define WEIGHT_MB2(b0,b1)      UCS2_CODE(b0,b1)
3023 #include "strcoll.inl"
3024 
3025 
3026 static int
my_charlen_ucs2(CHARSET_INFO * cs,const uchar * s,const uchar * e)3027 my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3028 		const uchar *s, const uchar *e)
3029 {
3030   return s + 2 > e ? MY_CS_TOOSMALLN(2) : 2;
3031 }
3032 
3033 
/*
  Decode one character from [s, e) into *pwc.

  Thin adapter that gives my_mb_wc_ucs2_quick() the signature of the
  handler's mb_wc slot; the charset argument is ignored and the helper's
  result is returned unchanged.
*/
static int my_ucs2_uni(CHARSET_INFO *cs __attribute__((unused)),
                       my_wc_t *pwc, const uchar *s, const uchar *e)
{
  int rc= my_mb_wc_ucs2_quick(pwc, s, e);
  return rc;
}
3039 
/*
  Encode the code point wc as two bytes, high byte first, at r.

  @param cs  Unused.
  @param wc  Code point to store.
  @param r   Output position.
  @param e   End of the output buffer.

  @return 2 on success,
          MY_CS_TOOSMALL2 when fewer than two bytes are left in [r, e),
          MY_CS_ILUNI when wc is above 0xFFFF.
          The buffer check is made before the range check.
*/
static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)),
                       my_wc_t wc, uchar *r, uchar *e)
{
  if (r + 2 > e)
    return MY_CS_TOOSMALL2;

  /* UCS2 cannot represent characters outside the BMP */
  if (wc > 0xFFFF)
    return MY_CS_ILUNI;

  r[1]= (uchar) (wc & 0xFF);
  r[0]= (uchar) (wc >> 8);
  return 2;
}
3053 
3054 
3055 static inline void
my_tolower_ucs2(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)3056 my_tolower_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
3057 {
3058   MY_UNICASE_CHARACTER *page;
3059   if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
3060     *wc= page[*wc & 0xFF].tolower;
3061 }
3062 
3063 
3064 static inline void
my_toupper_ucs2(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)3065 my_toupper_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
3066 {
3067   MY_UNICASE_CHARACTER *page;
3068   if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
3069     *wc= page[*wc & 0xFF].toupper;
3070 }
3071 
3072 
3073 static inline void
my_tosort_ucs2(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)3074 my_tosort_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
3075 {
3076   MY_UNICASE_CHARACTER *page;
3077   if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
3078     *wc= page[*wc & 0xFF].sort;
3079 }
3080 
/*
  Convert a ucs2 string to upper case.

  Characters are decoded from src one by one, mapped through
  cs->caseinfo with my_toupper_ucs2() and re-encoded into dst.
  Conversion stops at the end of the source, at the first character that
  cannot be decoded, or as soon as the encoder does not return the same
  length the decoder consumed.

  @param cs      Character set; supplies the case tables.
  @param src     Source string.
  @param srclen  Length of src in bytes.
  @param dst     Destination buffer.
  @param dstlen  Size of dst in bytes; asserted to be >= srclen.

  @return srclen in all cases, including when conversion stopped early.
*/
static size_t my_caseup_ucs2(CHARSET_INFO *cs, const char *src, size_t srclen,
                             char *dst, size_t dstlen)
{
  MY_UNICASE_INFO *uni_plane= cs->caseinfo;
  const char *src_limit= src + srclen;
  char *dst_limit= dst + dstlen;
  my_wc_t ch;
  int nbytes;
  DBUG_ASSERT(srclen <= dstlen);

  for (;;)
  {
    if (src >= src_limit)
      break;
    nbytes= my_ucs2_uni(cs, &ch, (const uchar *) src,
                        (const uchar *) src_limit);
    if (nbytes <= 0)
      break;

    my_toupper_ucs2(uni_plane, &ch);
    if (my_uni_ucs2(cs, ch, (uchar *) dst, (uchar *) dst_limit) != nbytes)
      break;

    src+= nbytes;
    dst+= nbytes;
  }
  return srclen;
}
3102 
3103 
3104 static void
my_hash_sort_ucs2_nopad(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)3105 my_hash_sort_ucs2_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
3106                         ulong *nr1, ulong *nr2)
3107 {
3108   my_wc_t wc;
3109   int res;
3110   const uchar *e=s+slen;
3111   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
3112   register ulong m1= *nr1, m2= *nr2;
3113 
3114   while ((s < e) && (res=my_ucs2_uni(cs,&wc, (uchar *)s, (uchar*)e)) >0)
3115   {
3116     my_tosort_ucs2(uni_plane, &wc);
3117     MY_HASH_ADD_16(m1, m2, wc);
3118     s+=res;
3119   }
3120   *nr1= m1;
3121   *nr2= m2;
3122 }
3123 
3124 
/*
  Hash a ucs2 string for the PAD SPACE general_ci collations.

  The input is first shortened to the length reported by
  my_lengthsp_mb2() (presumably the length without trailing spaces --
  confirm against that helper) and the remaining prefix is hashed with
  my_hash_sort_ucs2_nopad().
*/
static void my_hash_sort_ucs2(CHARSET_INFO *cs, const uchar *s, size_t slen,
                              ulong *nr1, ulong *nr2)
{
  my_hash_sort_ucs2_nopad(cs, s,
                          my_lengthsp_mb2(cs, (const char *) s, slen),
                          nr1, nr2);
}
3131 
/*
  Convert a ucs2 string to lower case.

  Characters are decoded from src one by one, mapped through
  cs->caseinfo with my_tolower_ucs2() and re-encoded into dst.
  Conversion stops at the end of the source, at the first character that
  cannot be decoded, or as soon as the encoder does not return the same
  length the decoder consumed.

  @param cs      Character set; supplies the case tables.
  @param src     Source string.
  @param srclen  Length of src in bytes.
  @param dst     Destination buffer.
  @param dstlen  Size of dst in bytes; asserted to be >= srclen.

  @return srclen in all cases, including when conversion stopped early.
*/
static size_t my_casedn_ucs2(CHARSET_INFO *cs, const char *src, size_t srclen,
                             char *dst, size_t dstlen)
{
  MY_UNICASE_INFO *uni_plane= cs->caseinfo;
  const char *src_limit= src + srclen;
  char *dst_limit= dst + dstlen;
  my_wc_t ch;
  int nbytes;
  DBUG_ASSERT(srclen <= dstlen);

  for (;;)
  {
    if (src >= src_limit)
      break;
    nbytes= my_ucs2_uni(cs, &ch, (const uchar *) src,
                        (const uchar *) src_limit);
    if (nbytes <= 0)
      break;

    my_tolower_ucs2(uni_plane, &ch);
    if (my_uni_ucs2(cs, ch, (uchar *) dst, (uchar *) dst_limit) != nbytes)
      break;

    src+= nbytes;
    dst+= nbytes;
  }
  return srclen;
}
3153 
3154 
3155 static void
my_fill_ucs2(CHARSET_INFO * cs,char * s,size_t l,int fill)3156 my_fill_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3157              char *s, size_t l, int fill)
3158 {
3159   DBUG_ASSERT(fill <= 0xFFFF);
3160 #ifdef WAITING_FOR_GCC_VECTORIZATION_BUG_TO_BE_FIXED
3161   /*
3162     This code with int2store() is known to be faster on some processors,
3163     but crashes on other processors due to a possible bug in GCC's
3164     -ftree-vectorization (which is enabled in -O3) in case of
3165     a   non-aligned memory. See here for details:
3166     http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58039
3167   */
3168   char *last= s + l - 2;
3169   uint16 tmp= (fill >> 8) + ((fill & 0xFF) << 8); /* swap bytes */
3170   DBUG_ASSERT(fill <= 0xFFFF);
3171   for ( ; s <= last; s+= 2)
3172     int2store(s, tmp); /* store little-endian */
3173 #else
3174   for ( ; l >= 2; s[0]= (fill >> 8), s[1]= (fill & 0xFF), s+= 2, l-= 2);
3175 #endif
3176 }
3177 
3178 
3179 static
my_numchars_ucs2(CHARSET_INFO * cs,const char * b,const char * e)3180 size_t my_numchars_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3181                         const char *b, const char *e)
3182 {
3183   return (size_t) (e-b)/2;
3184 }
3185 
3186 
3187 static
my_charpos_ucs2(CHARSET_INFO * cs,const char * b,const char * e,size_t pos)3188 size_t my_charpos_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3189                        const char *b  __attribute__((unused)),
3190                        const char *e  __attribute__((unused)),
3191                        size_t pos)
3192 {
3193   size_t string_length= (size_t) (e - b);
3194   return pos > string_length ? string_length + 2 : pos * 2;
3195 }
3196 
3197 
3198 static size_t
my_well_formed_char_length_ucs2(CHARSET_INFO * cs,const char * b,const char * e,size_t nchars,MY_STRCOPY_STATUS * status)3199 my_well_formed_char_length_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3200                                 const char *b, const char *e,
3201                                 size_t nchars, MY_STRCOPY_STATUS *status)
3202 {
3203   size_t length= e - b;
3204   if (nchars * 2 <= length)
3205   {
3206     status->m_well_formed_error_pos= NULL;
3207     status->m_source_end_pos= b + (nchars * 2);
3208     return nchars;
3209   }
3210   if (length % 2)
3211   {
3212     status->m_well_formed_error_pos= status->m_source_end_pos= e - 1;
3213   }
3214   else
3215   {
3216     status->m_well_formed_error_pos= NULL;
3217     status->m_source_end_pos= e;
3218   }
3219   return length / 2;
3220 }
3221 
3222 
3223 static
my_wildcmp_ucs2_ci(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)3224 int my_wildcmp_ucs2_ci(CHARSET_INFO *cs,
3225 		    const char *str,const char *str_end,
3226 		    const char *wildstr,const char *wildend,
3227 		    int escape, int w_one, int w_many)
3228 {
3229   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
3230   return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
3231                             escape,w_one,w_many,uni_plane);
3232 }
3233 
3234 
3235 static
my_wildcmp_ucs2_bin(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)3236 int my_wildcmp_ucs2_bin(CHARSET_INFO *cs,
3237 		    const char *str,const char *str_end,
3238 		    const char *wildstr,const char *wildend,
3239 		    int escape, int w_one, int w_many)
3240 {
3241   return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
3242                             escape,w_one,w_many,NULL);
3243 }
3244 
3245 
3246 static void
my_hash_sort_ucs2_nopad_bin(CHARSET_INFO * cs,const uchar * key,size_t len,ulong * nr1,ulong * nr2)3247 my_hash_sort_ucs2_nopad_bin(CHARSET_INFO *cs __attribute__((unused)),
3248                             const uchar *key, size_t len,
3249                             ulong *nr1, ulong *nr2)
3250 {
3251   const uchar *end= key + len;
3252   register ulong m1= *nr1, m2= *nr2;
3253   for ( ; key < end ; key++)
3254   {
3255     MY_HASH_ADD(m1, m2, (uint)*key);
3256   }
3257   *nr1= m1;
3258   *nr2= m2;
3259 }
3260 
3261 
3262 static void
my_hash_sort_ucs2_bin(CHARSET_INFO * cs,const uchar * key,size_t len,ulong * nr1,ulong * nr2)3263 my_hash_sort_ucs2_bin(CHARSET_INFO *cs,
3264                       const uchar *key, size_t len, ulong *nr1, ulong *nr2)
3265 {
3266   size_t lengthsp= my_lengthsp_mb2(cs, (const char *) key, len);
3267   my_hash_sort_ucs2_nopad_bin(cs, key, lengthsp, nr1, nr2);
3268 }
3269 
3270 
3271 static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
3272 {
3273     NULL,		/* init */
3274     my_strnncoll_ucs2_general_ci,
3275     my_strnncollsp_ucs2_general_ci,
3276     my_strnncollsp_nchars_ucs2_general_ci,
3277     my_strnxfrm_ucs2_general_ci,
3278     my_strnxfrmlen_unicode,
3279     my_like_range_generic,
3280     my_wildcmp_ucs2_ci,
3281     my_strcasecmp_mb2_or_mb4,
3282     my_instr_mb,
3283     my_hash_sort_ucs2,
3284     my_propagate_simple
3285 };
3286 
3287 
3288 static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler =
3289 {
3290     NULL,		/* init */
3291     my_strnncoll_ucs2_bin,
3292     my_strnncollsp_ucs2_bin,
3293     my_strnncollsp_nchars_ucs2_bin,
3294     my_strnxfrm_ucs2_bin,
3295     my_strnxfrmlen_unicode,
3296     my_like_range_generic,
3297     my_wildcmp_ucs2_bin,
3298     my_strcasecmp_mb2_or_mb4,
3299     my_instr_mb,
3300     my_hash_sort_ucs2_bin,
3301     my_propagate_simple
3302 };
3303 
3304 
3305 static MY_COLLATION_HANDLER my_collation_ucs2_general_nopad_ci_handler =
3306 {
3307     NULL,		/* init */
3308     my_strnncoll_ucs2_general_ci,
3309     my_strnncollsp_ucs2_general_nopad_ci,
3310     my_strnncollsp_nchars_ucs2_general_nopad_ci,
3311     my_strnxfrm_nopad_ucs2_general_ci,
3312     my_strnxfrmlen_unicode,
3313     my_like_range_generic,
3314     my_wildcmp_ucs2_ci,
3315     my_strcasecmp_mb2_or_mb4,
3316     my_instr_mb,
3317     my_hash_sort_ucs2_nopad,
3318     my_propagate_simple
3319 };
3320 
3321 
3322 static MY_COLLATION_HANDLER my_collation_ucs2_nopad_bin_handler =
3323 {
3324     NULL,		/* init */
3325     my_strnncoll_ucs2_bin,
3326     my_strnncollsp_ucs2_nopad_bin,
3327     my_strnncollsp_nchars_ucs2_nopad_bin,
3328     my_strnxfrm_nopad_ucs2_bin,
3329     my_strnxfrmlen_unicode,
3330     my_like_range_generic,
3331     my_wildcmp_ucs2_bin,
3332     my_strcasecmp_mb2_or_mb4,
3333     my_instr_mb,
3334     my_hash_sort_ucs2_nopad_bin,
3335     my_propagate_simple
3336 };
3337 
3338 
3339 MY_CHARSET_HANDLER my_charset_ucs2_handler=
3340 {
3341     NULL,		/* init */
3342     my_numchars_ucs2,
3343     my_charpos_ucs2,
3344     my_lengthsp_mb2,
3345     my_numcells_mb,
3346     my_ucs2_uni,	/* mb_wc        */
3347     my_uni_ucs2,	/* wc_mb        */
3348     my_mb_ctype_mb,
3349     my_caseup_str_mb2_or_mb4,
3350     my_casedn_str_mb2_or_mb4,
3351     my_caseup_ucs2,
3352     my_casedn_ucs2,
3353     my_snprintf_mb2,
3354     my_l10tostr_mb2_or_mb4,
3355     my_ll10tostr_mb2_or_mb4,
3356     my_fill_ucs2,
3357     my_strntol_mb2_or_mb4,
3358     my_strntoul_mb2_or_mb4,
3359     my_strntoll_mb2_or_mb4,
3360     my_strntoull_mb2_or_mb4,
3361     my_strntod_mb2_or_mb4,
3362     my_strtoll10_mb2,
3363     my_strntoull10rnd_mb2_or_mb4,
3364     my_scan_mb2,
3365     my_charlen_ucs2,
3366     my_well_formed_char_length_ucs2,
3367     my_copy_fix_mb2_or_mb4,
3368     my_uni_ucs2,
3369 };
3370 
3371 
3372 struct charset_info_st my_charset_ucs2_general_ci=
3373 {
3374     35,0,0,		/* number       */
3375     MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
3376     "ucs2",		/* cs name    */
3377     "ucs2_general_ci",	/* name         */
3378     "",			/* comment      */
3379     NULL,		/* tailoring    */
3380     ctype_ucs2,		/* ctype        */
3381     to_lower_ucs2,	/* to_lower     */
3382     to_upper_ucs2,	/* to_upper     */
3383     to_upper_ucs2,	/* sort_order   */
3384     NULL,		/* uca          */
3385     NULL,		/* tab_to_uni   */
3386     NULL,		/* tab_from_uni */
3387     &my_unicase_default,/* caseinfo     */
3388     NULL,		/* state_map    */
3389     NULL,		/* ident_map    */
3390     1,			/* strxfrm_multiply */
3391     1,                  /* caseup_multiply  */
3392     1,                  /* casedn_multiply  */
3393     2,			/* mbminlen     */
3394     2,			/* mbmaxlen     */
3395     0,			/* min_sort_char */
3396     0xFFFF,		/* max_sort_char */
3397     ' ',                /* pad char      */
3398     0,                  /* escape_with_backslash_is_dangerous */
3399     1,                  /* levels_for_order   */
3400     &my_charset_ucs2_handler,
3401     &my_collation_ucs2_general_ci_handler
3402 };
3403 
3404 
3405 struct charset_info_st my_charset_ucs2_general_mysql500_ci=
3406 {
3407   159, 0, 0,                                       /* number           */
3408   MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, /* state */
3409   "ucs2",                                          /* cs name          */
3410   "ucs2_general_mysql500_ci",                      /* name             */
3411   "",                                              /* comment          */
3412   NULL,                                            /* tailoring        */
3413   ctype_ucs2,                                      /* ctype            */
3414   to_lower_ucs2,                                   /* to_lower         */
3415   to_upper_ucs2,                                   /* to_upper         */
3416   to_upper_ucs2,                                   /* sort_order       */
3417   NULL,                                            /* uca              */
3418   NULL,                                            /* tab_to_uni       */
3419   NULL,                                            /* tab_from_uni     */
3420   &my_unicase_mysql500,                            /* caseinfo         */
3421   NULL,                                            /* state_map        */
3422   NULL,                                            /* ident_map        */
3423   1,                                               /* strxfrm_multiply */
3424   1,                                               /* caseup_multiply  */
3425   1,                                               /* casedn_multiply  */
3426   2,                                               /* mbminlen         */
3427   2,                                               /* mbmaxlen         */
3428   0,                                               /* min_sort_char    */
3429   0xFFFF,                                          /* max_sort_char    */
3430   ' ',                                             /* pad char         */
3431   0,                          /* escape_with_backslash_is_dangerous    */
3432   1,                                               /* levels_for_order   */
3433   &my_charset_ucs2_handler,
3434   &my_collation_ucs2_general_ci_handler
3435 };
3436 
3437 
3438 struct charset_info_st my_charset_ucs2_bin=
3439 {
3440     90,0,0,		/* number       */
3441     MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII,
3442     "ucs2",		/* cs name    */
3443     "ucs2_bin",		/* name         */
3444     "",			/* comment      */
3445     NULL,		/* tailoring    */
3446     ctype_ucs2,		/* ctype        */
3447     to_lower_ucs2,	/* to_lower     */
3448     to_upper_ucs2,	/* to_upper     */
3449     NULL,		/* sort_order   */
3450     NULL,		/* uca          */
3451     NULL,		/* tab_to_uni   */
3452     NULL,		/* tab_from_uni */
3453     &my_unicase_default,/* caseinfo     */
3454     NULL,		/* state_map    */
3455     NULL,		/* ident_map    */
3456     1,			/* strxfrm_multiply */
3457     1,                  /* caseup_multiply  */
3458     1,                  /* casedn_multiply  */
3459     2,			/* mbminlen     */
3460     2,			/* mbmaxlen     */
3461     0,			/* min_sort_char */
3462     0xFFFF,		/* max_sort_char */
3463     ' ',                /* pad char      */
3464     0,                  /* escape_with_backslash_is_dangerous */
3465     1,                  /* levels_for_order   */
3466     &my_charset_ucs2_handler,
3467     &my_collation_ucs2_bin_handler
3468 };
3469 
3470 
3471 struct charset_info_st my_charset_ucs2_general_nopad_ci=
3472 {
3473     MY_NOPAD_ID(35),0,0,     /* number           */
3474     MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD,
3475     "ucs2",                  /* cs name          */
3476     "ucs2_general_nopad_ci", /* name             */
3477     "",                      /* comment          */
3478     NULL,                    /* tailoring        */
3479     ctype_ucs2,              /* ctype            */
3480     to_lower_ucs2,           /* to_lower         */
3481     to_upper_ucs2,           /* to_upper         */
3482     to_upper_ucs2,           /* sort_order       */
3483     NULL,                    /* uca              */
3484     NULL,                    /* tab_to_uni       */
3485     NULL,                    /* tab_from_uni     */
3486     &my_unicase_default,     /* caseinfo         */
3487     NULL,                    /* state_map        */
3488     NULL,                    /* ident_map        */
3489     1,                       /* strxfrm_multiply */
3490     1,                       /* caseup_multiply  */
3491     1,                       /* casedn_multiply  */
3492     2,                       /* mbminlen         */
3493     2,                       /* mbmaxlen         */
3494     0,                       /* min_sort_char    */
3495     0xFFFF,                  /* max_sort_char    */
3496     ' ',                     /* pad char         */
3497     0,                       /* escape_with_backslash_is_dangerous */
3498     1,                       /* levels_for_order */
3499     &my_charset_ucs2_handler,
3500     &my_collation_ucs2_general_nopad_ci_handler
3501 };
3502 
3503 
3504 struct charset_info_st my_charset_ucs2_nopad_bin=
3505 {
3506     MY_NOPAD_ID(90),0,0,     /* number           */
3507     MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD,
3508     "ucs2",                  /* cs name          */
3509     "ucs2_nopad_bin",        /* name             */
3510     "",                      /* comment          */
3511     NULL,                    /* tailoring        */
3512     ctype_ucs2,              /* ctype            */
3513     to_lower_ucs2,           /* to_lower         */
3514     to_upper_ucs2,           /* to_upper         */
3515     NULL,                    /* sort_order       */
3516     NULL,                    /* uca              */
3517     NULL,                    /* tab_to_uni       */
3518     NULL,                    /* tab_from_uni     */
3519     &my_unicase_default,     /* caseinfo         */
3520     NULL,                    /* state_map        */
3521     NULL,                    /* ident_map        */
3522     1,                       /* strxfrm_multiply */
3523     1,                       /* caseup_multiply  */
3524     1,                       /* casedn_multiply  */
3525     2,                       /* mbminlen         */
3526     2,                       /* mbmaxlen         */
3527     0,                       /* min_sort_char    */
3528     0xFFFF,                  /* max_sort_char    */
3529     ' ',                     /* pad char         */
3530     0,                       /* escape_with_backslash_is_dangerous */
3531     1,                       /* levels_for_order */
3532     &my_charset_ucs2_handler,
3533     &my_collation_ucs2_nopad_bin_handler
3534 };
3535 
3536 #endif /* HAVE_CHARSET_ucs2 */
3537